diff --git a/.gitignore b/.gitignore index 3dd929e..fe89c85 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,5 @@ Thumbs.db # Logs *.log +cache/ + diff --git a/CMakeLists.txt b/CMakeLists.txt index bd68b15..0d07eec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,12 @@ project(polymech-cli LANGUAGES CXX C ) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/dist") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG "${CMAKE_SOURCE_DIR}/dist") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE "${CMAKE_SOURCE_DIR}/dist") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO "${CMAKE_SOURCE_DIR}/dist") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL "${CMAKE_SOURCE_DIR}/dist") + # ── C++ standard ───────────────────────────────────────────────────────────── set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -35,7 +41,30 @@ FetchContent_Declare( GIT_SHALLOW TRUE ) -FetchContent_MakeAvailable(cli11 tomlplusplus Catch2) +FetchContent_Declare( + asio + GIT_REPOSITORY https://github.com/chriskohlhoff/asio.git + GIT_TAG asio-1-28-0 + GIT_SHALLOW TRUE +) + +FetchContent_Declare( + concurrentqueue + GIT_REPOSITORY https://github.com/cameron314/concurrentqueue.git + GIT_TAG v1.0.4 + GIT_SHALLOW TRUE +) + +FetchContent_Declare( + taskflow + GIT_REPOSITORY https://github.com/taskflow/taskflow.git + GIT_TAG v3.6.0 + GIT_SHALLOW TRUE +) + +set(TF_BUILD_TESTS OFF CACHE BOOL "" FORCE) +set(TF_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(cli11 tomlplusplus Catch2 asio concurrentqueue taskflow) # ── Packages ───────────────────────────────────────────────────────────────── add_subdirectory(packages/logger) @@ -49,13 +78,29 @@ add_subdirectory(packages/geo) add_subdirectory(packages/gadm_reader) add_subdirectory(packages/grid) add_subdirectory(packages/search) +add_subdirectory(packages/enrichers) # ── Sources ────────────────────────────────────────────────────────────────── add_executable(${PROJECT_NAME} src/main.cpp + src/cmd_gridsearch.cpp + 
src/cmd_gridsearch-uds.cpp + src/cmd_gridsearch-postgres.cpp + src/gridsearch_serialize.cpp + src/sys_metrics.cpp ) -target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search) +target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers) + +target_include_directories(${PROJECT_NAME} PRIVATE + ${asio_SOURCE_DIR}/asio/include + ${taskflow_SOURCE_DIR} + ${concurrentqueue_SOURCE_DIR} +) + +# Define standalone ASIO (since it's not boost) +target_compile_definitions(${PROJECT_NAME} PRIVATE ASIO_STANDALONE=1 ASIO_NO_DEPRECATED=1) + # ── Compiler warnings ─────────────────────────────────────────────────────── if(MSVC) diff --git a/README.md b/README.md index 4edfddd..2a8c6e3 100644 --- a/README.md +++ b/README.md @@ -30,4 +30,10 @@ polymech-cli --version ## License -BSD-3-Clause \ No newline at end of file +BSD-3-Clause + +## Requirements + +- [https://github.com/taskflow/taskflow](https://github.com/taskflow/taskflow) +- [https://github.com/cameron314/concurrentqueue](https://github.com/cameron314/concurrentqueue) +- [https://github.com/chriskohlhoff/asio](https://github.com/chriskohlhoff/asio) diff --git a/build-linux.sh b/build-linux.sh new file mode 100644 index 0000000..b36ee79 --- /dev/null +++ b/build-linux.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +#rm -rf /tmp/polymech-build +mkdir -p /tmp/polymech-build +export PATH="/snap/bin:$PATH" +cmake -S /mnt/hgfs/Desktop/polymech/pm-pics/server/cpp -B /tmp/polymech-build -DCMAKE_BUILD_TYPE=Release +cmake --build /tmp/polymech-build diff --git a/cache/gadm/boundary_ABW_0.json b/cache/gadm/boundary_ABW_0.json deleted file mode 100644 index 6f71b3f..0000000 --- a/cache/gadm/boundary_ABW_0.json +++ /dev/null @@ -1 +0,0 @@ 
-{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_0":"ABW","NAME_0":"Aruba","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0,"isOuter":true},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_ABW_1.json b/cache/gadm/boundary_ABW_1.json deleted file mode 100644 index f6232d3..0000000 --- a/cache/gadm/boundary_ABW_1.json +++ /dev/null @@ 
-1 +0,0 @@ -{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_1":"","NAME_1":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_ABW_2.json b/cache/gadm/boundary_ABW_2.json deleted file mode 100644 index 9c90ab8..0000000 --- a/cache/gadm/boundary_ABW_2.json +++ /dev/null @@ -1 +0,0 @@ 
-{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_2":"","NAME_2":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_ABW_3.json b/cache/gadm/boundary_ABW_3.json deleted file mode 100644 index 93ba4c0..0000000 --- a/cache/gadm/boundary_ABW_3.json +++ /dev/null @@ -1 +0,0 @@ 
-{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_3":"","NAME_3":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_ABW_4.json b/cache/gadm/boundary_ABW_4.json deleted file mode 100644 index 10697e9..0000000 --- a/cache/gadm/boundary_ABW_4.json +++ /dev/null @@ -1 +0,0 @@ 
-{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_4":"","NAME_4":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_ABW_5.json b/cache/gadm/boundary_ABW_5.json deleted file mode 100644 index 38ca66e..0000000 --- a/cache/gadm/boundary_ABW_5.json +++ /dev/null @@ -1 +0,0 @@ 
-{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_5":"","NAME_5":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_AFG.1.1_1_2.json b/cache/gadm/boundary_AFG.1.1_1_2.json deleted file mode 100644 index 9dc4cdb..0000000 --- a/cache/gadm/boundary_AFG.1.1_1_2.json +++ /dev/null @@ -1 
+0,0 @@ -{"features":[{"geometry":{"coordinates":[[[71.41149,36.55717],[71.40954,36.55237],[71.37395,36.55474],[71.36436,36.55226],[71.31843,36.53446],[71.3019,36.52355],[71.28774,36.52113],[71.28183,36.51721],[71.27595,36.49977],[71.25977,36.48325],[71.24686,36.47709],[71.23186,36.47865],[71.22168,36.4843],[71.20222,36.48003],[71.1881,36.48441],[71.18169,36.49196],[71.1856,36.49435],[71.20061,36.52153],[71.20232,36.53118],[71.19587,36.54671],[71.16729,36.56328],[71.14429,36.58761],[71.12424,36.5924],[71.11617,36.59662],[71.11034,36.60459],[71.10903,36.66565],[71.02873,36.79067],[71.01995,36.80966],[71.01716,36.83664],[71.0187,36.86476],[71.01316,36.88173],[70.99757,36.9066],[70.95702,36.93212],[70.89158,36.95346],[70.85869,36.96816],[70.83394,36.98849],[70.8073,37.00198],[70.78426,37.02771],[70.75385,37.05461],[70.71236,37.07621],[70.7171,37.09387],[70.73001,37.104],[70.8381,37.13387],[70.85306,37.14125],[70.86099,37.15163],[70.85707,37.16642],[70.86703,37.17764],[70.86687,37.18637],[70.85607,37.19467],[70.84901,37.20512],[70.82961,37.2114],[70.80228,37.25319],[70.78971,37.2621],[70.78635,37.27388],[70.78272,37.27678],[70.80682,37.2773],[70.81486,37.28134],[70.81709,37.29346],[70.82154,37.29682],[70.83209,37.29845],[70.84579,37.29279],[70.8627,37.29437],[70.87193,37.29905],[70.87048,37.30951],[70.87566,37.31231],[70.8936,37.31329],[70.90681,37.30573],[70.92742,37.30981],[70.93779,37.32669],[70.95306,37.33867],[70.96316,37.35409],[71.01808,37.36797],[71.02711,37.37658],[71.03407,37.39542],[71.04252,37.40275],[71.05182,37.40542],[71.06761,37.40209],[71.09549,37.38773],[71.1464,37.38029],[71.17564,37.38212],[71.21194,37.39499],[71.23927,37.39724],[71.28146,37.39539],[71.30579,37.39853],[71.31545,37.39558],[71.3167,37.39105],[71.31016,37.38471],[71.29498,37.38023],[71.28695,37.37336],[71.29104,37.36458],[71.28975,37.36018],[71.27915,37.35599],[71.25114,37.35282],[71.22665,37.34412],[71.2082,37.33265],[71.1979,37.31739],[71.19634,37.30693],[71.20239,37.28747],[71.24394,
37.26496],[71.30699,37.25485],[71.31807,37.24994],[71.30641,37.21708],[71.31552,37.19207],[71.30919,37.16819],[71.30914,37.13688],[71.32497,37.114],[71.33672,37.08014],[71.34899,37.0579],[71.36018,37.02248],[71.39261,36.98609],[71.3889,36.98077],[71.36919,36.97119],[71.35229,36.95037],[71.32862,36.94063],[71.32713,36.93637],[71.35377,36.91397],[71.35461,36.90298],[71.35136,36.88972],[71.33064,36.8632],[71.31949,36.85485],[71.30568,36.85132],[71.25354,36.85461],[71.2429,36.84789],[71.24265,36.83884],[71.24941,36.82845],[71.26183,36.81963],[71.26368,36.81425],[71.25796,36.80269],[71.24998,36.79636],[71.22235,36.78543],[71.21882,36.77991],[71.21972,36.77044],[71.26667,36.74249],[71.27195,36.73764],[71.27148,36.73246],[71.25753,36.73465],[71.25262,36.73069],[71.25707,36.72251],[71.25055,36.71885],[71.23594,36.72063],[71.23033,36.71722],[71.23382,36.70341],[71.22665,36.68107],[71.24262,36.65906],[71.26329,36.64516],[71.29496,36.61458],[71.34565,36.58645],[71.3614,36.58263],[71.37926,36.56792],[71.41149,36.55717]]],"type":"Polygon"},"properties":{"GID_2":"AFG.1.1_1","NAME_2":"Baharak","ghsBuiltCenter":[71.10105,37.04904],"ghsBuiltCenters":[[71.13941,37.07362,2693.0],[71.09857,37.04895,2582.0],[71.04391,37.03397,2090.0],[71.06012,36.9142,1765.0],[71.17729,37.056,1322.0]],"ghsBuiltMax":2693.0,"ghsBuiltWeight":229321.0,"ghsPopCenter":[71.09335,37.02337],"ghsPopCenters":[[71.06012,36.9142,1843.0],[71.13941,37.07362,542.0],[71.09857,37.04895,519.0],[71.04391,37.03397,420.0],[71.07823,36.87635,323.0]],"ghsPopMaxDensity":1843.0,"ghsPopulation":56538.0,"isOuter":true},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_AFG.1.1_1_3.json b/cache/gadm/boundary_AFG.1.1_1_3.json deleted file mode 100644 index 6b53638..0000000 --- a/cache/gadm/boundary_AFG.1.1_1_3.json +++ /dev/null @@ -1 +0,0 @@ 
-{"features":[{"geometry":{"coordinates":[[[71.41149,36.55717],[71.40954,36.55237],[71.37395,36.55474],[71.36436,36.55226],[71.31843,36.53446],[71.3019,36.52355],[71.28774,36.52113],[71.28183,36.51721],[71.27595,36.49977],[71.25977,36.48325],[71.24686,36.47709],[71.23186,36.47865],[71.22168,36.4843],[71.20222,36.48003],[71.1881,36.48441],[71.18169,36.49196],[71.1856,36.49435],[71.20061,36.52153],[71.20232,36.53118],[71.19587,36.54671],[71.16729,36.56328],[71.14429,36.58761],[71.12424,36.5924],[71.11617,36.59662],[71.11034,36.60459],[71.10903,36.66565],[71.02873,36.79067],[71.01995,36.80966],[71.01716,36.83664],[71.0187,36.86476],[71.01316,36.88173],[70.99757,36.9066],[70.95702,36.93212],[70.89158,36.95346],[70.85869,36.96816],[70.83394,36.98849],[70.8073,37.00198],[70.78426,37.02771],[70.75385,37.05461],[70.71236,37.07621],[70.7171,37.09387],[70.73001,37.104],[70.8381,37.13387],[70.85306,37.14125],[70.86099,37.15163],[70.85707,37.16642],[70.86703,37.17764],[70.86687,37.18637],[70.85607,37.19467],[70.84901,37.20512],[70.82961,37.2114],[70.80228,37.25319],[70.78971,37.2621],[70.78635,37.27388],[70.78272,37.27678],[70.80682,37.2773],[70.81486,37.28134],[70.81709,37.29346],[70.82154,37.29682],[70.83209,37.29845],[70.84579,37.29279],[70.8627,37.29437],[70.87193,37.29905],[70.87048,37.30951],[70.87566,37.31231],[70.8936,37.31329],[70.90681,37.30573],[70.92742,37.30981],[70.93779,37.32669],[70.95306,37.33867],[70.96316,37.35409],[71.01808,37.36797],[71.02711,37.37658],[71.03407,37.39542],[71.04252,37.40275],[71.05182,37.40542],[71.06761,37.40209],[71.09549,37.38773],[71.1464,37.38029],[71.17564,37.38212],[71.21194,37.39499],[71.23927,37.39724],[71.28146,37.39539],[71.30579,37.39853],[71.31545,37.39558],[71.3167,37.39105],[71.31016,37.38471],[71.29498,37.38023],[71.28695,37.37336],[71.29104,37.36458],[71.28975,37.36018],[71.27915,37.35599],[71.25114,37.35282],[71.22665,37.34412],[71.2082,37.33265],[71.1979,37.31739],[71.19634,37.30693],[71.20239,37.28747],[71.24394,37.26496
],[71.30699,37.25485],[71.31807,37.24994],[71.30641,37.21708],[71.31552,37.19207],[71.30919,37.16819],[71.30914,37.13688],[71.32497,37.114],[71.33672,37.08014],[71.34899,37.0579],[71.36018,37.02248],[71.39261,36.98609],[71.3889,36.98077],[71.36919,36.97119],[71.35229,36.95037],[71.32862,36.94063],[71.32713,36.93637],[71.35377,36.91397],[71.35461,36.90298],[71.35136,36.88972],[71.33064,36.8632],[71.31949,36.85485],[71.30568,36.85132],[71.25354,36.85461],[71.2429,36.84789],[71.24265,36.83884],[71.24941,36.82845],[71.26183,36.81963],[71.26368,36.81425],[71.25796,36.80269],[71.24998,36.79636],[71.22235,36.78543],[71.21882,36.77991],[71.21972,36.77044],[71.26667,36.74249],[71.27195,36.73764],[71.27148,36.73246],[71.25753,36.73465],[71.25262,36.73069],[71.25707,36.72251],[71.25055,36.71885],[71.23594,36.72063],[71.23033,36.71722],[71.23382,36.70341],[71.22665,36.68107],[71.24262,36.65906],[71.26329,36.64516],[71.29496,36.61458],[71.34565,36.58645],[71.3614,36.58263],[71.37926,36.56792],[71.41149,36.55717]]],"type":"Polygon"},"properties":{"GID_3":"","NAME_3":"","ghsBuiltCenter":[71.10105,37.04904],"ghsBuiltCenters":[[71.13941,37.07362,2693.0],[71.09857,37.04895,2582.0],[71.04391,37.03397,2090.0],[71.06012,36.9142,1765.0],[71.17729,37.056,1322.0]],"ghsBuiltMax":2693.0,"ghsBuiltWeight":229321.0,"ghsPopCenter":[71.09335,37.02337],"ghsPopCenters":[[71.06012,36.9142,1843.0],[71.13941,37.07362,542.0],[71.09857,37.04895,519.0],[71.04391,37.03397,420.0],[71.07823,36.87635,323.0]],"ghsPopMaxDensity":1843.0,"ghsPopulation":56538.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_AFG.1.1_1_4.json b/cache/gadm/boundary_AFG.1.1_1_4.json deleted file mode 100644 index 883880e..0000000 --- a/cache/gadm/boundary_AFG.1.1_1_4.json +++ /dev/null @@ -1 +0,0 @@ 
-{"features":[{"geometry":{"coordinates":[[[71.41149,36.55717],[71.40954,36.55237],[71.37395,36.55474],[71.36436,36.55226],[71.31843,36.53446],[71.3019,36.52355],[71.28774,36.52113],[71.28183,36.51721],[71.27595,36.49977],[71.25977,36.48325],[71.24686,36.47709],[71.23186,36.47865],[71.22168,36.4843],[71.20222,36.48003],[71.1881,36.48441],[71.18169,36.49196],[71.1856,36.49435],[71.20061,36.52153],[71.20232,36.53118],[71.19587,36.54671],[71.16729,36.56328],[71.14429,36.58761],[71.12424,36.5924],[71.11617,36.59662],[71.11034,36.60459],[71.10903,36.66565],[71.02873,36.79067],[71.01995,36.80966],[71.01716,36.83664],[71.0187,36.86476],[71.01316,36.88173],[70.99757,36.9066],[70.95702,36.93212],[70.89158,36.95346],[70.85869,36.96816],[70.83394,36.98849],[70.8073,37.00198],[70.78426,37.02771],[70.75385,37.05461],[70.71236,37.07621],[70.7171,37.09387],[70.73001,37.104],[70.8381,37.13387],[70.85306,37.14125],[70.86099,37.15163],[70.85707,37.16642],[70.86703,37.17764],[70.86687,37.18637],[70.85607,37.19467],[70.84901,37.20512],[70.82961,37.2114],[70.80228,37.25319],[70.78971,37.2621],[70.78635,37.27388],[70.78272,37.27678],[70.80682,37.2773],[70.81486,37.28134],[70.81709,37.29346],[70.82154,37.29682],[70.83209,37.29845],[70.84579,37.29279],[70.8627,37.29437],[70.87193,37.29905],[70.87048,37.30951],[70.87566,37.31231],[70.8936,37.31329],[70.90681,37.30573],[70.92742,37.30981],[70.93779,37.32669],[70.95306,37.33867],[70.96316,37.35409],[71.01808,37.36797],[71.02711,37.37658],[71.03407,37.39542],[71.04252,37.40275],[71.05182,37.40542],[71.06761,37.40209],[71.09549,37.38773],[71.1464,37.38029],[71.17564,37.38212],[71.21194,37.39499],[71.23927,37.39724],[71.28146,37.39539],[71.30579,37.39853],[71.31545,37.39558],[71.3167,37.39105],[71.31016,37.38471],[71.29498,37.38023],[71.28695,37.37336],[71.29104,37.36458],[71.28975,37.36018],[71.27915,37.35599],[71.25114,37.35282],[71.22665,37.34412],[71.2082,37.33265],[71.1979,37.31739],[71.19634,37.30693],[71.20239,37.28747],[71.24394,37.26496
],[71.30699,37.25485],[71.31807,37.24994],[71.30641,37.21708],[71.31552,37.19207],[71.30919,37.16819],[71.30914,37.13688],[71.32497,37.114],[71.33672,37.08014],[71.34899,37.0579],[71.36018,37.02248],[71.39261,36.98609],[71.3889,36.98077],[71.36919,36.97119],[71.35229,36.95037],[71.32862,36.94063],[71.32713,36.93637],[71.35377,36.91397],[71.35461,36.90298],[71.35136,36.88972],[71.33064,36.8632],[71.31949,36.85485],[71.30568,36.85132],[71.25354,36.85461],[71.2429,36.84789],[71.24265,36.83884],[71.24941,36.82845],[71.26183,36.81963],[71.26368,36.81425],[71.25796,36.80269],[71.24998,36.79636],[71.22235,36.78543],[71.21882,36.77991],[71.21972,36.77044],[71.26667,36.74249],[71.27195,36.73764],[71.27148,36.73246],[71.25753,36.73465],[71.25262,36.73069],[71.25707,36.72251],[71.25055,36.71885],[71.23594,36.72063],[71.23033,36.71722],[71.23382,36.70341],[71.22665,36.68107],[71.24262,36.65906],[71.26329,36.64516],[71.29496,36.61458],[71.34565,36.58645],[71.3614,36.58263],[71.37926,36.56792],[71.41149,36.55717]]],"type":"Polygon"},"properties":{"GID_4":"","NAME_4":"","ghsBuiltCenter":[71.10105,37.04904],"ghsBuiltCenters":[[71.13941,37.07362,2693.0],[71.09857,37.04895,2582.0],[71.04391,37.03397,2090.0],[71.06012,36.9142,1765.0],[71.17729,37.056,1322.0]],"ghsBuiltMax":2693.0,"ghsBuiltWeight":229321.0,"ghsPopCenter":[71.09335,37.02337],"ghsPopCenters":[[71.06012,36.9142,1843.0],[71.13941,37.07362,542.0],[71.09857,37.04895,519.0],[71.04391,37.03397,420.0],[71.07823,36.87635,323.0]],"ghsPopMaxDensity":1843.0,"ghsPopulation":56538.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_AFG.1.1_1_5.json b/cache/gadm/boundary_AFG.1.1_1_5.json deleted file mode 100644 index 8b8714f..0000000 --- a/cache/gadm/boundary_AFG.1.1_1_5.json +++ /dev/null @@ -1 +0,0 @@ 
-{"features":[{"geometry":{"coordinates":[[[71.41149,36.55717],[71.40954,36.55237],[71.37395,36.55474],[71.36436,36.55226],[71.31843,36.53446],[71.3019,36.52355],[71.28774,36.52113],[71.28183,36.51721],[71.27595,36.49977],[71.25977,36.48325],[71.24686,36.47709],[71.23186,36.47865],[71.22168,36.4843],[71.20222,36.48003],[71.1881,36.48441],[71.18169,36.49196],[71.1856,36.49435],[71.20061,36.52153],[71.20232,36.53118],[71.19587,36.54671],[71.16729,36.56328],[71.14429,36.58761],[71.12424,36.5924],[71.11617,36.59662],[71.11034,36.60459],[71.10903,36.66565],[71.02873,36.79067],[71.01995,36.80966],[71.01716,36.83664],[71.0187,36.86476],[71.01316,36.88173],[70.99757,36.9066],[70.95702,36.93212],[70.89158,36.95346],[70.85869,36.96816],[70.83394,36.98849],[70.8073,37.00198],[70.78426,37.02771],[70.75385,37.05461],[70.71236,37.07621],[70.7171,37.09387],[70.73001,37.104],[70.8381,37.13387],[70.85306,37.14125],[70.86099,37.15163],[70.85707,37.16642],[70.86703,37.17764],[70.86687,37.18637],[70.85607,37.19467],[70.84901,37.20512],[70.82961,37.2114],[70.80228,37.25319],[70.78971,37.2621],[70.78635,37.27388],[70.78272,37.27678],[70.80682,37.2773],[70.81486,37.28134],[70.81709,37.29346],[70.82154,37.29682],[70.83209,37.29845],[70.84579,37.29279],[70.8627,37.29437],[70.87193,37.29905],[70.87048,37.30951],[70.87566,37.31231],[70.8936,37.31329],[70.90681,37.30573],[70.92742,37.30981],[70.93779,37.32669],[70.95306,37.33867],[70.96316,37.35409],[71.01808,37.36797],[71.02711,37.37658],[71.03407,37.39542],[71.04252,37.40275],[71.05182,37.40542],[71.06761,37.40209],[71.09549,37.38773],[71.1464,37.38029],[71.17564,37.38212],[71.21194,37.39499],[71.23927,37.39724],[71.28146,37.39539],[71.30579,37.39853],[71.31545,37.39558],[71.3167,37.39105],[71.31016,37.38471],[71.29498,37.38023],[71.28695,37.37336],[71.29104,37.36458],[71.28975,37.36018],[71.27915,37.35599],[71.25114,37.35282],[71.22665,37.34412],[71.2082,37.33265],[71.1979,37.31739],[71.19634,37.30693],[71.20239,37.28747],[71.24394,37.26496
],[71.30699,37.25485],[71.31807,37.24994],[71.30641,37.21708],[71.31552,37.19207],[71.30919,37.16819],[71.30914,37.13688],[71.32497,37.114],[71.33672,37.08014],[71.34899,37.0579],[71.36018,37.02248],[71.39261,36.98609],[71.3889,36.98077],[71.36919,36.97119],[71.35229,36.95037],[71.32862,36.94063],[71.32713,36.93637],[71.35377,36.91397],[71.35461,36.90298],[71.35136,36.88972],[71.33064,36.8632],[71.31949,36.85485],[71.30568,36.85132],[71.25354,36.85461],[71.2429,36.84789],[71.24265,36.83884],[71.24941,36.82845],[71.26183,36.81963],[71.26368,36.81425],[71.25796,36.80269],[71.24998,36.79636],[71.22235,36.78543],[71.21882,36.77991],[71.21972,36.77044],[71.26667,36.74249],[71.27195,36.73764],[71.27148,36.73246],[71.25753,36.73465],[71.25262,36.73069],[71.25707,36.72251],[71.25055,36.71885],[71.23594,36.72063],[71.23033,36.71722],[71.23382,36.70341],[71.22665,36.68107],[71.24262,36.65906],[71.26329,36.64516],[71.29496,36.61458],[71.34565,36.58645],[71.3614,36.58263],[71.37926,36.56792],[71.41149,36.55717]]],"type":"Polygon"},"properties":{"GID_5":"","NAME_5":"","ghsBuiltCenter":[71.10105,37.04904],"ghsBuiltCenters":[[71.13941,37.07362,2693.0],[71.09857,37.04895,2582.0],[71.04391,37.03397,2090.0],[71.06012,36.9142,1765.0],[71.17729,37.056,1322.0]],"ghsBuiltMax":2693.0,"ghsBuiltWeight":229321.0,"ghsPopCenter":[71.09335,37.02337],"ghsPopCenters":[[71.06012,36.9142,1843.0],[71.13941,37.07362,542.0],[71.09857,37.04895,519.0],[71.04391,37.03397,420.0],[71.07823,36.87635,323.0]],"ghsPopMaxDensity":1843.0,"ghsPopulation":56538.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_AFG.1.2_1_2.json b/cache/gadm/boundary_AFG.1.2_1_2.json deleted file mode 100644 index b933ddb..0000000 --- a/cache/gadm/boundary_AFG.1.2_1_2.json +++ /dev/null @@ -1 +0,0 @@ 
-{"features":[{"geometry":{"coordinates":[[[71.2762,38.00465],[71.26561,38.006],[71.25337,38.01428],[71.22517,38.01237],[71.19342,38.01883],[71.17726,38.02659],[71.17789,38.03534],[71.17218,38.03826],[71.16112,38.02494],[71.15066,38.01851],[71.12814,38.01732],[71.11626,38.01307],[71.10431,38.01876],[71.09132,38.01694],[71.08074,38.01983],[71.07189,38.01053],[71.05569,38.00447],[71.03897,37.98],[71.01289,37.96274],[71.00059,37.95081],[71.00089,37.95954],[70.99486,37.96888],[70.95911,37.98344],[70.91583,38.00888],[70.80695,38.05487],[70.78855,38.05737],[70.76753,38.0549],[70.70683,38.03009],[70.68048,38.02772],[70.66789,38.03206],[70.6513,38.04594],[70.63457,38.05439],[70.58826,38.06125],[70.56323,38.07679],[70.54028,38.08434],[70.51607,38.08412],[70.4374,38.06392],[70.41875,38.07549],[70.41838,38.08248],[70.42033,38.08901],[70.43102,38.09893],[70.46798,38.10495],[70.49106,38.12076],[70.505,38.12257],[70.50031,38.14607],[70.50352,38.1644],[70.51794,38.19432],[70.53645,38.20976],[70.53726,38.2244],[70.54602,38.24455],[70.56711,38.26815],[70.60484,38.2833],[70.60899,38.29649],[70.60512,38.30585],[70.61463,38.3245],[70.61066,38.34573],[70.61833,38.35136],[70.6309,38.35353],[70.64323,38.34887],[70.65401,38.3495],[70.65928,38.35102],[70.66563,38.3586],[70.69146,38.36913],[70.69441,38.38095],[70.69186,38.38648],[70.68796,38.38932],[70.67838,38.38737],[70.67419,38.39047],[70.67468,38.40339],[70.67855,38.40888],[70.69831,38.41975],[70.7148,38.4145],[70.74165,38.41995],[70.7566,38.428],[70.77225,38.45615],[70.78256,38.45518],[70.78762,38.45091],[70.80948,38.44415],[70.82104,38.44799],[70.82435,38.45286],[70.82961,38.45212],[70.84317,38.4463],[70.84344,38.44035],[70.8501,38.43874],[70.86002,38.45011],[70.85907,38.45964],[70.86411,38.46035],[70.87345,38.46884],[70.89499,38.46514],[70.90042,38.45963],[70.9007,38.44741],[70.90692,38.44351],[70.9077,38.43891],[70.92131,38.43476],[70.92443,38.4303],[70.93446,38.43529],[70.9354,38.44066],[70.94362,38.44265],[70.94785,38.43847],[70.95
433,38.43819],[70.95847,38.4418],[70.96202,38.45662],[70.94379,38.46688],[70.9472,38.47688],[70.9696,38.47609],[70.98946,38.49041],[70.99661,38.48767],[71.00134,38.47703],[71.00946,38.47116],[71.02091,38.46896],[71.0309,38.46125],[71.0294,38.45271],[71.03682,38.44218],[71.04692,38.41021],[71.05642,38.39895],[71.06659,38.40005],[71.06855,38.40467],[71.06647,38.41275],[71.07633,38.41411],[71.09161,38.42328],[71.10434,38.42241],[71.10783,38.41936],[71.10534,38.40898],[71.11013,38.40584],[71.12121,38.4063],[71.13288,38.39707],[71.14374,38.40255],[71.14958,38.39739],[71.15116,38.38928],[71.16306,38.38721],[71.17526,38.36986],[71.18422,38.34589],[71.1974,38.34302],[71.22577,38.32113],[71.2439,38.31803],[71.2527,38.31032],[71.27956,38.31457],[71.29659,38.31296],[71.31345,38.30438],[71.32798,38.30544],[71.334,38.29334],[71.32713,38.28508],[71.33293,38.28072],[71.33456,38.27015],[71.34693,38.27131],[71.36104,38.26833],[71.37412,38.25563],[71.36751,38.22331],[71.37737,38.21522],[71.37899,38.21031],[71.36478,38.19609],[71.3663,38.17847],[71.37576,38.1608],[71.36604,38.15059],[71.34839,38.14753],[71.33983,38.13181],[71.33657,38.11368],[71.3302,38.11209],[71.32637,38.1029],[71.31956,38.0998],[71.3212,38.07051],[71.31136,38.06038],[71.30648,38.04686],[71.30022,38.04295],[71.29439,38.04479],[71.28365,38.04029],[71.28629,38.03307],[71.29458,38.0294],[71.29507,38.01832],[71.2762,38.00465]]],"type":"Polygon"},"properties":{"GID_2":"AFG.1.2_1","NAME_2":"Darwaz","ghsBuiltCenter":[70.7694,38.39001],"ghsBuiltCenters":[[70.69373,38.40057,3959.0],[70.81617,38.44315,3473.0],[70.54108,38.13021,1843.0],[70.84193,38.31455,1670.0],[70.69396,38.36864,1491.0]],"ghsBuiltMax":3959.0,"ghsBuiltWeight":140314.0,"ghsPopCenter":[70.79892,38.38974],"ghsPopCenters":[[71.04928,38.40678,3822.0],[70.81617,38.44315,2720.0],[70.69373,38.40057,2541.0],[70.84193,38.31455,1308.0],[70.54108,38.13021,1183.0]],"ghsPopMaxDensity":3822.0,"ghsPopulation":100450.0,"isOuter":true},"type":"Feature"}],"type":"FeatureCollec
tion"} \ No newline at end of file diff --git a/cache/gadm/boundary_AFG.1.2_1_3.json b/cache/gadm/boundary_AFG.1.2_1_3.json deleted file mode 100644 index 3de0e20..0000000 --- a/cache/gadm/boundary_AFG.1.2_1_3.json +++ /dev/null @@ -1 +0,0 @@ -{"features":[{"geometry":{"coordinates":[[[71.2762,38.00465],[71.26561,38.006],[71.25337,38.01428],[71.22517,38.01237],[71.19342,38.01883],[71.17726,38.02659],[71.17789,38.03534],[71.17218,38.03826],[71.16112,38.02494],[71.15066,38.01851],[71.12814,38.01732],[71.11626,38.01307],[71.10431,38.01876],[71.09132,38.01694],[71.08074,38.01983],[71.07189,38.01053],[71.05569,38.00447],[71.03897,37.98],[71.01289,37.96274],[71.00059,37.95081],[71.00089,37.95954],[70.99486,37.96888],[70.95911,37.98344],[70.91583,38.00888],[70.80695,38.05487],[70.78855,38.05737],[70.76753,38.0549],[70.70683,38.03009],[70.68048,38.02772],[70.66789,38.03206],[70.6513,38.04594],[70.63457,38.05439],[70.58826,38.06125],[70.56323,38.07679],[70.54028,38.08434],[70.51607,38.08412],[70.4374,38.06392],[70.41875,38.07549],[70.41838,38.08248],[70.42033,38.08901],[70.43102,38.09893],[70.46798,38.10495],[70.49106,38.12076],[70.505,38.12257],[70.50031,38.14607],[70.50352,38.1644],[70.51794,38.19432],[70.53645,38.20976],[70.53726,38.2244],[70.54602,38.24455],[70.56711,38.26815],[70.60484,38.2833],[70.60899,38.29649],[70.60512,38.30585],[70.61463,38.3245],[70.61066,38.34573],[70.61833,38.35136],[70.6309,38.35353],[70.64323,38.34887],[70.65401,38.3495],[70.65928,38.35102],[70.66563,38.3586],[70.69146,38.36913],[70.69441,38.38095],[70.69186,38.38648],[70.68796,38.38932],[70.67838,38.38737],[70.67419,38.39047],[70.67468,38.40339],[70.67855,38.40888],[70.69831,38.41975],[70.7148,38.4145],[70.74165,38.41995],[70.7566,38.428],[70.77225,38.45615],[70.78256,38.45518],[70.78762,38.45091],[70.80948,38.44415],[70.82104,38.44799],[70.82435,38.45286],[70.82961,38.45212],[70.84317,38.4463],[70.84344,38.44035],[70.8501,38.43874],[70.86002,38.45011],[70.85907,38.45964],[70.86411,38.46035]
,[70.87345,38.46884],[70.89499,38.46514],[70.90042,38.45963],[70.9007,38.44741],[70.90692,38.44351],[70.9077,38.43891],[70.92131,38.43476],[70.92443,38.4303],[70.93446,38.43529],[70.9354,38.44066],[70.94362,38.44265],[70.94785,38.43847],[70.95433,38.43819],[70.95847,38.4418],[70.96202,38.45662],[70.94379,38.46688],[70.9472,38.47688],[70.9696,38.47609],[70.98946,38.49041],[70.99661,38.48767],[71.00134,38.47703],[71.00946,38.47116],[71.02091,38.46896],[71.0309,38.46125],[71.0294,38.45271],[71.03682,38.44218],[71.04692,38.41021],[71.05642,38.39895],[71.06659,38.40005],[71.06855,38.40467],[71.06647,38.41275],[71.07633,38.41411],[71.09161,38.42328],[71.10434,38.42241],[71.10783,38.41936],[71.10534,38.40898],[71.11013,38.40584],[71.12121,38.4063],[71.13288,38.39707],[71.14374,38.40255],[71.14958,38.39739],[71.15116,38.38928],[71.16306,38.38721],[71.17526,38.36986],[71.18422,38.34589],[71.1974,38.34302],[71.22577,38.32113],[71.2439,38.31803],[71.2527,38.31032],[71.27956,38.31457],[71.29659,38.31296],[71.31345,38.30438],[71.32798,38.30544],[71.334,38.29334],[71.32713,38.28508],[71.33293,38.28072],[71.33456,38.27015],[71.34693,38.27131],[71.36104,38.26833],[71.37412,38.25563],[71.36751,38.22331],[71.37737,38.21522],[71.37899,38.21031],[71.36478,38.19609],[71.3663,38.17847],[71.37576,38.1608],[71.36604,38.15059],[71.34839,38.14753],[71.33983,38.13181],[71.33657,38.11368],[71.3302,38.11209],[71.32637,38.1029],[71.31956,38.0998],[71.3212,38.07051],[71.31136,38.06038],[71.30648,38.04686],[71.30022,38.04295],[71.29439,38.04479],[71.28365,38.04029],[71.28629,38.03307],[71.29458,38.0294],[71.29507,38.01832],[71.2762,38.00465]]],"type":"Polygon"},"properties":{"GID_3":"","NAME_3":"","ghsBuiltCenter":[70.7694,38.39001],"ghsBuiltCenters":[[70.69373,38.40057,3959.0],[70.81617,38.44315,3473.0],[70.54108,38.13021,1843.0],[70.84193,38.31455,1670.0],[70.69396,38.36864,1491.0]],"ghsBuiltMax":3959.0,"ghsBuiltWeight":140314.0,"ghsPopCenter":[70.79892,38.38974],"ghsPopCenters":[[71.04928,38.40
678,3822.0],[70.81617,38.44315,2720.0],[70.69373,38.40057,2541.0],[70.84193,38.31455,1308.0],[70.54108,38.13021,1183.0]],"ghsPopMaxDensity":3822.0,"ghsPopulation":100450.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/cache/gadm/boundary_AFG.1.2_1_4.json b/cache/gadm/boundary_AFG.1.2_1_4.json deleted file mode 100644 index 5047d9a..0000000 --- a/cache/gadm/boundary_AFG.1.2_1_4.json +++ /dev/null @@ -1 +0,0 @@ -{"features":[{"geometry":{"coordinates":[[[71.2762,38.00465],[71.26561,38.006],[71.25337,38.01428],[71.22517,38.01237],[71.19342,38.01883],[71.17726,38.02659],[71.17789,38.03534],[71.17218,38.03826],[71.16112,38.02494],[71.15066,38.01851],[71.12814,38.01732],[71.11626,38.01307],[71.10431,38.01876],[71.09132,38.01694],[71.08074,38.01983],[71.07189,38.01053],[71.05569,38.00447],[71.03897,37.98],[71.01289,37.96274],[71.00059,37.95081],[71.00089,37.95954],[70.99486,37.96888],[70.95911,37.98344],[70.91583,38.00888],[70.80695,38.05487],[70.78855,38.05737],[70.76753,38.0549],[70.70683,38.03009],[70.68048,38.02772],[70.66789,38.03206],[70.6513,38.04594],[70.63457,38.05439],[70.58826,38.06125],[70.56323,38.07679],[70.54028,38.08434],[70.51607,38.08412],[70.4374,38.06392],[70.41875,38.07549],[70.41838,38.08248],[70.42033,38.08901],[70.43102,38.09893],[70.46798,38.10495],[70.49106,38.12076],[70.505,38.12257],[70.50031,38.14607],[70.50352,38.1644],[70.51794,38.19432],[70.53645,38.20976],[70.53726,38.2244],[70.54602,38.24455],[70.56711,38.26815],[70.60484,38.2833],[70.60899,38.29649],[70.60512,38.30585],[70.61463,38.3245],[70.61066,38.34573],[70.61833,38.35136],[70.6309,38.35353],[70.64323,38.34887],[70.65401,38.3495],[70.65928,38.35102],[70.66563,38.3586],[70.69146,38.36913],[70.69441,38.38095],[70.69186,38.38648],[70.68796,38.38932],[70.67838,38.38737],[70.67419,38.39047],[70.67468,38.40339],[70.67855,38.40888],[70.69831,38.41975],[70.7148,38.4145],[70.74165,38.41995],[70.7566,38.428],[70.77225,38.45615],[70.78256,38.45518],[70.
78762,38.45091],[70.80948,38.44415],[70.82104,38.44799],[70.82435,38.45286],[70.82961,38.45212],[70.84317,38.4463],[70.84344,38.44035],[70.8501,38.43874],[70.86002,38.45011],[70.85907,38.45964],[70.86411,38.46035],[70.87345,38.46884],[70.89499,38.46514],[70.90042,38.45963],[70.9007,38.44741],[70.90692,38.44351],[70.9077,38.43891],[70.92131,38.43476],[70.92443,38.4303],[70.93446,38.43529],[70.9354,38.44066],[70.94362,38.44265],[70.94785,38.43847],[70.95433,38.43819],[70.95847,38.4418],[70.96202,38.45662],[70.94379,38.46688],[70.9472,38.47688],[70.9696,38.47609],[70.98946,38.49041],[70.99661,38.48767],[71.00134,38.47703],[71.00946,38.47116],[71.02091,38.46896],[71.0309,38.46125],[71.0294,38.45271],[71.03682,38.44218],[71.04692,38.41021],[71.05642,38.39895],[71.06659,38.40005],[71.06855,38.40467],[71.06647,38.41275],[71.07633,38.41411],[71.09161,38.42328],[71.10434,38.42241],[71.10783,38.41936],[71.10534,38.40898],[71.11013,38.40584],[71.12121,38.4063],[71.13288,38.39707],[71.14374,38.40255],[71.14958,38.39739],[71.15116,38.38928],[71.16306,38.38721],[71.17526,38.36986],[71.18422,38.34589],[71.1974,38.34302],[71.22577,38.32113],[71.2439,38.31803],[71.2527,38.31032],[71.27956,38.31457],[71.29659,38.31296],[71.31345,38.30438],[71.32798,38.30544],[71.334,38.29334],[71.32713,38.28508],[71.33293,38.28072],[71.33456,38.27015],[71.34693,38.27131],[71.36104,38.26833],[71.37412,38.25563],[71.36751,38.22331],[71.37737,38.21522],[71.37899,38.21031],[71.36478,38.19609],[71.3663,38.17847],[71.37576,38.1608],[71.36604,38.15059],[71.34839,38.14753],[71.33983,38.13181],[71.33657,38.11368],[71.3302,38.11209],[71.32637,38.1029],[71.31956,38.0998],[71.3212,38.07051],[71.31136,38.06038],[71.30648,38.04686],[71.30022,38.04295],[71.29439,38.04479],[71.28365,38.04029],[71.28629,38.03307],[71.29458,38.0294],[71.29507,38.01832],[71.2762,38.00465]]],"type":"Polygon"},"properties":{"GID_4":"","NAME_4":"","ghsBuiltCenter":[70.7694,38.39001],"ghsBuiltCenters":[[70.69373,38.40057,3959.0],[70.81617,
38.44315,3473.0],[70.54108,38.13021,1843.0],[70.84193,38.31455,1670.0],[70.69396,38.36864,1491.0]],"ghsBuiltMax":3959.0,"ghsBuiltWeight":140314.0,"ghsPopCenter":[70.79892,38.38974],"ghsPopCenters":[[71.04928,38.40678,3822.0],[70.81617,38.44315,2720.0],[70.69373,38.40057,2541.0],[70.84193,38.31455,1308.0],[70.54108,38.13021,1183.0]],"ghsPopMaxDensity":3822.0,"ghsPopulation":100450.0},"type":"Feature"}],"type":"FeatureCollection"} \ No newline at end of file diff --git a/docs/workers.md b/docs/workers.md deleted file mode 100644 index b6e2d7b..0000000 --- a/docs/workers.md +++ /dev/null @@ -1,459 +0,0 @@ -# Running Products via Native Node.js Worker Threads - -Moving heavy queues (like `ImagesProduct` crunching images via `sharp`, or `LocationsProduct` running grid searches) out of the main Event Loop is essential to preserve API performance and maintain a high Event Loop FPS. - -We orchestrate this entirely within Node.js using the native `worker_threads` module, driven by a centralized JSON configuration. No PM2 dependency is required. - ---- - -## Architecture: Config-Driven Worker Spawning - -The application topology is defined in `server/config/products.json`. The main thread reads this file on boot. If a product has `"workers" > 0`, the main thread spawns dedicated native `Worker` threads to handle its `pg-boss` background jobs — while still registering the product's HTTP routes on the main thread. - -### 1. 
The Configuration Format (`config/products.json`) - -Each product entry specifies: -- **`name`** — maps to a key in `PRODUCT_IMPORTS` in `registry.ts` -- **`enabled`** — whether to load the product at all -- **`workers`** — how many native Worker threads to spawn (0 = run everything on the main thread) -- **`deps`** — informational dependency list - -```json -{ - "products": [ - { "name": "images", "enabled": true, "workers": 1, "deps": ["serving", "storage"] }, - { "name": "videos", "enabled": true, "workers": 0, "deps": ["serving", "storage"] }, - { "name": "locations", "enabled": true, "workers": 0, "deps": ["serving", "storage"] }, - { "name": "serving", "enabled": true, "workers": 0, "deps": ["images"] }, - { "name": "email", "enabled": true, "workers": 0, "deps": [] }, - { "name": "openai", "enabled": true, "workers": 0, "deps": [] }, - { "name": "analytics", "enabled": true, "workers": 0, "deps": [] }, - { "name": "storage", "enabled": true, "workers": 0, "deps": [] }, - { "name": "ecommerce", "enabled": true, "workers": 0, "deps": ["images"] }, - { "name": "contacts", "enabled": true, "workers": 0, "deps": [] }, - { "name": "campaigns", "enabled": true, "workers": 0, "deps": ["contacts"] }, - { "name": "mcp", "enabled": true, "workers": 0, "deps": ["serving"] } - ] -} -``` - -### 2. Main Thread: The Orchestrator (`src/products/registry.ts`) - -Boot-up is split into two phases: - -**Phase 1 — `registerProductRoutes(app)`:** Reads `products.json`, lazy-imports only the enabled product modules via a `PRODUCT_IMPORTS` map (avoids importing everything on boot), instantiates them, and registers their HTTP routes on the Hono app. - -**Phase 2 — `startProducts(boss)`:** For each product: -- If `workers > 0`, spawns native Worker threads (see §3). -- Always calls `product.start(boss)` on the main thread so the product can register pg-boss queue names and perform local init. 
- -```typescript -// Lazy imports — only loaded when the product is enabled -const PRODUCT_IMPORTS: Record Promise> = { - 'images': () => import('./images/index.js'), - 'videos': () => import('./videos/index.js'), - 'locations': () => import('./locations/index.js'), - // ... all 12 products -}; - -export const startProducts = async (boss?: any) => { - for (const product of instances) { - const pConfig = product.__config; - - if (pConfig && pConfig.workers > 0) { - const isDev = process.env.NODE_ENV !== 'production'; - - // Dev: uses vite-node wrapper to load TS directly - // Prod: uses pre-bundled worker.cjs - let workerEntry = isDev - ? path.resolve(process.cwd(), 'src', 'worker_wrapper.mjs') - : path.resolve(process.cwd(), 'worker.cjs'); - - for (let i = 0; i < pConfig.workers; i++) { - const worker = new Worker(workerEntry, { - workerData: { productName: pConfig.name, workerScript } - }); - nativeWorkers.push({ id: product.id, worker }); - - // Forward EventBus events from worker → main thread - worker.on('message', (msg) => { - if (msg?.type === 'event' && msg.name) { - EventBus.emit(msg.name, msg.data); - } - }); - } - } - - // Main-thread init (HTTP deps, caching, boss queue creation) - await product.start(boss); - } -}; -``` - -### 3. Worker Entrypoint (`src/worker.ts`) - -When a Worker thread boots, `worker.ts` is loaded. It reads `workerData.productName`, instantiates the matching product class, and starts only its pg-boss consumers. It does **not** start an HTTP server. - -Key responsibilities: -- **PG-Boss queue consumers** — the product's `onStart(boss)` registers workers for its queues. -- **IPC health checks** — responds to `{ type: 'ping' }` messages with `{ type: 'pong', activeJobs, ... }`. -- **IPC job dispatch** — handles `{ type: 'job' }` messages for synchronous request-response via `dispatchToWorker()`. -- **EventBus bridging** — forwards `job:progress`, `job:complete`, and `job:error` events to the parent thread via `parentPort.postMessage()`. 
- -```typescript -// worker.ts (runs inside the Worker thread) -import { workerData, isMainThread, parentPort } from 'worker_threads'; - -if (isMainThread) throw new Error('Must run inside a Worker thread.'); - -const ProductClass = PRODUCT_CLASSES[workerData.productName]; -const instance = new ProductClass(); - -// IPC: ping/pong + job dispatch -parentPort.on('message', async (msg) => { - if (msg.type === 'ping') return parentPort.postMessage({ type: 'pong', ... }); - if (msg.type === 'job') { /* handleJob → postMessage result */ } -}); - -// Bridge internal events to parent thread -EventBus.on('job:progress', (data) => parentPort.postMessage({ type: 'event', name: 'job:progress', data })); -EventBus.on('job:complete', (data) => parentPort.postMessage({ type: 'event', name: 'job:complete', data })); - -// Start isolated PG-Boss and bind the product -const workerBoss = await startBoss(); -await instance.start(workerBoss); -``` - -### 4. Dev Mode: `worker_wrapper.mjs` - -In dev, Worker threads can't inherit `tsx` hooks from the parent process. To support TypeScript directly, a plain `.mjs` bootstrap uses `vite-node`'s programmatic API to load and execute `worker.ts` with full TS resolution: - -```javascript -// worker_wrapper.mjs -import { workerData } from 'node:worker_threads'; -import { createServer } from 'vite'; -import { ViteNodeRunner } from 'vite-node/client'; - -const server = await createServer({ /* hmr: false, @-alias setup */ }); -const runner = new ViteNodeRunner({ root, base, fetchModule, resolveId }); -await runner.executeFile(workerData.workerScript); -``` - -### 5. Smart Consumer Skipping - -Products that support pg-boss workers (like `LocationsProduct`) use this pattern in `onStart()` to avoid double-consuming: - -```typescript -async onStart(boss?: PgBoss) { - const { isMainThread } = await import('node:worker_threads'); - const workersConfig = this.__config?.workers ?? 
0; - const shouldConsume = !isMainThread || workersConfig === 0; - - for (const WorkerClass of this.workers) { - const worker = new WorkerClass(); - await boss.createQueue(worker.queueName); - if (shouldConsume) { - await boss.work(worker.queueName, options, worker.handler.bind(worker)); - } - } -} -``` - -If the product is running with dedicated Worker threads (`workers > 0`), the main thread skips consuming from pg-boss queues — only the Worker threads will consume them. - -### 6. IPC Job Dispatch (`src/commons/worker-ipc.ts`) - -For synchronous request-response between the main thread and worker threads (e.g., image processing called from an HTTP handler), there is a utility: - -```typescript -import { dispatchToWorker, hasWorker } from '@/commons/worker-ipc.js'; - -// Check if a live worker exists -if (await hasWorker('images')) { - const result = await dispatchToWorker('images', 'process_image', { buffer, ... }, [buffer]); -} -``` - -- Uses round-robin across multiple worker threads for the same product. -- Supports zero-copy `ArrayBuffer` transfers via the `transferList` parameter. -- Has a configurable timeout (default 30s). - ---- - -## Base Classes - -### `AbstractProduct` (`src/products/AbstractProduct.ts`) -All products extend this. Provides: -- `start(boss)` / `stop()` lifecycle hooks -- `handleJob(action, msg)` — for IPC job dispatch from worker threads -- `handleStream()` — SSE streaming helper with cache-checking -- `generateHash()` — deterministic deep-sorted SHA-256 hashing - -### `AbstractWorker` (`src/jobs/boss/AbstractWorker.ts`) -PG-Boss queue consumers extend this. Provides: -- `queueName` — the pg-boss queue to consume -- `process(job)` — override with business logic -- `calculateCost(job, result)` — usage metering -- `handler()` — wraps `process()` with error handling and emits `job:complete` / `job:failed` - -Worker classes use the `@Worker(queueName)` decorator for registration. 
- ---- - -## Case Study: `ImagesProduct` — The Canonical Worker-Offloaded Product - -`ImagesProduct` (`src/products/images/index.ts`) is currently the **only product running with `workers: 1`** in production. It demonstrates the full IPC lifecycle — from HTTP request through worker dispatch to cached response. It does **not** use `AbstractWorker` or pg-boss queues; instead, it uses the synchronous IPC dispatch pattern via `worker-ipc.ts`. - -### The Hybrid Pattern: `hasWorker` + Inline Fallback - -Every image processing path checks whether a live worker thread exists. If yes, the heavy `sharp` work is offloaded. If no (e.g., during tests, or if `workers: 0` in config), it falls back to inline processing on the main thread: - -```typescript -// src/products/images/index.ts — _ensureCachedImage() -if (await hasWorker('images')) { - // Zero-copy transfer: copy Buffer into a transferable ArrayBuffer - const arrayBuffer = new ArrayBuffer(inputBuffer.length); - new Uint8Array(arrayBuffer).set(inputBuffer); - - await dispatchToWorker('images', 'process_image', { - buffer: arrayBuffer, width, height, format, fit - }, [arrayBuffer]); // ← transfer list: moves memory, doesn't clone -} else { - // Inline fallback (same thread) - const pipeline = sharp(inputBuffer).resize({ width, height, fit }).toFormat(format); - await fs.writeFile(filepath, await pipeline.toBuffer()); -} -``` - -This pattern is used in three HTTP handlers: -- **`handlePostImage`** — file upload → resize → cache (or forward to Supabase Storage) -- **`handleRenderImage`** — URL → fetch → resize → serve as binary (used by lazy srcset URLs) -- **`handlePostResponsive`** / **`handleGetResponsive`** — generate multi-format, multi-size srcset variants - -### Worker-Side: `handleJob()` Actions - -Inside the worker thread, the `ImagesProduct` instance receives IPC job messages and routes them by `action`: - -```typescript -// src/products/images/index.ts — handleJob() -async handleJob(action: string, msg: any): 
Promise { - if (action === 'process_image') { - // Reconstruct Buffer from transferred ArrayBuffer - const inputBuffer = Buffer.from(msg.buffer); - await this.performProcessImage(inputBuffer, filepath, { width, height, format, fit }); - return { filename }; - } - if (action === 'render_image') { - // Supports square crop, contain fit, etc. - await this.performRenderImage(inputBuffer, filepath, { width, height, format, square, contain }); - return { filename }; - } - return super.handleJob(action, msg); // Throws for unknown actions -} -``` - -Both actions write the processed image to the shared `cache/` directory on disk. The main thread then reads the file to serve or forward the response. - -### The Responsive Image Pipeline - -The responsive endpoint generates multiple width × format variants (e.g., `[180, 640, 1024, 2048] × [avif, webp]`). It splits work between **eager** and **lazy** generation: - -| Variant Width | Strategy | What Happens | -|--------------|----------|--------------| -| ≤ 600px | **Eager** | Processed immediately (via worker or inline) and cached to disk. Returns direct cache URL. | -| > 600px | **Lazy** | Returns a dynamic `/api/images/render?url=...&width=...&format=...` URL. Processed on-demand when the browser requests it. | - -This avoids eagerly generating large, rarely-used variants for every upload while ensuring small thumbnails are always instant. - -### Request Coalescing - -When multiple concurrent requests reference the same source URL, `fetchImageCoalesced()` deduplicates them using an in-flight `Map>`. Only one HTTP fetch goes out; all callers share the same Promise. - -### Data Flow Summary - -``` -HTTP Request (main thread) - → hasWorker('images')? 
──yes──→ dispatchToWorker() - │ │ - │ ├─ postMessage({ type:'job', action:'render_image', buffer }, [buffer]) - │ │ ↓ (zero-copy ArrayBuffer transfer) - │ │ Worker Thread: handleJob('render_image', msg) - │ │ ↓ - │ │ sharp(buffer).resize().toFormat().toFile(filepath) - │ │ ↓ (streams directly to disk) - │ └─ postMessage({ type:'job_result', result: { filename } }) - │ ↓ - │ main thread: fs.readFile(cache/hash.format) - │ ↓ - │ return c.redirect() or c.body() - │ - └──no──→ Inline: sharp().resize().toFile(filepath) → serve -``` - ---- - -## Why this Pattern is Powerful - -1. **Zero PM2 Dependency:** Entirely native to Node.js. Containerization, Nexe builds — nothing changes. -2. **True Multi-Core Utilization:** `worker_threads` run on distinct OS threads. Setting `workers: 2` for `images` dedicates two CPU cores to Sharp. -3. **API Immunity:** Workers have their own V8 heap and Event Loop. A massive image resize will have zero impact on the main API's Event Loop FPS. -4. **EventBus Bridging:** Worker events (progress, completion) are forwarded to the main thread via IPC `postMessage`, enabling real-time SSE streams to API clients. -5. **Dev/Prod Parity:** The `worker_wrapper.mjs` + vite-node setup means TypeScript runs natively in dev worker threads, while production uses pre-bundled JS — same behavior in both environments. -6. **Round-Robin Dispatch:** The `worker-ipc.ts` utility distributes synchronous job requests across multiple threads, enabling true horizontal scaling within a single process. - ---- - -## Constraints & Gotchas (Lessons from Inngest + Our Benchmarks) - -Node.js worker threads have real constraints that Go/Rust/Python developers would never expect. The [Inngest post on worker threads](https://www.inngest.com/blog/node-worker-threads) formalizes these well. Here's how each constraint applies to **our** architecture: - -### 1. Workers Are NOT Lightweight - -Each worker thread is a **full V8 isolate** — its own heap, its own event loop. 
~10 MB memory overhead per worker, with tens-of-milliseconds startup cost. This is why our `products.json` caps workers at 1-2 per product, and workers are spawned **once at boot** and persist for the process lifetime. We never create/destroy workers per-job. - -### 2. You Can't Pass Logic — Only Messages - -Unlike Go goroutines or Rust threads, you can't pass a function to `new Worker()`. The structured clone algorithm can't serialize functions. This is why: - -- Our `EventBus` listeners live on the **main thread** — worker threads post `{ type: 'event' }` messages that get bridged to the main-thread EventBus -- Pino `logger` instances can't cross the boundary — worker threads use their own logger -- `pg-boss` connections are per-thread — each worker establishes its own - -### 3. Bundler Discovery Is Fragile - -Bundlers (webpack) can't statically analyze `new Worker(path)`. Our approach: - -- **Dev:** `worker_wrapper.mjs` uses vite-node's `ViteNodeRunner` to resolve TypeScript at runtime -- **Prod:** `build.sh` compiles `worker.ts` → `worker.cjs` as a separate webpack entry point, and the registry uses `__dirname + '/worker.cjs'` — a plain string the bundler can't trace - -Both paths are hardcoded and tested — no dynamic path construction that could break silently. - -### 4. Dev-Mode vite-node Overhead (CRITICAL) - -Benchmarked 2024-03-24, same 386KB JPEG source at 800px webp: - -| Path | Encode Time | Notes | -|------|-------------|-------| -| Worker thread (vite-node) | **3:265** (3.26s) | IPC + vite-node module transform overhead | -| Main thread (inline) | **0:140** (140ms) | Direct sharp call, no IPC | - -**~23× slower in dev mode via worker thread.** The vite-node `ViteNodeRunner` inside the worker's V8 isolate adds massive overhead for module resolution and transformation. Sharp itself (native C++ addon) runs at the same speed — the cost is entirely in the JS wrapper. 
- -In **production** with pre-bundled `worker.cjs`, the worker thread runs at near-native speed. The overhead is a **dev-only artifact**. - -> **Practical implication:** Consider setting `"workers": 0` for `images` during local development to avoid the vite-node penalty. The main thread handles 140ms encodes without impacting dev-server responsiveness. - -### 5. No Respawning (Current Gap) - -Inngest implements exponential backoff respawning — if a worker thread crashes (unhandled exception, OOM), the main thread detects the `exit` event and spins up a replacement with increasing delay. - -**We don't do this yet.** If a worker thread dies, it's gone until a full server restart. The `registry.ts` spawner doesn't watch for `exit` events. This is acceptable for now because: - -- Workers are simple (sharp pipeline, no external connections beyond pg-boss) -- Crashes are rare in production -- The inline fallback (`hasWorker() === false`) means the main thread picks up the work - -But for robustness, adding respawn-with-backoff to the worker spawner in `registry.ts` would be a good future improvement. - -### 6. Elastic Autospawning & Tier-Based Limits (Grid Searches) - -Monolithic jobs that process tens of thousands of items (e.g., massive Grid Searches) expose a flaw in static worker pools: **head-of-line blocking**. If all workers are occupied by a massive Enterprise search, Free/Pro users starve. - -To solve this we use an **Elastic Autospawn / Fan-Out Architecture**: - -1. **Fan-Out (Map-Reduce):** Instead of processing 10,000 grid cells in a single Node.js worker loop, an *Orchestrator* job enumerates the area and splits it into 10,000 individual `gridsearch-cell` jobs pushed to PG-Boss. -2. **Tier-Based Queue Routing/Throttling:** We use PG-Boss `singletonKey` (tied to `userId`) and tier-based concurrency limits (e.g., `teamConcurrency: 5` for Pro vs `20` for Enterprise) to ensure fairness at the database queue level. -3. 
**Distributed SSE (Pub/Sub):** Because micro-jobs fan out across multiple elastic workers, tying SSE to a local `EventBus` via `parentPort` fails. Instead, workers emit progress via **Postgres `NOTIFY`** or Supabase Realtime channels. The main API process (handling the SSE route) uses `LISTEN` to receive events from any worker on any machine, bridging them back to the user's HTTP stream. - ---- - -## Exploring Native (Rust/C++) Replacements - -Given the constraints of V8 Isolates (10MB overhead, slow startup, lack of shared memory serialization), a viable future replacement for CPU-bound or massively concurrent products (like `images` or `locations` grid searches) is replacing Node.js `worker_threads` with **Per-Product Rust or C++ implementations (Binaries or N-API)**. - -If a Native (Rust/C++) worker is implemented: -- **Fast Autospawn:** Native binaries spawn in under 1ms. If compiled as an N-API native module (via `napi-rs` or `node-addon-api` for C++), worker execution is effectively instantaneous function calls avoiding V8 Isolate boot. -- **IPC Performance:** - - Subprocesses communicating via raw UNIX socket or `stdout` streams provide near-native memory transfer without structured-clone serialization bounds. - - N-API bindings allow direct zero-copy memory (SharedArrayBuffer) access between the main Thread JavaScript and native execution. -- **Memory Efficiency:** A single Native concurrency pipeline scaling to 10,000 asynchronous grid cells uses a fraction of the RAM of dozens of isolated Node.js context engines. 
- -### Side-By-Side Comparison - -| Feature | Node.js `worker_threads` | Rust (N-API / Subprocess) | C++ (N-API / Subprocess) | -| :--- | :--- | :--- | :--- | -| **Startup Time** | ~30-50ms (V8 Isolate boot) | **<1ms** (Native / Binary spawn) | **<1ms** (Native / Binary spawn) | -| **Memory per Instance** | High (~10-30MB baseline) | **Minimal** (<2MB) | **Minimal** (<2MB) | -| **IPC Performance** | Slow (`postMessage` Structured Clone) | **High** (Zero-Copy SharedArrayBuffer or MsgPack UDS) | **High** (Zero-Copy SharedArrayBuffer or MsgPack UDS) | -| **Autospawning** | Poor (Spiking spawns causes OOM) | **Excellent** | **Excellent** | -| **Development Speed** | Fastest | Slower (Strict compiler, borrow checker) | Slower (Manual compilation, header management) | -| **Memory Safety** | High (V8 Engine) | **High** (Compiler-enforced lifetimes) | Lower (Prone to segfaults / memory leaks) | -| **Ecosystem (Parallelism)** | Limited (libuv threadpool) | **Best-in-class** (Tokio, Rayon) | Strong (std::thread, Boost) | - ---- - -## 7. Messaging: Internal & External Workers (Protobuf) - -When moving to an Elastic Autospawn architecture with Native workers, the serialization format and communication transport become the most crucial factors for performance and system integrity. - -### Why Protobuf? -While MessagePack over Unix Domain Sockets works, **Protocol Buffers (Protobuf)** offers several distinct advantages, especially when scaling from "Internal Subprocesses" to "External Distributed Workers": - -1. **Strict Type Contracts:** Both Node.js (TypeScript) and Native (Rust/C++) share the exact same `.proto` schema. If a payload field is required, the compiler ensures it exists. If the Node.js API changes a field structure, the Native worker fails to compile, preventing silent production parsing errors. -2. **Backwards Compatibility:** Protobuf is inherently designed for evolving APIs without breaking older workers. -3. 
**RPC Native (gRPC):** As we expand from *Internal Workers* on the same machine to *External Workers* on entirely different physical servers, Protobuf naturally upgrades into gRPC with zero serialization changes. - -### The "Dual Model" Architecture - -The beauty of standardizing on Protobuf is that the *exact same serialization code* is used regardless of where the worker lives. - -#### 1. Internal Workers (Local IPC via Subprocesses) -- **The Scenario:** The main Node.js API process spawns a native Rust/C++ executable as a child process on the **same machine**. -- **The Transport:** Unix Domain Sockets (UDS) / Named Pipes or Standard I/O (stdio). UDS is preferred because it's full-duplex and avoids Node's `stdout` buffering constraints. -- **How it works:** - 1. Node.js encodes the `JobPayload` message using the compiled `protobufjs` TypeScript library. - 2. Node.js writes the binary payload to the local UNIX Domain Socket (e.g., `/tmp/worker_grid_123.sock`). Because UDS is a TCP-like stream, payloads must be **length-prefixed** (e.g., 4 bytes for length, followed by the Protobuf bytes) so the receiver knows when the message ends. - 3. The Rust/C++ subprocess reads the length prefix, reads the exact byte count, and uses `prost` (Rust) or the Google Protobuf C++ library to deserialize instantly. - 4. The worker executes the CPU-heavy logic, serializes the `JobResult`, prefixes the length, and streams it back. - -#### 2. External Workers (Distributed Execution) -- **The Scenario:** Fanning out 10,000 Grid Search cells across dozens of physical worker nodes to prevent local CPU exhaustion. -- **The Transport:** Pg-Boss / Postgres (or gRPC). -- **How it works:** - 1. **The Queue:** The main Node.js process encodes the job payload via Protobuf and saves the raw bytes (or Base64-encoded bytes) into the `pgboss.job` table. - 2. **The Fleet:** Hundreds of external Rust/C++ worker nodes connect directly to the database layer (or via a gRPC interface) pulling jobs. - 3. 
**The Decoding:** The remote execution node pulls the binary payload and deserializes the Protobuf bytes. Since the schema is strict, all external workers instantly understand the payload, ensuring perfect schema synchronization across the heterogeneous distributed fleet. - ---- - -## 8. Storage & Database Integrations for Native Workers - -Transitioning to Native Autospawning workers heavily impacts how the database and storage layers scale, specifically around connection pooling, payload limits, and blob storage. - -### Connection Limits (Supavisor) -If 5,000 autospawned native processes all open distinct `libpq` connections to Postgres, the database will instantly lock up with `FATAL: too many clients`. -**The Rule:** All native workers (whether internal executables or external nodes) *must* connect to Postgres via a connection pooler like **Supavisor** or **PgBouncer**, which transparently multiplexes thousands of transient client connections onto a handful of persistent database connections. - -### Event Bus Limits (Postgres NOTIFY) -As established, we use `LISTEN / NOTIFY` to bridge Server-Sent Events (SSE) from the Native workers back to the Node.js API stream. -**The Constraint:** Postgres `NOTIFY` string payloads are hard-limited to **8000 bytes**. You cannot emit massive JSON/Protobuf result arrays over `NOTIFY`. It must only contain progression percentages or tiny metadata. - -### Returning Artifacts & Large Results -When a Native worker finishes crunching data, it needs to save the result. -1. **Small Results (JSON/Protobuf < 1MB):** - - The native worker calls the `pg-boss.complete(jobId, protobuf_bytes)` equivalent, storing the payload back in the `pgboss.job` table. -2. **Tabular Results (Big Data):** - - e.g., 50,000 scraped locations from a massive grid cell. 
The native worker uses the incredibly fast SQL `COPY` command (bulk insert) to slam the data directly into a dedicated Postgres table (e.g., `places`), and completes the `pg-boss` job with an empty payload. -3. **Huge Blobs (Images / Videos / AI Models):** - - The native worker *does not touch Postgres for blobs*. The Node API orchestrator pre-signs a **Supabase Storage Upload URL** and embeds it in the job payload. The Native worker generates the 50MB file and streams it via `libcurl` directly to S3/Supabase Storage, completely bypassing the database stack. - ---- - -## 9. Next-level Abstracting: Embedded Scripting (Lua/WASM) - -While writing the *infrastructure layer* (UDS reading, Protobuf decoding, Postgres connection pooling) in strictly-typed Native code (Rust/C++) is essential for performance, writing volatile *business logic* (like search heuristics) in C++ hurts developer velocity and requires constant recompilations. - -To solve this we use the **Native Host + Embedded Scripting** pattern: -1. **The Architecture:** We compile a standalone Native Executable (the "Host") in Rust or C++. This host statically embeds a lightweight scripting engine (like **LuaJIT** or a **WASM** runtime like Wasmtime). -2. **Execution:** The Native Host safely handles all the heavy lifting—reading Unix Domain Sockets, managing DB connections, and parsing Protobuf. Once the payload is ready, it passes it into the embedded Lua state or WASM function instance. -3. **The Benefit:** Developers write the actual product logic in high-level Lua (or AssemblyScript for WASM). It executes wildly faster than Node.js (LuaJIT approaches raw C speed) while maintaining the tiny `<2MB` memory footprint, but allows for instant hot-reloading of the scripts without ever running a C++ compiler. 
diff --git a/image.png b/image.png new file mode 100644 index 0000000..a11316a Binary files /dev/null and b/image.png differ diff --git a/install-lnx.sh b/install-lnx.sh new file mode 100644 index 0000000..3fcc655 --- /dev/null +++ b/install-lnx.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# ───────────────────────────────────────────────────────────────────────────── +# install-lnx.sh – Install build dependencies for polymech-cli on Linux +# +# Tested on: Ubuntu 20.04+ / Debian 11+ +# Usage: sudo bash install-lnx.sh +# ───────────────────────────────────────────────────────────────────────────── +set -euo pipefail + +echo "── polymech-cli Linux dependency installer ──" + +# ── 1. System packages (apt) ───────────────────────────────────────────────── +echo "" +echo "[1/3] Installing system packages via apt …" +apt-get update -qq +apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + g++ \ + git \ + libssl-dev \ + pkg-config \ + snapd + +# ── 2. CMake ≥ 3.20 via snap ──────────────────────────────────────────────── +# The project requires cmake_minimum_required(VERSION 3.20). +# Ubuntu 20.04 ships cmake 3.16, so we use the snap package instead. +echo "" +echo "[2/3] Installing CMake via snap (≥ 3.20 required) …" +if command -v /snap/bin/cmake &>/dev/null; then + echo " cmake snap already installed: $(/snap/bin/cmake --version | head -1)" +else + snap install cmake --classic + echo " Installed: $(/snap/bin/cmake --version | head -1)" +fi + +# ── 3. Node.js (for npm run build:linux) ────────────────────────────────────── +echo "" +echo "[3/3] Checking for Node.js / npm …" +if command -v node &>/dev/null; then + echo " node $(node --version) already installed" +else + echo " Node.js not found. 
Install via nvm or nodesource, e.g.:" + echo " curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -" + echo " sudo apt-get install -y nodejs" +fi + +# ── Summary ────────────────────────────────────────────────────────────────── +echo "" +echo "── Done! ──" +echo "" +echo "All C++ dependencies (CLI11, tomlplusplus, Catch2, asio, concurrentqueue," +echo "taskflow, curl, lexbor, rapidjson) are fetched automatically by CMake" +echo "FetchContent at build time — no manual installation needed." +echo "" +echo "To build:" +echo " cd $(dirname "$0")" +echo " npm run build:linux" +echo "" +echo "The binary will be placed in: dist/polymech-cli" diff --git a/orchestrator/spawn.mjs b/orchestrator/spawn.mjs index ef85528..1e8980c 100644 --- a/orchestrator/spawn.mjs +++ b/orchestrator/spawn.mjs @@ -6,8 +6,7 @@ * * Usage: * import { spawnWorker } from './spawn.mjs'; - * const w = await spawnWorker('./build/dev/Debug/polymech-cli.exe'); - * const res = await w.request({ type: 'ping' }); + * const w = await spawnWorker('./dist/polymech-cli.exe'); * console.log(res); // { id: '...', type: 'pong', payload: {} } * await w.shutdown(); */ @@ -71,6 +70,9 @@ export function spawnWorker(exePath, args = ['worker']) { // Pending request map: id → { resolve, reject, timer } const pending = new Map(); + // Event handler for unmatched messages (progress events, etc.) + let eventHandler = null; + let readyResolve; const ready = new Promise((resolve) => { readyResolve = resolve; }); @@ -97,8 +99,12 @@ export function spawnWorker(exePath, args = ['worker']) { return; } - // Unmatched message (event, broadcast, etc.) - console.log('[orchestrator] unmatched message:', msg); + // Unmatched message (progress event, broadcast, etc.) 
+ if (eventHandler) { + eventHandler(msg); + } else { + console.log('[orchestrator] unmatched message:', msg); + } }); proc.stdout.on('data', feedData); @@ -148,5 +154,6 @@ export function spawnWorker(exePath, args = ['worker']) { kill: () => proc.kill(), process: proc, ready, + onEvent: (handler) => { eventHandler = handler; }, }; } diff --git a/orchestrator/test-gridsearch-ipc-daemon.mjs b/orchestrator/test-gridsearch-ipc-daemon.mjs new file mode 100644 index 0000000..feb501e --- /dev/null +++ b/orchestrator/test-gridsearch-ipc-daemon.mjs @@ -0,0 +1,204 @@ +/** + * orchestrator/test-gridsearch-ipc.mjs + * + * E2E test: spawn the C++ worker, send a gridsearch request + * matching `npm run gridsearch:enrich` defaults, collect IPC events, + * and verify the full event sequence. + * + * Run: node orchestrator/test-gridsearch-ipc.mjs + * Needs: npm run build-debug (or npm run build) + */ + +import { spawnWorker } from './spawn.mjs'; +import { resolve, dirname } from 'node:path'; +import { readFileSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import fs from 'node:fs'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const IS_WIN = process.platform === 'win32'; +const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli'; + +const EXE = resolve(__dirname, '..', 'dist', EXE_NAME); +if (!fs.existsSync(EXE)) { + console.error(`❌ No ${EXE_NAME} found in dist. 
Run npm run build first.`); + process.exit(1); +} +console.log(`Binary: ${EXE}\n`); + +// Load the sample settings (same as gridsearch:enrich) +const sampleConfig = JSON.parse( + readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8') +); + +let passed = 0; +let failed = 0; + +function assert(condition, label) { + if (condition) { + console.log(` ✅ ${label}`); + passed++; + } else { + console.error(` ❌ ${label}`); + failed++; + } +} + +// ── Event collector ───────────────────────────────────────────────────────── + +const EXPECTED_EVENTS = [ + 'grid-ready', + 'waypoint-start', + 'area', + 'location', + 'enrich-start', + 'node', + 'nodePage', + // 'node-error' — may or may not occur, depends on network +]; + +function createCollector() { + const events = {}; + for (const t of ['grid-ready', 'waypoint-start', 'area', 'location', + 'enrich-start', 'node', 'node-error', 'nodePage']) { + events[t] = []; + } + return { + events, + handler(msg) { + const t = msg.type; + if (events[t]) { + events[t].push(msg); + } else { + events[t] = [msg]; + } + // Live progress indicator + const d = msg.payload ?? {}; + if (t === 'waypoint-start') { + process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`); + } else if (t === 'node') { + process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `); + } else if (t === 'node-error') { + process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `); + } + }, + }; +} + +// ── Main test ─────────────────────────────────────────────────────────────── + +async function run() { + console.log('🧪 Gridsearch IPC E2E Test\n'); + + // ── 1. Spawn worker ─────────────────────────────────────────────────── + console.log('1. 
Spawn worker in daemon mode'); + const worker = spawnWorker(EXE, ['worker', '--daemon', '--user-uid', '3bb4cfbf-318b-44d3-a9d3-35680e738421']); + const readyMsg = await worker.ready; + assert(readyMsg.type === 'ready', 'Worker sends ready signal'); + + // ── 2. Register event collector ─────────────────────────────────────── + const collector = createCollector(); + worker.onEvent(collector.handler); + + // ── 3. Send gridsearch request (matching gridsearch:enrich) ──────────── + console.log('2. Send gridsearch request (Aruba / recycling / --enrich)'); + const t0 = Date.now(); + + // Very long timeout — enrichment can take minutes + const result = await worker.request( + { + type: 'gridsearch', + payload: { + ...sampleConfig, + enrich: true, + }, + }, + 5 * 60 * 1000 // 5 min timeout + ); + + const elapsed = ((Date.now() - t0) / 1000).toFixed(1); + console.log(`\n\n ⏱️ Completed in ${elapsed}s\n`); + + // ── 4. Verify final result ──────────────────────────────────────────── + console.log('3. Verify job_result'); + assert(result.type === 'job_result', `Response type is "job_result" (got "${result.type}")`); + + const summary = result.payload ?? 
null; + assert(summary !== null, 'job_result payload is present'); + + if (summary) { + assert(typeof summary.totalMs === 'number', `totalMs is number (${summary.totalMs})`); + assert(typeof summary.searchMs === 'number', `searchMs is number (${summary.searchMs})`); + assert(typeof summary.enrichMs === 'number', `enrichMs is number (${summary.enrichMs})`); + assert(typeof summary.freshApiCalls === 'number', `freshApiCalls is number (${summary.freshApiCalls})`); + assert(typeof summary.waypointCount === 'number', `waypointCount is number (${summary.waypointCount})`); + assert(summary.gridStats && typeof summary.gridStats.validCells === 'number', 'gridStats.validCells present'); + assert(summary.searchStats && typeof summary.searchStats.totalResults === 'number', 'searchStats.totalResults present'); + assert(typeof summary.enrichedOk === 'number', `enrichedOk is number (${summary.enrichedOk})`); + assert(typeof summary.enrichedTotal === 'number', `enrichedTotal is number (${summary.enrichedTotal})`); + } + + // ── 5. Verify event sequence ────────────────────────────────────────── + console.log('4. 
Verify event stream'); + const e = collector.events; + + assert(e['grid-ready'].length === 1, `Exactly 1 grid-ready event (got ${e['grid-ready'].length})`); + assert(e['waypoint-start'].length > 0, `At least 1 waypoint-start event (got ${e['waypoint-start'].length})`); + assert(e['area'].length > 0, `At least 1 area event (got ${e['area'].length})`); + assert(e['waypoint-start'].length === e['area'].length, `waypoint-start count (${e['waypoint-start'].length}) === area count (${e['area'].length})`); + assert(e['enrich-start'].length === 1, `Exactly 1 enrich-start event (got ${e['enrich-start'].length})`); + + const totalNodes = e['node'].length + e['node-error'].length; + assert(totalNodes > 0, `At least 1 node event (got ${totalNodes}: ${e['node'].length} ok, ${e['node-error'].length} errors)`); + + // Validate grid-ready payload + if (e['grid-ready'].length > 0) { + const gr = e['grid-ready'][0].payload ?? {}; + assert(Array.isArray(gr.areas), 'grid-ready.areas is array'); + assert(typeof gr.total === 'number' && gr.total > 0, `grid-ready.total > 0 (${gr.total})`); + } + + // Validate location events have required fields + if (e['location'].length > 0) { + const loc = e['location'][0].payload ?? {}; + assert(loc.location && typeof loc.location.title === 'string', 'location event has location.title'); + assert(loc.location && typeof loc.location.place_id === 'string', 'location event has location.place_id'); + assert(typeof loc.areaName === 'string', 'location event has areaName'); + } + assert(e['location'].length > 0, `At least 1 location event (got ${e['location'].length})`); + + // Validate node payloads + if (e['node'].length > 0) { + const nd = e['node'][0].payload ?? {}; + assert(typeof nd.placeId === 'string', 'node event has placeId'); + assert(typeof nd.title === 'string', 'node event has title'); + assert(Array.isArray(nd.emails), 'node event has emails array'); + assert(typeof nd.status === 'string', 'node event has status'); + } + + // ── 6. 
Print event summary ──────────────────────────────────────────── + console.log('\n5. Event summary'); + for (const [type, arr] of Object.entries(e)) { + if (arr.length > 0) console.log(` ${type}: ${arr.length}`); + } + + // ── 7. Shutdown ─────────────────────────────────────────────────────── + console.log('\n6. Graceful shutdown'); + const shutdownRes = await worker.shutdown(); + assert(shutdownRes.type === 'shutdown_ack', 'Shutdown acknowledged'); + + await new Promise(r => setTimeout(r, 500)); + assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`); + + // ── Summary ─────────────────────────────────────────────────────────── + console.log(`\n────────────────────────────────`); + console.log(` Passed: ${passed} Failed: ${failed}`); + console.log(`────────────────────────────────\n`); + + process.exit(failed > 0 ? 1 : 0); +} + +run().catch((err) => { + console.error('Test runner error:', err); + process.exit(1); +}); diff --git a/orchestrator/test-gridsearch-ipc-uds-meta.mjs b/orchestrator/test-gridsearch-ipc-uds-meta.mjs new file mode 100644 index 0000000..a6e7707 --- /dev/null +++ b/orchestrator/test-gridsearch-ipc-uds-meta.mjs @@ -0,0 +1,218 @@ +/** + * orchestrator/test-gridsearch-ipc-uds-meta.mjs + * + * E2E test for Unix Domain Sockets / Windows Named Pipes (Meta Enrichment)! + * Spawns the worker in `--uds` mode and tests direct high-throughput + * lock-free JSON binary framing over a net.Socket. + */ + +import { spawn } from 'node:child_process'; +import { resolve, dirname, join } from 'node:path'; +import { readFileSync, existsSync, unlinkSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import net from 'node:net'; +import { tmpdir } from 'node:os'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const IS_WIN = process.platform === 'win32'; +const EXE_NAME = IS_WIN ? 
'polymech-cli.exe' : 'polymech-cli'; +const EXE = resolve(__dirname, '..', 'dist', EXE_NAME); +const TEST_CANCEL = false; + +if (!existsSync(EXE)) { + console.error(`❌ Binary not found at ${EXE}`); + process.exit(1); +} + +const PIPE_NAME = 'polymech-test-uds-meta'; +const CPP_UDS_ARG = IS_WIN ? '4001' : join(tmpdir(), `${PIPE_NAME}.sock`); + +if (!IS_WIN && existsSync(CPP_UDS_ARG)) { + unlinkSync(CPP_UDS_ARG); +} + +console.log(`Binary: ${EXE}`); +console.log(`C++ Arg: ${CPP_UDS_ARG}\n`); + +// ── Event collector ───────────────────────────────────────────────────────── +function createCollector() { + const events = {}; + for (const t of ['grid-ready', 'waypoint-start', 'area', 'location', + 'enrich-start', 'node', 'node-error', 'nodePage', 'job_result']) { + events[t] = []; + } + return { + events, + onComplete: null, + handler(msg) { + const t = msg.type; + if (events[t]) events[t].push(msg); + else events[t] = [msg]; + + const d = msg.data ?? {}; + if (t === 'waypoint-start') { + process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`); + } else if (t === 'node') { + process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `); + } else if (t === 'node-error') { + process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `); + } else if (t === 'job_result') { + console.log(`\n 🏁 Pipeline complete!`); + if (this.onComplete) this.onComplete(msg); + } + }, + }; +} + +let passed = 0; +let failed = 0; +function assert(condition, label) { + if (condition) { console.log(` ✅ ${label}`); passed++; } + else { console.error(` ❌ ${label}`); failed++; } +} + +async function run() { + console.log('🧪 Gridsearch UDS Meta E2E Test\n'); + + // 1. Spawn worker in UDS mode + console.log('1. Spawning remote C++ Taskflow Daemon'); + const worker = spawn(EXE, ['worker', '--uds', CPP_UDS_ARG, '--daemon'], { stdio: 'inherit' }); + + // Give the daemon a moment to boot + console.log('2. 
Connecting net.Socket with retries...'); + + let socket; + for (let i = 0; i < 15; i++) { + try { + await new Promise((resolve, reject) => { + if (IS_WIN) { + socket = net.connect({ port: 4001, host: '127.0.0.1' }); + } else { + socket = net.connect(CPP_UDS_ARG); + } + socket.once('connect', resolve); + socket.once('error', reject); + }); + console.log(' ✅ Socket Connected to UDS!'); + break; + } catch (e) { + if (i === 14) throw e; + await new Promise(r => setTimeout(r, 500)); + } + } + + const collector = createCollector(); + let buffer = Buffer.alloc(0); + + // Buffer framing logic (length-prefixed streaming) + socket.on('data', (chunk) => { + buffer = Buffer.concat([buffer, chunk]); + while (buffer.length >= 4) { + const len = buffer.readUInt32LE(0); + if (buffer.length >= 4 + len) { + const payload = buffer.toString('utf8', 4, 4 + len); + buffer = buffer.subarray(4 + len); + try { + const msg = JSON.parse(payload); + collector.handler(msg); + } catch (e) { + console.error("JSON PARSE ERROR:", e, payload); + } + } else { + break; // Wait for more chunks + } + } + }); + + // 3. Send Gridsearch payload + // USE gridsearch-sample.json instead of gridsearch-bcn-universities.json + const sampleConfig = JSON.parse( + readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8') + ); + + sampleConfig.configPath = resolve(__dirname, '..', 'config', 'postgres.toml'); + sampleConfig.jobId = 'uds-meta-test-abc'; + sampleConfig.noCache = true; // force re-enrichment even if cached + + console.log('3. Writing serialized IPC Payload over pipe...'); + const jsonStr = JSON.stringify(sampleConfig); + const lenBuf = Buffer.alloc(4); + lenBuf.writeUInt32LE(Buffer.byteLength(jsonStr)); + socket.write(lenBuf); + socket.write(jsonStr); + + // 4. Wait for pipeline completion (job_result event) or timeout + console.log('\n4. 
Awaiting multi-threaded Execution Pipeline (can take minutes)...\n'); + + await new Promise((resolve) => { + collector.onComplete = () => { + // Send stop command to gracefully shut down the daemon + console.log(' 📤 Sending stop command to daemon...'); + const stopPayload = JSON.stringify({ action: 'stop' }); + const stopLen = Buffer.alloc(4); + stopLen.writeUInt32LE(Buffer.byteLength(stopPayload)); + socket.write(stopLen); + socket.write(stopPayload); + setTimeout(resolve, 1000); // Give daemon a moment to ack + }; + + // Safety timeout + setTimeout(() => { + console.log('\n ⏰ Timeout reached (300s) — forcing shutdown.'); + resolve(); + }, 300000); // Wait up to 5 minutes + }); + + console.log('\n\n5. Event summary'); + for (const [k, v] of Object.entries(collector.events)) { + console.log(` ${k}: ${v.length}`); + } + + // Assertions + const ev = collector.events; + assert(ev['grid-ready'].length === 1, 'grid-ready emitted once'); + assert(ev['waypoint-start'].length > 0, 'waypoint-start events received'); + assert(ev['location'].length > 0, 'location events received'); + assert(ev['enrich-start'].length === 1, 'enrich-start emitted once'); + assert(ev['job_result'].length === 1, 'job_result emitted once'); + + // Verify social profiles and md body + const nodes = ev['node']; + let foundSocial = false; + let foundSiteMd = false; + + for (const n of nodes) { + const d = n.data; + if (!d) continue; + + if (d.socials && d.socials.length > 0) { + foundSocial = true; + } + + if (d.sites && Array.isArray(d.sites) && d.sites.length > 0) { + foundSiteMd = true; + } + } + + if (foundSocial) { + assert(foundSocial, 'At least one enriched node has social media profiles discovered'); + } else { + console.log(' ⚠️ No social media profiles discovered in this run (data-dependent), but pipeline completed.'); + } + + assert(foundSiteMd, 'At least one enriched node has markdown sites mapped'); + + console.log('6. 
Cleanup'); + socket.destroy(); + worker.kill('SIGTERM'); + + console.log(`\n────────────────────────────────`); + console.log(` Passed: ${passed} Failed: ${failed}`); + console.log(`────────────────────────────────`); + process.exit(failed > 0 ? 1 : 0); +} + +run().catch(e => { + console.error(e); + process.exit(1); +}); diff --git a/orchestrator/test-gridsearch-ipc-uds.mjs b/orchestrator/test-gridsearch-ipc-uds.mjs new file mode 100644 index 0000000..549d3e0 --- /dev/null +++ b/orchestrator/test-gridsearch-ipc-uds.mjs @@ -0,0 +1,255 @@ +/** + * orchestrator/test-gridsearch-ipc-uds.mjs + * + * E2E test for Unix Domain Sockets / Windows Named Pipes! + * Spawns the worker in `--uds` mode and tests direct high-throughput + * lock-free JSON binary framing over a net.Socket. + */ + +import { spawn } from 'node:child_process'; +import { resolve, dirname, join } from 'node:path'; +import { readFileSync, existsSync, unlinkSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import net from 'node:net'; +import { tmpdir } from 'node:os'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const IS_WIN = process.platform === 'win32'; +const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli'; +const EXE = resolve(__dirname, '..', 'dist', EXE_NAME); +const TEST_CANCEL = false; + +if (!existsSync(EXE)) { + console.error(`❌ Binary not found at ${EXE}`); + process.exit(1); +} + +const PIPE_NAME = 'polymech-test-uds'; +const CPP_UDS_ARG = IS_WIN ? 
'4000' : join(tmpdir(), `${PIPE_NAME}.sock`); + +if (!IS_WIN && existsSync(CPP_UDS_ARG)) { + unlinkSync(CPP_UDS_ARG); +} + +console.log(`Binary: ${EXE}`); +console.log(`C++ Arg: ${CPP_UDS_ARG}\n`); + +// ── Event collector ───────────────────────────────────────────────────────── +function createCollector() { + const events = {}; + for (const t of ['grid-ready', 'waypoint-start', 'area', 'location', + 'enrich-start', 'node', 'node-error', 'nodePage', 'job_result']) { + events[t] = []; + } + return { + events, + onComplete: null, + handler(msg) { + const t = msg.type; + if (events[t]) events[t].push(msg); + else events[t] = [msg]; + + const d = msg.data ?? {}; + if (t === 'waypoint-start') { + process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`); + } else if (t === 'node') { + process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `); + } else if (t === 'node-error') { + process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `); + } else if (t === 'job_result') { + console.log(`\n 🏁 Pipeline complete!`); + if (this.onComplete) this.onComplete(msg); + } + }, + }; +} + +let passed = 0; +let failed = 0; +function assert(condition, label) { + if (condition) { console.log(` ✅ ${label}`); passed++; } + else { console.error(` ❌ ${label}`); failed++; } +} + +async function run() { + console.log('🧪 Gridsearch UDS / Named Pipe E2E Test\n'); + + // 1. Spawn worker in UDS mode + console.log('1. Spawning remote C++ Taskflow Daemon'); + const worker = spawn(EXE, ['worker', '--uds', CPP_UDS_ARG, '--daemon'], { stdio: 'inherit' }); + + // Give the daemon a moment to boot + console.log('2. 
Connecting net.Socket with retries...'); + + let socket; + for (let i = 0; i < 15; i++) { + try { + await new Promise((resolve, reject) => { + if (IS_WIN) { + socket = net.connect({ port: 4000, host: '127.0.0.1' }); + } else { + socket = net.connect(CPP_UDS_ARG); + } + socket.once('connect', resolve); + socket.once('error', reject); + }); + console.log(' ✅ Socket Connected to UDS!'); + break; + } catch (e) { + if (i === 14) throw e; + await new Promise(r => setTimeout(r, 500)); + } + } + + const collector = createCollector(); + let buffer = Buffer.alloc(0); + + // Buffer framing logic (length-prefixed streaming) + socket.on('data', (chunk) => { + buffer = Buffer.concat([buffer, chunk]); + while (buffer.length >= 4) { + const len = buffer.readUInt32LE(0); + if (buffer.length >= 4 + len) { + const payload = buffer.toString('utf8', 4, 4 + len); + buffer = buffer.subarray(4 + len); + try { + const msg = JSON.parse(payload); + collector.handler(msg); + } catch (e) { + console.error("JSON PARSE ERROR:", e, payload); + } + } else { + break; // Wait for more chunks + } + } + }); + + // 3. Send Gridsearch payload + const sampleConfig = JSON.parse( + readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-bcn-universities.json'), 'utf8') + ); + + sampleConfig.configPath = resolve(__dirname, '..', 'config', 'postgres.toml'); + sampleConfig.jobId = 'uds-test-cancel-abc'; + + console.log('3. 
Writing serialized IPC Payload over pipe...'); + const jsonStr = JSON.stringify(sampleConfig); + const lenBuf = Buffer.alloc(4); + lenBuf.writeUInt32LE(Buffer.byteLength(jsonStr)); + socket.write(lenBuf); + socket.write(jsonStr); + + // Send cancellation after 5 seconds + if (TEST_CANCEL) { + setTimeout(() => { + console.log('\n\n--> Testing Dynamic Cancellation (Sending cancel event for uds-test-cancel-abc)...'); + const cancelPayload = JSON.stringify({ action: "cancel", jobId: "uds-test-cancel-abc" }); + const cancelLenBuf = Buffer.alloc(4); + cancelLenBuf.writeUInt32LE(Buffer.byteLength(cancelPayload)); + socket.write(cancelLenBuf); + socket.write(cancelPayload); + }, 5000); + } + + // 4. Wait for pipeline completion (job_result event) or timeout + console.log('\n4. Awaiting multi-threaded Execution Pipeline (can take minutes)...\n'); + + await new Promise((resolve) => { + collector.onComplete = () => { + // Send stop command to gracefully shut down the daemon + console.log(' 📤 Sending stop command to daemon...'); + const stopPayload = JSON.stringify({ action: 'stop' }); + const stopLen = Buffer.alloc(4); + stopLen.writeUInt32LE(Buffer.byteLength(stopPayload)); + socket.write(stopLen); + socket.write(stopPayload); + setTimeout(resolve, 1000); // Give daemon a moment to ack + }; + + // Safety timeout + setTimeout(() => { + console.log('\n ⏰ Timeout reached (120s) — forcing shutdown.'); + resolve(); + }, 120000); + }); + + console.log('\n\n5. 
Event summary'); + for (const [k, v] of Object.entries(collector.events)) { + console.log(` ${k}: ${v.length}`); + } + + // Assertions + const ev = collector.events; + assert(ev['grid-ready'].length === 1, 'grid-ready emitted once'); + assert(ev['waypoint-start'].length > 0, 'waypoint-start events received'); + assert(ev['location'].length > 0, 'location events received'); + assert(ev['enrich-start'].length === 1, 'enrich-start emitted once'); + assert(ev['job_result'].length === 1, 'job_result emitted once'); + + // Check enrichment skip log (if present in log events) + const logEvents = ev['log'] ?? []; + const skipLog = logEvents.find(l => + typeof l.data === 'string' && l.data.includes('already enriched') + ); + const nodeCount = ev['node'].length + ev['node-error'].length; + if (skipLog) { + console.log(` ℹ️ Pre-enrich skip detected: ${skipLog.data}`); + assert(nodeCount === 0, 'no enrichment needed (all skipped)'); + } else { + console.log(' ℹ️ No pre-enrich skips (all locations are new or unenriched)'); + assert(nodeCount > 0, 'enrichment node events received'); + } + + // Check filterTypes assertions: all locations must have website + matching type + const FILTER_TYPE = 'Recycling center'; + const locations = ev['location']; + const badWebsite = locations.filter(l => { + const loc = l.data?.location; + return !loc?.website; + }); + + assert(badWebsite.length === 0, `all locations have website (${badWebsite.length} missing)`); + + const badType = locations.filter(l => { + const loc = l.data?.location; + const types = loc?.types ?? []; + const type = loc?.type ?? 
''; + return !types.includes(FILTER_TYPE) && type !== FILTER_TYPE; + }); + if (badType.length > 0) { + console.log(` ❌ Mismatched locations:`); + badType.slice(0, 3).forEach(l => console.log(JSON.stringify(l.data?.location, null, 2))); + } + assert(badType.length === 0, `all locations match type "${FILTER_TYPE}" (${badType.length} mismatched)`); + + const filterLog = logEvents.find(l => + typeof l.data === 'string' && l.data.includes('locations removed') + ); + if (filterLog) { + console.log(` ℹ️ Filter applied: ${filterLog.data}`); + } + + const filterTypesLog = logEvents.filter(l => + typeof l.data === 'string' && (l.data.includes('filterTypes:') || l.data.includes(' - ')) + ); + if (filterTypesLog.length > 0) { + console.log(` ℹ️ Parsed filterTypes in C++:`); + filterTypesLog.forEach(l => console.log(` ${l.data}`)); + } + + console.log(` ℹ️ Locations after filter: ${locations.length}`); + + console.log('6. Cleanup'); + socket.destroy(); + worker.kill('SIGTERM'); + + console.log(`\n────────────────────────────────`); + console.log(` Passed: ${passed} Failed: ${failed}`); + console.log(`────────────────────────────────`); + process.exit(failed > 0 ? 1 : 0); +} + +run().catch(e => { + console.error(e); + process.exit(1); +}); diff --git a/orchestrator/test-gridsearch-ipc.mjs b/orchestrator/test-gridsearch-ipc.mjs new file mode 100644 index 0000000..13c9306 --- /dev/null +++ b/orchestrator/test-gridsearch-ipc.mjs @@ -0,0 +1,204 @@ +/** + * orchestrator/test-gridsearch-ipc.mjs + * + * E2E test: spawn the C++ worker, send a gridsearch request + * matching `npm run gridsearch:enrich` defaults, collect IPC events, + * and verify the full event sequence. 
+ * + * Run: node orchestrator/test-gridsearch-ipc.mjs + * Needs: npm run build-debug (or npm run build) + */ + +import { spawnWorker } from './spawn.mjs'; +import { resolve, dirname } from 'node:path'; +import { readFileSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import fs from 'node:fs'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const IS_WIN = process.platform === 'win32'; +const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli'; + +const EXE = resolve(__dirname, '..', 'dist', EXE_NAME); +if (!fs.existsSync(EXE)) { + console.error(`❌ No ${EXE_NAME} found in dist. Run npm run build first.`); + process.exit(1); +} +console.log(`Binary: ${EXE}\n`); + +// Load the sample settings (same as gridsearch:enrich) +const sampleConfig = JSON.parse( + readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8') +); + +let passed = 0; +let failed = 0; + +function assert(condition, label) { + if (condition) { + console.log(` ✅ ${label}`); + passed++; + } else { + console.error(` ❌ ${label}`); + failed++; + } +} + +// ── Event collector ───────────────────────────────────────────────────────── + +const EXPECTED_EVENTS = [ + 'grid-ready', + 'waypoint-start', + 'area', + 'location', + 'enrich-start', + 'node', + 'nodePage', + // 'node-error' — may or may not occur, depends on network +]; + +function createCollector() { + const events = {}; + for (const t of ['grid-ready', 'waypoint-start', 'area', 'location', + 'enrich-start', 'node', 'node-error', 'nodePage']) { + events[t] = []; + } + return { + events, + handler(msg) { + const t = msg.type; + if (events[t]) { + events[t].push(msg); + } else { + events[t] = [msg]; + } + // Live progress indicator + const d = msg.payload ?? {}; + if (t === 'waypoint-start') { + process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`); + } else if (t === 'node') { + process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? 
''} `); + } else if (t === 'node-error') { + process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `); + } + }, + }; +} + +// ── Main test ─────────────────────────────────────────────────────────────── + +async function run() { + console.log('🧪 Gridsearch IPC E2E Test\n'); + + // ── 1. Spawn worker ─────────────────────────────────────────────────── + console.log('1. Spawn worker'); + const worker = spawnWorker(EXE); + const readyMsg = await worker.ready; + assert(readyMsg.type === 'ready', 'Worker sends ready signal'); + + // ── 2. Register event collector ─────────────────────────────────────── + const collector = createCollector(); + worker.onEvent(collector.handler); + + // ── 3. Send gridsearch request (matching gridsearch:enrich) ──────────── + console.log('2. Send gridsearch request (Aruba / recycling / --enrich)'); + const t0 = Date.now(); + + // Very long timeout — enrichment can take minutes + const result = await worker.request( + { + type: 'gridsearch', + payload: { + ...sampleConfig, + enrich: true, + }, + }, + 5 * 60 * 1000 // 5 min timeout + ); + + const elapsed = ((Date.now() - t0) / 1000).toFixed(1); + console.log(`\n\n ⏱️ Completed in ${elapsed}s\n`); + + // ── 4. Verify final result ──────────────────────────────────────────── + console.log('3. Verify job_result'); + assert(result.type === 'job_result', `Response type is "job_result" (got "${result.type}")`); + + const summary = result.payload ?? 
null; + assert(summary !== null, 'job_result payload is present'); + + if (summary) { + assert(typeof summary.totalMs === 'number', `totalMs is number (${summary.totalMs})`); + assert(typeof summary.searchMs === 'number', `searchMs is number (${summary.searchMs})`); + assert(typeof summary.enrichMs === 'number', `enrichMs is number (${summary.enrichMs})`); + assert(typeof summary.freshApiCalls === 'number', `freshApiCalls is number (${summary.freshApiCalls})`); + assert(typeof summary.waypointCount === 'number', `waypointCount is number (${summary.waypointCount})`); + assert(summary.gridStats && typeof summary.gridStats.validCells === 'number', 'gridStats.validCells present'); + assert(summary.searchStats && typeof summary.searchStats.totalResults === 'number', 'searchStats.totalResults present'); + assert(typeof summary.enrichedOk === 'number', `enrichedOk is number (${summary.enrichedOk})`); + assert(typeof summary.enrichedTotal === 'number', `enrichedTotal is number (${summary.enrichedTotal})`); + } + + // ── 5. Verify event sequence ────────────────────────────────────────── + console.log('4. 
Verify event stream'); + const e = collector.events; + + assert(e['grid-ready'].length === 1, `Exactly 1 grid-ready event (got ${e['grid-ready'].length})`); + assert(e['waypoint-start'].length > 0, `At least 1 waypoint-start event (got ${e['waypoint-start'].length})`); + assert(e['area'].length > 0, `At least 1 area event (got ${e['area'].length})`); + assert(e['waypoint-start'].length === e['area'].length, `waypoint-start count (${e['waypoint-start'].length}) === area count (${e['area'].length})`); + assert(e['enrich-start'].length === 1, `Exactly 1 enrich-start event (got ${e['enrich-start'].length})`); + + const totalNodes = e['node'].length + e['node-error'].length; + assert(totalNodes > 0, `At least 1 node event (got ${totalNodes}: ${e['node'].length} ok, ${e['node-error'].length} errors)`); + + // Validate grid-ready payload + if (e['grid-ready'].length > 0) { + const gr = e['grid-ready'][0].payload ?? {}; + assert(Array.isArray(gr.areas), 'grid-ready.areas is array'); + assert(typeof gr.total === 'number' && gr.total > 0, `grid-ready.total > 0 (${gr.total})`); + } + + // Validate location events have required fields + if (e['location'].length > 0) { + const loc = e['location'][0].payload ?? {}; + assert(loc.location && typeof loc.location.title === 'string', 'location event has location.title'); + assert(loc.location && typeof loc.location.place_id === 'string', 'location event has location.place_id'); + assert(typeof loc.areaName === 'string', 'location event has areaName'); + } + assert(e['location'].length > 0, `At least 1 location event (got ${e['location'].length})`); + + // Validate node payloads + if (e['node'].length > 0) { + const nd = e['node'][0].payload ?? {}; + assert(typeof nd.placeId === 'string', 'node event has placeId'); + assert(typeof nd.title === 'string', 'node event has title'); + assert(Array.isArray(nd.emails), 'node event has emails array'); + assert(typeof nd.status === 'string', 'node event has status'); + } + + // ── 6. 
Print event summary ──────────────────────────────────────────── + console.log('\n5. Event summary'); + for (const [type, arr] of Object.entries(e)) { + if (arr.length > 0) console.log(` ${type}: ${arr.length}`); + } + + // ── 7. Shutdown ─────────────────────────────────────────────────────── + console.log('\n6. Graceful shutdown'); + const shutdownRes = await worker.shutdown(); + assert(shutdownRes.type === 'shutdown_ack', 'Shutdown acknowledged'); + + await new Promise(r => setTimeout(r, 500)); + assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`); + + // ── Summary ─────────────────────────────────────────────────────────── + console.log(`\n────────────────────────────────`); + console.log(` Passed: ${passed} Failed: ${failed}`); + console.log(`────────────────────────────────\n`); + + process.exit(failed > 0 ? 1 : 0); +} + +run().catch((err) => { + console.error('Test runner error:', err); + process.exit(1); +}); diff --git a/orchestrator/test-ipc.mjs b/orchestrator/test-ipc.mjs index 6180eb1..b63e179 100644 --- a/orchestrator/test-ipc.mjs +++ b/orchestrator/test-ipc.mjs @@ -12,7 +12,7 @@ import { resolve, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; const __dirname = dirname(fileURLToPath(import.meta.url)); -const EXE = resolve(__dirname, '..', 'build', 'dev', 'Debug', 'polymech-cli.exe'); +const EXE = resolve(__dirname, '..', 'dist', 'polymech-cli.exe'); let passed = 0; let failed = 0; diff --git a/package.json b/package.json index fe8eac5..408d2c4 100644 --- a/package.json +++ b/package.json @@ -8,18 +8,26 @@ "scripts": { "config": "cmake --preset dev", "config:release": "cmake --preset release", - "build": "cmake --build --preset dev", - "build:release": "cmake --build --preset release", + "build": "cmake --preset dev && cmake --build --preset dev", + "build:release": "cmake --preset release && cmake --build --preset release", + "build:linux": "bash build-linux.sh", "test": "ctest 
--test-dir build/dev -C Debug --output-on-failure", "test:release": "ctest --test-dir build/release -C Release --output-on-failure", - "clean": "cmake -E rm -rf build/dev", - "clean:release": "cmake -E rm -rf build/release", - "clean:all": "cmake -E rm -rf build", - "rebuild": "npm run clean && npm run config && npm run build", - "run": ".\\build\\dev\\Debug\\polymech-cli.exe --help", - "worker": ".\\build\\dev\\Debug\\polymech-cli.exe worker", - "test:ipc": "node orchestrator/test-ipc.mjs", - "gridsearch": ".\\build\\Debug\\polymech-cli.exe gridsearch ABW recycling --dry-run" + "clean": "cmake -E rm -rf build dist", + "rebuild": "npm run clean && npm run build", + "run": ".\\dist\\polymech-cli.exe --help", + "worker": ".\\dist\\polymech-cli.exe worker", + "test:ipc": "node orchestrator/test-gridsearch-ipc.mjs", + "gridsearch": ".\\dist\\polymech-cli.exe gridsearch ABW recycling --dry-run", + "gridsearch:settings": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json --dry-run", + "gridsearch:settings:live": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json", + "gridsearch:enrich": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json --enrich", + "gridsearch:enrich-test": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-test-bcn.json --enrich --persistence-postgres", + "test:gridsearch-ipc": "node orchestrator/test-gridsearch-ipc.mjs", + "test:ipc:daemon": "node orchestrator/test-gridsearch-ipc-daemon.mjs", + "test:ipc:uds": "node orchestrator/test-gridsearch-ipc-uds.mjs", + "test:ipc:uds-meta": "node orchestrator/test-gridsearch-ipc-uds-meta.mjs", + "test:html": "cmake --preset release && cmake --build --preset release --target test_html && .\\dist\\test_html.exe" }, "repository": { "type": "git", diff --git a/packages/enrichers/CMakeLists.txt b/packages/enrichers/CMakeLists.txt new file mode 100644 index 0000000..5d9338d --- /dev/null +++ 
b/packages/enrichers/CMakeLists.txt @@ -0,0 +1,4 @@ +add_library(enrichers STATIC src/enrichers.cpp) + +target_include_directories(enrichers PUBLIC include) +target_link_libraries(enrichers PUBLIC http html json logger) diff --git a/packages/enrichers/include/enrichers/enrichers.h b/packages/enrichers/include/enrichers/enrichers.h new file mode 100644 index 0000000..d053bab --- /dev/null +++ b/packages/enrichers/include/enrichers/enrichers.h @@ -0,0 +1,162 @@ +#pragma once + +#include +#include +#include + +namespace enrichers { + +// ── Status codes ──────────────────────────────────────────────────────────── + +enum class EnrichStatus { + OK, + NO_EMAIL, + META_TIMEOUT, + EMAIL_TIMEOUT, + FETCH_ERROR, + NO_PAGES, + ERROR, +}; + +const char *status_string(EnrichStatus s); + +// ── Data types ────────────────────────────────────────────────────────────── + +struct PageError { + std::string url; + std::string status; // "SEARCHED_EMAIL", "FAILED", ... + std::string method; // "GET", "SCRAPELESS", ... + std::string error; + int http_status = 0; + std::vector emails; +}; + +struct SocialLink { + std::string platform; // "instagram", "facebook", "linkedin", ... 
+ std::string url; +}; + +struct SiteMeta { + std::string title; + std::string description; + std::string og_image; + std::string canonical; + std::vector socials; + std::vector internal_pages; // discovered internal hrefs + std::vector emails; + std::string body_text; + std::string body_html; + std::map sites; // url -> body_md + int http_status = 0; + std::string fetch_error; + std::vector json_ld; +}; + +struct EnrichedNode { + int idx = 0; + std::string title; + std::string place_id; + std::string website; + std::string address; + std::string type; + std::string grid_area; + std::string grid_gid; + int pages_found = 0; + int pages_scraped = 0; + std::vector emails; + std::vector socials; + int meta_ms = 0; + int email_ms = 0; + int total_ms = 0; + EnrichStatus status = EnrichStatus::NO_EMAIL; + std::string error; + std::map pages; // "home" → body text + std::vector meta_pages; + std::vector page_errors; + std::string enricher_hash; + std::string geo_json; + std::map sites; // url -> body_md +}; + +// ── Configuration ─────────────────────────────────────────────────────────── + +struct EnrichConfig { + bool enable_homepage_md = true; + int meta_timeout_ms = 10000; + int email_timeout_ms = 15000; + int email_page_timeout_ms = 10000; + int email_max_pages = 8; + int email_abort_after = 1; + + /// Scrapeless API key — if set, pages that yield no emails via plain + /// HTTP GET will be re-fetched through the Scrapeless Universal Scraping + /// API (JS rendering). Leave empty to disable the fallback. 
+ std::string scrapeless_key; + + std::string bigdata_key; + + std::vector contact_patterns = { + "contact", "kontakt", "contacto", "contacta", "impression", + "about", "impress", "impressum", "datenschutz", "privacy", + "legal", "team", "nosotros", "empresa", "sobre", + }; + std::vector probe_paths = { + "/contact", "/contacto", "/kontakt", "/contacta", + "/about", "/about-us", "/impressum", + }; + + std::string meta_scraper; + int meta_concurrency = 5; + int meta_idle_timeout = 60; +}; + +// ── Location input ────────────────────────────────────────────────────────── + +struct LocationInput { + std::string title; + std::string place_id; + std::string website; + std::string address; + std::string type; + std::string grid_area; + std::string grid_gid; + double lat = 0; + double lng = 0; +}; + +// ── Core API ──────────────────────────────────────────────────────────────── + +/// Check if a candidate string looks like a real email address. +bool is_likely_email(const std::string &candidate); + +/// Extract all email addresses from a text body. +std::vector extract_emails(const std::string &text); + +/// Scrape metadata from a website URL (static HTML via libcurl + lexbor). +SiteMeta scrape_meta(const std::string &url, int timeout_ms = 10000); + +/// Scrape emails from a single page URL. +std::vector scrape_emails_from_page(const std::string &url, + int timeout_ms = 10000); + +/// Fetch a page via Scrapeless Universal Scraping API (JS rendering), +/// then extract emails from the rendered HTML. Returns empty if key is +/// blank or the API call fails. +std::vector scrape_emails_scrapeless(const std::string &url, + const std::string &api_key, + int timeout_ms = 15000); + +/// Scrape metadata from a website URL via Scrapeless Universal API (JS +/// rendering). +SiteMeta scrape_meta_scrapeless(const std::string &url, + const std::string &api_key, + int timeout_ms = 15000); + +/// Full enrichment pipeline for a single location: meta → email. 
+EnrichedNode enrich_location(const LocationInput &loc, + const EnrichConfig &cfg = {}); + +/// Resolve a URL relative to a base URL. +std::string resolve_url(const std::string &base, const std::string &href); + +} // namespace enrichers diff --git a/packages/enrichers/src/enrichers.cpp b/packages/enrichers/src/enrichers.cpp new file mode 100644 index 0000000..3497edc --- /dev/null +++ b/packages/enrichers/src/enrichers.cpp @@ -0,0 +1,800 @@ +#include "enrichers/enrichers.h" +#include "html/html.h" +#include "http/http.h" +#include "logger/logger.h" +#include "json/json.h" + +#include +#include +#include +#include +#include +#include + +namespace enrichers { + +// ── Status string ─────────────────────────────────────────────────────────── + +const char *status_string(EnrichStatus s) { + switch (s) { + case EnrichStatus::OK: + return "OK"; + case EnrichStatus::NO_EMAIL: + return "NO_EMAIL"; + case EnrichStatus::META_TIMEOUT: + return "META_TIMEOUT"; + case EnrichStatus::EMAIL_TIMEOUT: + return "EMAIL_TIMEOUT"; + case EnrichStatus::FETCH_ERROR: + return "FETCH_ERROR"; + case EnrichStatus::NO_PAGES: + return "NO_PAGES"; + case EnrichStatus::ERROR: + return "ERROR"; + } + return "UNKNOWN"; +} + +// ── Timing helper ─────────────────────────────────────────────────────────── + +static int elapsed_ms(std::chrono::steady_clock::time_point t0) { + auto now = std::chrono::steady_clock::now(); + return static_cast( + std::chrono::duration_cast(now - t0).count()); +} + +// ── Email extraction ──────────────────────────────────────────────────────── + +static const std::regex + EMAIL_RE(R"([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})", + std::regex::optimize); + +// Asset extensions that disqualify an email-like string +static const std::vector ASSET_EXTS = { + ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", + ".avif", ".css", ".js", ".woff", ".woff2", ".ttf", + ".eot", ".mp4", ".mp3", ".pdf", ".zip", ".ico", +}; + +static std::string to_lower(const std::string &s) { + 
std::string out = s; + std::transform(out.begin(), out.end(), out.begin(), + [](unsigned char c) { return std::tolower(c); }); + return out; +} + +bool is_likely_email(const std::string &candidate) { + if (candidate.size() < 5 || candidate.size() > 254) + return false; + if (candidate.find("..") != std::string::npos) + return false; + auto at_pos = candidate.find('@'); + if (at_pos == std::string::npos || at_pos == 0 || + at_pos == candidate.size() - 1) + return false; + + auto lower = to_lower(candidate); + + // Reject asset-like extensions + for (auto &ext : ASSET_EXTS) { + if (lower.size() >= ext.size() && + lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0) { + return false; + } + } + + // Reject common placeholders + if (lower.find("example") != std::string::npos) + return false; + if (lower.find("sentry") != std::string::npos) + return false; + if (lower.find("test") != std::string::npos) + return false; + if (lower.find("placeholder") != std::string::npos) + return false; + if (lower.find("wixpress.com") != std::string::npos) + return false; + + // Reject if local part is pure hex hash (8+ hex chars) + if (at_pos >= 8) { + auto local = lower.substr(0, at_pos); + bool all_hex = std::all_of(local.begin(), local.end(), [](char c) { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f'); + }); + if (all_hex) + return false; + } + + // Reject if domain part looks numeric-only (e.g. 1234@5678) + auto domain = lower.substr(at_pos + 1); + auto dot_pos = domain.find('.'); + if (dot_pos == std::string::npos) + return false; + if (domain.length() - dot_pos <= 2) + return false; // Minimum 2 chars for TLD + + auto domPart = domain.substr(0, dot_pos); + bool all_digits = + !domPart.empty() && + std::all_of(domPart.begin(), domPart.end(), + [](unsigned char c) { return std::isdigit(c); }); + if (all_digits) + return false; + + return true; +} + +static bool is_valid_email_char(char c) { + return std::isalnum(static_cast(c)) || c == '.' 
|| c == '_' || + c == '%' || c == '+' || c == '-'; +} + +std::vector extract_emails(const std::string &text) { + std::vector results; + if (text.empty()) + return results; + + std::set seen; + size_t pos = 0; + + while ((pos = text.find('@', pos)) != std::string::npos) { + if (pos == 0 || pos == text.length() - 1) { + pos++; + continue; + } + + // Scan backwards + size_t start = pos; + while (start > 0 && is_valid_email_char(text[start - 1])) { + start--; + } + + // Scan forwards + size_t end = pos; + while (end < text.length() - 1 && is_valid_email_char(text[end + 1])) { + end++; + } + + if (start < pos && end > pos) { + std::string candidate = text.substr(start, end - start + 1); + + // Strip trailing dots/hyphens eagerly grabbed + while (!candidate.empty() && + (candidate.back() == '.' || candidate.back() == '-')) { + candidate.pop_back(); + end--; + } + + // Strip leading dots/hyphens + size_t local_start = 0; + while (local_start < candidate.length() && + (candidate[local_start] == '.' 
|| candidate[local_start] == '-')) { + local_start++; + } + if (local_start > 0) { + candidate = candidate.substr(local_start); + } + + std::string lower = to_lower(candidate); + if (is_likely_email(lower)) { + if (seen.insert(lower).second) { + results.push_back(lower); + } + } + } + pos = end + 1; + } + + return results; +} + +// ── URL resolution ────────────────────────────────────────────────────────── + +std::string resolve_url(const std::string &base, const std::string &href) { + if (href.empty()) + return {}; + + // Already absolute + if (href.find("http://") == 0 || href.find("https://") == 0) + return href; + + // Protocol-relative + if (href.find("//") == 0) { + auto proto_end = base.find("//"); + if (proto_end != std::string::npos) { + return base.substr(0, proto_end) + href; + } + return "https:" + href; + } + + // Skip non-HTTP + if (href.find("mailto:") == 0 || href.find("tel:") == 0 || + href.find("javascript:") == 0 || href[0] == '#') { + return {}; + } + + // Relative path + // Find base origin: https://example.com + auto proto = base.find("://"); + if (proto == std::string::npos) + return {}; + auto origin_end = base.find('/', proto + 3); + std::string origin = + (origin_end != std::string::npos) ? 
base.substr(0, origin_end) : base; + + if (href[0] == '/') { + return origin + href; + } + + // Relative without leading slash + if (origin_end != std::string::npos) { + auto last_slash = base.rfind('/'); + if (last_slash > proto + 2) { + return base.substr(0, last_slash + 1) + href; + } + } + return origin + "/" + href; +} + +// ── Social link classification ────────────────────────────────────────────── + +static std::string classify_social(const std::string &url) { + auto lower = to_lower(url); + if (lower.find("instagram.com") != std::string::npos) + return "instagram"; + if (lower.find("facebook.com") != std::string::npos) + return "facebook"; + if (lower.find("linkedin.com") != std::string::npos) + return "linkedin"; + if (lower.find("twitter.com") != std::string::npos || + lower.find("x.com") != std::string::npos) + return "twitter"; + if (lower.find("youtube.com") != std::string::npos) + return "youtube"; + if (lower.find("tiktok.com") != std::string::npos) + return "tiktok"; + if (lower.find("pinterest.com") != std::string::npos) + return "pinterest"; + if (lower.find("github.com") != std::string::npos) + return "github"; + return {}; +} + +// ── Same-origin check ─────────────────────────────────────────────────────── + +static std::string get_origin(const std::string &url) { + auto proto = url.find("://"); + if (proto == std::string::npos) + return {}; + auto origin_end = url.find('/', proto + 3); + return (origin_end != std::string::npos) ? url.substr(0, origin_end) : url; +} + +static bool is_same_origin(const std::string &base_url, + const std::string &href) { + auto bo = to_lower(get_origin(base_url)); + auto ho = to_lower(get_origin(href)); + if (bo.empty() || ho.empty()) + return false; + // Strip www. 
for comparison + auto strip_www = [](std::string &s) { + auto pos = s.find("://www."); + if (pos != std::string::npos) { + s = s.substr(0, pos + 3) + s.substr(pos + 7); + } + }; + strip_www(bo); + strip_www(ho); + return bo == ho; +} + +// ── Contact page matching ─────────────────────────────────────────────────── + +static bool matches_contact_pattern(const std::string &url, + const std::vector &patterns) { + auto lower = to_lower(url); + for (auto &pat : patterns) { + if (lower.find(to_lower(pat)) != std::string::npos) + return true; + } + return false; +} + +// ── Shared HTML parsing logic for Meta ────────────────────────────────────── + +static SiteMeta parse_meta_html(const std::string &url, int http_status, + const std::string &html_body, + const std::string &fetch_error) { + SiteMeta meta; + meta.http_status = http_status; + + if (!fetch_error.empty()) { + meta.fetch_error = fetch_error; + return meta; + } + + meta.body_html = html_body; + + // Parse with lexbor helpers + meta.title = html::get_title(html_body); + meta.description = html::get_meta(html_body, "description"); + meta.og_image = html::get_meta(html_body, "og:image"); + meta.canonical = html::get_canonical(html_body); + meta.body_text = html::get_body_text(html_body); + meta.json_ld = html::get_json_ld(html_body); + + // OG fallbacks + if (meta.description.empty()) + meta.description = html::get_meta(html_body, "og:description"); + if (meta.title.empty()) + meta.title = html::get_meta(html_body, "og:title"); + + // Links — classify into social / internal / mailto + auto links = html::get_links(html_body); + std::set seen_pages; + + // Extract emails from body text (much smaller than raw HTML) + meta.emails = extract_emails(meta.body_text); + + for (auto &lk : links) { + if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) { + std::string email = lk.href.substr(7); + // Strip anything after ? (like ?subject=...) 
+ auto q = email.find('?'); + if (q != std::string::npos) + email = email.substr(0, q); + // Clean it + email = to_lower(email); + if (is_likely_email(email)) { + if (std::find(meta.emails.begin(), meta.emails.end(), email) == + meta.emails.end()) { + meta.emails.push_back(email); + } + } + continue; + } + + auto resolved = resolve_url(url, lk.href); + if (resolved.empty()) + continue; + + auto social = classify_social(resolved); + if (!social.empty()) { + meta.socials.push_back({social, resolved}); + continue; + } + + if (is_same_origin(url, resolved)) { + // Strip fragment (#) from URL + auto hash_pos = resolved.find('#'); + if (hash_pos != std::string::npos) { + resolved = resolved.substr(0, hash_pos); + } + if (!resolved.empty() && seen_pages.insert(resolved).second) { + meta.internal_pages.push_back(resolved); + } + } + } + + return meta; +} + +// ── scrape_meta ───────────────────────────────────────────────────────────── + +SiteMeta scrape_meta(const std::string &url, int timeout_ms) { + http::GetOptions opts; + opts.timeout_ms = timeout_ms; + opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36"; + + auto resp = http::get(url, opts); + std::string fetch_err; + if (resp.status_code < 0 || resp.status_code >= 400) { + fetch_err = resp.body; + } + return parse_meta_html(url, static_cast(resp.status_code), resp.body, + fetch_err); +} + +// ── scrape_emails_from_page ───────────────────────────────────────────────── + +std::vector scrape_emails_from_page(const std::string &url, + int timeout_ms, + int &out_status_code) { + http::GetOptions opts; + opts.timeout_ms = timeout_ms; + opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36"; + + auto resp = http::get(url, opts); + out_status_code = static_cast(resp.status_code); + if (resp.status_code < 0 || resp.status_code >= 400) { + return {}; 
+ } + + // Extract body text then find emails + auto text = html::get_body_text(resp.body); + auto from_text = extract_emails(text); + + // Extract mailto: links from HTML directly without regexing the huge string + auto links = html::get_links(resp.body); + std::set seen(from_text.begin(), from_text.end()); + + for (auto &lk : links) { + if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) { + std::string m = lk.href.substr(7); + auto q = m.find('?'); + if (q != std::string::npos) + m = m.substr(0, q); + m = to_lower(m); + if (is_likely_email(m)) { + if (seen.insert(m).second) { + from_text.push_back(m); + } + } + } + } + + return from_text; +} + +static std::string extract_scrapeless_html(const std::string &json_body) { + std::string data = json::get_string(json_body, "data"); + if (data.empty()) { + return json_body; // Fallback to raw response if not found + } + return data; +} + +SiteMeta scrape_meta_scrapeless(const std::string &url, + const std::string &api_key, int timeout_ms) { + if (api_key.empty()) + return parse_meta_html(url, 0, "", "missing api key"); + + std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" + + url + + R"(","jsRender":{"enabled":true,"headless":true}}})"; + + http::PostOptions opts; + opts.content_type = "application/json"; + opts.bearer_token = api_key; + opts.timeout_ms = + std::max(timeout_ms, 45000); // Scrapeless needs generous timeout + + auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request", + payload, opts); + + std::string fetch_err; + if (resp.status_code < 0 || resp.status_code >= 400) { + fetch_err = resp.body; + logger::error("[meta:scrapeless] API Error HTTP " + + std::to_string(resp.status_code) + " for " + url + " : " + + fetch_err); + return parse_meta_html(url, static_cast(resp.status_code), resp.body, + fetch_err); + } + + std::string rendered_html = extract_scrapeless_html(resp.body); + return parse_meta_html(url, static_cast(resp.status_code), 
rendered_html, + ""); +} + +std::vector scrape_emails_scrapeless(const std::string &url, + const std::string &api_key, + int timeout_ms) { + if (api_key.empty()) + return {}; + + // Build the Scrapeless Universal Scraping API request body. + // We ask for the fully-rendered HTML of the target URL. + std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" + + url + + R"(","jsRender":{"enabled":true,"headless":true}}})"; + + http::PostOptions opts; + opts.content_type = "application/json"; + opts.bearer_token = api_key; + opts.timeout_ms = + std::max(timeout_ms, 45000); // Scrapeless needs generous timeout + + auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request", + payload, opts); + + if (resp.status_code < 0 || resp.status_code >= 400) { + logger::error("[email:scrapeless] API Error HTTP " + + std::to_string(resp.status_code) + " for " + url + " : " + + resp.body); + return {}; // API error — silent fallback + } + + std::string rendered_html = extract_scrapeless_html(resp.body); + + // Parse and extract emails from the rendered HTML + auto text = html::get_body_text(rendered_html); + auto from_text = extract_emails(text); + + // Fast mailto extraction instead of HTML regex + auto links = html::get_links(rendered_html); + std::set seen(from_text.begin(), from_text.end()); + + for (auto &lk : links) { + if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) { + std::string m = lk.href.substr(7); + auto q = m.find('?'); + if (q != std::string::npos) + m = m.substr(0, q); + m = to_lower(m); + if (is_likely_email(m)) { + if (seen.insert(m).second) { + from_text.push_back(m); + } + } + } + } + + return from_text; +} + +// ── enrich_location ───────────────────────────────────────────────────────── + +EnrichedNode enrich_location(const LocationInput &loc, + const EnrichConfig &cfg) { + auto t0 = std::chrono::steady_clock::now(); + + EnrichedNode node; + node.title = loc.title; + node.place_id = loc.place_id; + 
node.website = loc.website; + node.address = loc.address; + node.type = loc.type; + node.grid_area = loc.grid_area; + node.grid_gid = loc.grid_gid; + node.status = EnrichStatus::NO_EMAIL; + + if (loc.website.empty()) { + node.status = EnrichStatus::FETCH_ERROR; + node.error = "no website"; + node.total_ms = elapsed_ms(t0); + return node; + } + + // ── Phase 1: Meta scrape ──────────────────────────────────────────────── + + auto meta_t0 = std::chrono::steady_clock::now(); + SiteMeta meta; + bool meta_timed_out = false; + + try { + if (cfg.meta_scraper == "SCRAPELESS" && !cfg.scrapeless_key.empty()) { + logger::debug("[meta:scrapeless] Fetching " + loc.website); + meta = scrape_meta_scrapeless(loc.website, cfg.scrapeless_key, + cfg.meta_timeout_ms); + } else { + logger::debug("[meta:http] Fetching " + loc.website); + meta = scrape_meta(loc.website, cfg.meta_timeout_ms); + } + } catch (...) { + meta.fetch_error = "exception during meta scrape"; + meta_timed_out = true; + } + node.meta_ms = elapsed_ms(meta_t0); + + // Check if meta took too long (within threshold of timeout) + if (node.meta_ms >= cfg.meta_timeout_ms - 1000) { + meta_timed_out = true; + } + + // logger::info("[" + std::string(loc.title.empty() ? loc.website : loc.title) + // + "] Meta fetch took " + std::to_string(node.meta_ms) + "ms. 
Links found: " + // + std::to_string(meta.internal_pages.size())); + + if (!meta.body_text.empty()) + node.pages["home"] = meta.body_text; + if (cfg.enable_homepage_md && !meta.body_html.empty()) { + // Cap HTML body at 512 KB to prevent stack overflow in recursive html2md + // parser + static constexpr size_t MAX_HTML_BYTES = 512 * 1024; + if (meta.body_html.size() > MAX_HTML_BYTES) { + logger::warn("[" + loc.title + "] body_html too large (" + + std::to_string(meta.body_html.size() / 1024) + + " KB), skipping markdown conversion"); + } else { + try { + node.sites[loc.website] = html::to_markdown(meta.body_html); + } catch (const std::exception &e) { + logger::warn("[" + loc.title + + "] html::to_markdown failed: " + e.what()); + } catch (...) { + logger::warn("[" + loc.title + + "] html::to_markdown crashed (unknown exception)"); + } + } + } + node.meta_pages = meta.internal_pages; + node.pages_found = static_cast(meta.internal_pages.size()); + node.socials = meta.socials; + + if (!meta.fetch_error.empty()) { + node.error = meta.fetch_error; + node.status = EnrichStatus::FETCH_ERROR; + node.total_ms = elapsed_ms(t0); + return node; + } + + // If meta already found emails, we're done (early exit like TS) + if (!meta.emails.empty()) { + node.emails = meta.emails; + node.status = EnrichStatus::OK; + node.total_ms = elapsed_ms(t0); + return node; + } + + // ── Build contact page list ───────────────────────────────────────────── + + std::vector contact_pages; + std::set seen_urls; + + for (auto &page_url : meta.internal_pages) { + if (matches_contact_pattern(page_url, cfg.contact_patterns)) { + if (seen_urls.insert(page_url).second) { + contact_pages.push_back(page_url); + } + } + } + + // No more probe paths. If we found 0 contact pages, we just give up or time + // out. + + node.pages_found = static_cast(contact_pages.size()); + + if (contact_pages.empty()) { + logger::debug("[" + + std::string(loc.title.empty() ? 
loc.website : loc.title) + + "] No contact pages found."); + node.status = + meta_timed_out ? EnrichStatus::META_TIMEOUT : EnrichStatus::NO_PAGES; + node.total_ms = elapsed_ms(t0); + return node; + } + + logger::debug("[" + std::string(loc.title.empty() ? loc.website : loc.title) + + "] Contact pages to scrape: " + + std::to_string(contact_pages.size()) + " (parallel)"); + + // ── Phase 2: Email scrape per contact page ────────────────────────────── + + struct AsyncResult { + std::string url; + std::vector errors; + std::vector emails; + int ms; + }; + + int pages_to_scrape = + std::min(static_cast(contact_pages.size()), cfg.email_max_pages); + + std::vector contact_threads; + std::vector contact_results(pages_to_scrape); + + auto email_t0 = std::chrono::steady_clock::now(); + + for (int i = 0; i < pages_to_scrape; ++i) { + auto page_url = contact_pages[i]; + + contact_threads.emplace_back([i, &contact_results, page_url, cfg, loc]() { + auto start = std::chrono::steady_clock::now(); + AsyncResult res; + res.url = page_url; + + PageError pe1; + pe1.url = page_url; + pe1.method = "GET"; + + int http_status = 0; + try { + // logger::debug("[email:http] Fetching " + page_url); + auto page_emails = scrape_emails_from_page( + page_url, cfg.email_page_timeout_ms, http_status); + pe1.emails = page_emails; + logger::debug("[" + + std::string(loc.title.empty() ? 
loc.website : loc.title) + + "] HTTP fetch finished code " + + std::to_string(http_status) + " for " + page_url); + + if (page_emails.empty()) { + if (http_status == 404 || http_status == 400 || http_status == 500) { + pe1.status = "NOT_FOUND"; + pe1.error = "HTTP " + std::to_string(http_status); + } else { + pe1.status = "AXIOS_NO_EMAIL"; + res.errors.push_back(pe1); // pushed before scrapeless + + if (cfg.meta_scraper == "SCRAPELESS" && + !cfg.scrapeless_key.empty()) { + PageError pe2; + pe2.url = page_url; + pe2.method = "SCRAPELESS"; + try { + logger::debug("[email:scrapeless] Fallback scraping " + + page_url); + auto s_emails = + scrape_emails_scrapeless(page_url, cfg.scrapeless_key, + cfg.email_page_timeout_ms + 5000); + pe2.emails = s_emails; + pe2.status = s_emails.empty() ? "FAILED" : "SEARCHED_EMAIL"; + if (!s_emails.empty()) + res.emails = s_emails; + logger::debug( + "[" + + std::string(loc.title.empty() ? loc.website : loc.title) + + "] Scrapeless fallback finished for " + page_url); + } catch (...) { + pe2.status = "FAILED"; + pe2.error = "scrapeless exception"; + } + res.errors.push_back(pe2); + } + res.ms = elapsed_ms(start); + contact_results[i] = res; + return; + } + } else { + pe1.status = "SEARCHED_EMAIL"; + res.emails = page_emails; + } + } catch (...) 
{ + pe1.status = "AXIOS_FAILED"; + pe1.error = "exception"; + } + // Only insert pe1 if we didn't already push it during fallback + if (res.errors.empty() || res.errors[0].method != "GET") { + res.errors.insert(res.errors.begin(), pe1); + } + res.ms = elapsed_ms(start); + contact_results[i] = res; + }); + } + + for (auto &t : contact_threads) { + if (t.joinable()) + t.join(); + } + + std::set all_emails; + int pages_scraped = 0; + + for (auto &res : contact_results) { + pages_scraped++; + for (auto &pe : res.errors) { + node.page_errors.push_back(std::move(pe)); + } + for (auto &e : res.emails) { + all_emails.insert(e); + } + } + + node.email_ms = elapsed_ms(email_t0); + node.pages_scraped = pages_scraped; + + // Merge emails + node.emails.assign(all_emails.begin(), all_emails.end()); + + // Final status + bool email_timed_out = node.email_ms >= cfg.email_timeout_ms - 1000; + if (!node.emails.empty()) { + node.status = EnrichStatus::OK; + } else if (email_timed_out) { + node.status = EnrichStatus::EMAIL_TIMEOUT; + } else if (meta_timed_out) { + node.status = EnrichStatus::META_TIMEOUT; + } else { + node.status = EnrichStatus::NO_EMAIL; + } + + node.total_ms = elapsed_ms(t0); + return node; +} + +} // namespace enrichers diff --git a/packages/gadm_reader/src/gadm_reader.cpp b/packages/gadm_reader/src/gadm_reader.cpp index 13cb488..7ea96fa 100644 --- a/packages/gadm_reader/src/gadm_reader.cpp +++ b/packages/gadm_reader/src/gadm_reader.cpp @@ -210,18 +210,18 @@ BoundaryResult load_boundary_file(const std::string& filepath) { BoundaryResult load_boundary(const std::string& gid, int targetLevel, const std::string& cacheDir) { - // Try: cacheDir/boundary_{gid}_{level}.json - std::string path = cacheDir + "/boundary_" + gid + "_" + std::to_string(targetLevel) + ".json"; + std::string cc = country_code(gid); + std::string filename = "boundary_" + gid + "_" + std::to_string(targetLevel) + ".json"; + + // Primary: cacheDir/{countryCode}/boundary_{gid}_{level}.json + 
std::string path = cacheDir + "/" + cc + "/" + filename; auto result = load_boundary_file(path); if (result.error.empty()) return result; - // Fallback: cacheDir/boundary_{countryCode}_{level}.json - std::string cc = country_code(gid); - if (cc != gid) { - path = cacheDir + "/boundary_" + cc + "_" + std::to_string(targetLevel) + ".json"; - result = load_boundary_file(path); - if (result.error.empty()) return result; - } + // Fallback (flat): cacheDir/boundary_{gid}_{level}.json + path = cacheDir + "/" + filename; + result = load_boundary_file(path); + if (result.error.empty()) return result; // Both failed result.error = "No boundary file found for gid=" + gid + " level=" + std::to_string(targetLevel) + " in " + cacheDir; diff --git a/packages/grid/include/grid/grid.h b/packages/grid/include/grid/grid.h index 80f2947..ca719bc 100644 --- a/packages/grid/include/grid/grid.h +++ b/packages/grid/include/grid/grid.h @@ -16,6 +16,8 @@ struct Waypoint { double lng = 0; double lat = 0; double radius_km = 0; + std::string area_gid; + std::string area_name; }; struct GridOptions { diff --git a/packages/grid/src/grid.cpp b/packages/grid/src/grid.cpp index 1d3f99d..3a1ed4b 100644 --- a/packages/grid/src/grid.cpp +++ b/packages/grid/src/grid.cpp @@ -168,7 +168,9 @@ static GridResult generate_admin(const std::vector& features, static_cast(res.waypoints.size() + 1), std::round(center.lon * 1e6) / 1e6, std::round(center.lat * 1e6) / 1e6, - std::round(radiusKm * 100.0) / 100.0 + std::round(radiusKm * 100.0) / 100.0, + f.gid, + f.name }); res.validCells++; } else { @@ -248,7 +250,9 @@ static GridResult generate_centers(const std::vector& features, static_cast(res.waypoints.size() + 1), std::round(pt.lon * 1e6) / 1e6, std::round(pt.lat * 1e6) / 1e6, - std::round((opts.cellSize / 2.0) * 100.0) / 100.0 + std::round((opts.cellSize / 2.0) * 100.0) / 100.0, + f.gid, + f.name }); res.validCells++; } else { @@ -322,7 +326,9 @@ static GridResult generate_polygon_grid(const std::vector& 
featur static_cast(res.waypoints.size() + 1), std::round(gc.lon * 1e6) / 1e6, std::round(gc.lat * 1e6) / 1e6, - std::round(cellRadiusKm * 100.0) / 100.0 + std::round(cellRadiusKm * 100.0) / 100.0, + regionFeat.gid, + regionFeat.name }); res.validCells++; } else { @@ -356,9 +362,21 @@ GridResult generate(const std::vector& features, // Sort waypoints if (result.waypoints.size() > 1) { - if (opts.groupByRegion && features.size() > 1 && opts.gridMode != "admin" && opts.gridMode != "centers") { - // Group by region index could be added, but for now sort all together - sort_waypoints(result.waypoints, opts.pathOrder, opts.cellSize); + if (opts.groupByRegion && features.size() > 1) { + std::stable_sort(result.waypoints.begin(), result.waypoints.end(), + [](const Waypoint& a, const Waypoint& b) { return a.area_gid < b.area_gid; }); + + auto start = result.waypoints.begin(); + while (start != result.waypoints.end()) { + auto end = start; + while (end != result.waypoints.end() && end->area_gid == start->area_gid) { + ++end; + } + std::vector group(start, end); + sort_waypoints(group, opts.pathOrder, opts.cellSize); + std::copy(group.begin(), group.end(), start); + start = end; + } } else { sort_waypoints(result.waypoints, opts.pathOrder, opts.cellSize); } diff --git a/packages/html/CMakeLists.txt b/packages/html/CMakeLists.txt index e592b29..f10d8ce 100644 --- a/packages/html/CMakeLists.txt +++ b/packages/html/CMakeLists.txt @@ -1,26 +1,33 @@ -include(FetchContent) - -FetchContent_Declare( - lexbor - GIT_REPOSITORY https://github.com/lexbor/lexbor.git - GIT_TAG v2.4.0 - GIT_SHALLOW TRUE -) - -# Build lexbor as static -set(LEXBOR_BUILD_SHARED OFF CACHE BOOL "" FORCE) -set(LEXBOR_BUILD_STATIC ON CACHE BOOL "" FORCE) - -FetchContent_MakeAvailable(lexbor) - -add_library(html STATIC - src/html.cpp -) - -target_include_directories(html - PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include -) - -target_link_libraries(html - PUBLIC lexbor_static +include(FetchContent) + 
+FetchContent_Declare( + lexbor + GIT_REPOSITORY https://github.com/lexbor/lexbor.git + GIT_TAG v2.4.0 + GIT_SHALLOW TRUE +) + +# Build lexbor as static +set(LEXBOR_BUILD_SHARED OFF CACHE BOOL "" FORCE) +set(LEXBOR_BUILD_STATIC ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(lexbor) + +add_library(html STATIC + src/html.cpp + src/html2md.cpp + src/table.cpp +) + +# MSVC: treat source and execution charset as UTF-8 +# (fixes \u200b zero-width-space mismatch in html2md tests) +if(MSVC) + target_compile_options(html PRIVATE /utf-8) +endif() + +target_include_directories(html + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +target_link_libraries(html + PUBLIC lexbor_static ) diff --git a/packages/html/include/html/html.h b/packages/html/include/html/html.h index 684180e..5159893 100644 --- a/packages/html/include/html/html.h +++ b/packages/html/include/html/html.h @@ -11,6 +11,13 @@ struct Element { std::string text; }; +/// Link with href and optional attributes. +struct Link { + std::string href; + std::string rel; // e.g. "canonical", "stylesheet" + std::string text; // anchor text (for tags) +}; + /// Parse an HTML string and return all elements with their text content. std::vector parse(const std::string &html_str); @@ -18,4 +25,31 @@ std::vector parse(const std::string &html_str); std::vector select(const std::string &html_str, const std::string &selector); +// ── Enricher extraction helpers ───────────────────────────────────────────── + +/// Extract the text. +std::string get_title(const std::string &html_str); + +/// Extract a <meta name="X"> or <meta property="X"> content attribute. +std::string get_meta(const std::string &html_str, const std::string &name); + +/// Extract <link rel="canonical"> href. +std::string get_canonical(const std::string &html_str); + +/// Extract all <a href="..."> values (resolved links as-is from the HTML). 
+std::vector<Link> get_links(const std::string &html_str); + +/// Extract visible body text, stripping script/style/noscript/svg/iframe. +std::string get_body_text(const std::string &html_str); + +/// Extract raw JSON strings from <script type="application/ld+json">. +std::vector<std::string> get_json_ld(const std::string &html_str); + +/// Extract an attribute value from the first element matching a CSS selector. +std::string get_attr(const std::string &html_str, const std::string &selector, + const std::string &attr_name); + +/// Convert HTML content to Markdown. +std::string to_markdown(const std::string &html_str); + } // namespace html diff --git a/packages/html/include/html/html2md.h b/packages/html/include/html/html2md.h new file mode 100644 index 0000000..0c73ca4 --- /dev/null +++ b/packages/html/include/html/html2md.h @@ -0,0 +1,690 @@ +// Copyright (c) Tim Gromeyer +// Licensed under the MIT License - https://opensource.org/licenses/MIT + +#ifndef HTML2MD_H +#define HTML2MD_H + +#include <memory> +#include <string> +#include <unordered_map> +#include <cstdint> + +/*! + * \brief html2md namespace + * + * The html2md namespace provides: + * 1. The Converter class + * 2. Static wrapper around Converter class + * + * \note Do NOT try to convert HTML that contains a list in an ordered list or a + * `blockquote` in a list!\n This will be a **total** mess! + */ +namespace html2md { + +/*! + * \brief Options for the conversion from HTML to Markdown + * \warning Make sure to pass valid options; otherwise, the output will be + * invalid! + * + * Example from `tests/main.cpp`: + * + * ```cpp + * auto *options = new html2md::Options(); + * options->splitLines = false; + * + * html2md::Converter c(html, options); + * auto md = c.convert(); + * ``` + */ +struct Options { + /*! + * \brief Add new line when a certain number of characters is reached + * + * \see softBreak + * \see hardBreak + */ + bool splitLines = true; + + /*! + * \brief softBreak Wrap after ... 
 characters when the next space is reached + and as long as it's not in a list, table, image or anchor (link). + */ + int softBreak = 80; + + /*! + * \brief hardBreak Force a break after ... characters in a line + */ + int hardBreak = 100; + + /*! + * \brief The char used for unordered lists + * + * Valid: + * - `-` + * - `+` + * - `*` + * + * Example: + * + * ```markdown + * - List + * + Also a list + * * And this too + * ``` + */ + char unorderedList = '-'; + + /*! + * \brief The char used after the number of the item + * + * Valid: + * - `.` + * - `)` + * + * Example: + * + * ```markdown + * 1. Hello + * 2) World! + * ``` + */ + char orderedList = '.'; + + /*! + * \brief Whether title is added as h1 heading at the very beginning of the + * markdown + * + * Whether title is added as h1 heading at the very beginning of the markdown. + * Default is true. + */ + bool includeTitle = true; + + /*! + * \brief Whether to format Markdown Tables + * + * Whether to format Markdown Tables. + * Default is true. + */ + bool formatTable = true; + + /*! + * \brief Whether to force left trim of lines in the final Markdown output + * + * Whether to force left trim of lines in the final Markdown output. + * Default is false. + */ + bool forceLeftTrim = false; + + /*! + * \brief Whether to compress whitespace (tabs, multiple spaces) into a single + * space + * + * Whether to compress whitespace (tabs, multiple spaces) into a single space. + * Default is false. + */ + bool compressWhitespace = false; + + /*! + * \brief Whether to escape numbered lists (e.g. "4." -> "4\.") to prevent them + * from being interpreted as lists in Markdown. + * + * Whether to escape numbered lists. + * Default is true. + */ + bool escapeNumberedList = true; + + /*! + * \brief Whether to keep HTML entities (e.g. ` `) in the output + * + * If true, the converter will not replace HTML entities configured in the + * internal conversion map. Default is false (current behaviour). 
+ */ + bool keepHtmlEntities = false; + + inline bool operator==(html2md::Options o) const { + return splitLines == o.splitLines && unorderedList == o.unorderedList && + orderedList == o.orderedList && includeTitle == o.includeTitle && + softBreak == o.softBreak && hardBreak == o.hardBreak && + formatTable == o.formatTable && forceLeftTrim == o.forceLeftTrim && + compressWhitespace == o.compressWhitespace && + escapeNumberedList == o.escapeNumberedList && + keepHtmlEntities == o.keepHtmlEntities; + }; +}; + +/*! + * \brief Class for converting HTML to Markdown + * + * This class converts HTML to Markdown. + * There is also a static wrapper for this class (see html2md::Convert). + * + * ## Usage example + * + * Option 1: Use the class: + * + * ```cpp + * std::string html = "<h1>example</h1>"; + * html2md::Converter c(html); + * auto md = c.convert(); + * + * if (!c.ok()) std::cout << "There was something wrong in the HTML\n"; + * std::cout << md; // # example + * ``` + * + * Option 2: Use the static wrapper: + * + * ```cpp + * std::string html = "<h1>example</h1>"; + * + * auto md = html2md::Convert(html); + * std::cout << md; + * ``` + * + * Advanced: use Options: + * + * ```cpp + * std::string html = "<h1>example</h1>"; + * + * auto *options = new html2md::Options(); + * options->splitLines = false; + * options->unorderedList = '*'; + * + * html2md::Converter c(html, options); + * auto md = c.convert(); + * if (!c.ok()) std::cout << "There was something wrong in the HTML\n"; + * std::cout << md; // # example + * ``` + */ +class Converter { +public: + /*! + * \brief Standard initializer, takes HTML as parameter. Also prepares + * everything. \param html The HTML as std::string. \param options Options for + * the Conversation. See html2md::Options() for more. + * + * \note Don't pass anything else than HTML, otherwise the output will be a + * **mess**! 
+ * + * This is the default initializer.<br> + * You can use appendToMd() to append something to the beginning of the + * generated output. + */ + explicit inline Converter(const std::string &html, + struct Options *options = nullptr) { + *this = Converter(&html, options); + } + + /*! + * \brief Convert HTML into Markdown. + * \return Returns the converted Markdown. + * + * This function actually converts the HTML into Markdown. + * It also cleans up the Markdown so you don't have to do anything. + */ + [[nodiscard]] std::string convert(); + + /*! + * \brief Append a char to the Markdown. + * \param ch The char to append. + * \return Returns a copy of the instance with the char appended. + */ + Converter *appendToMd(char ch); + + /*! + * \brief Append a char* to the Markdown. + * \param str The char* to append. + * \return Returns a copy of the instance with the char* appended. + */ + Converter *appendToMd(const char *str); + + /*! + * \brief Append a string to the Markdown. + * \param s The string to append. + * \return Returns a copy of the instance with the string appended. + */ + inline Converter *appendToMd(const std::string &s) { + return appendToMd(s.c_str()); + } + + /*! + * \brief Appends a ' ' in certain cases. + * \return Copy of the instance with(maybe) the appended space. + * + * This function appends ' ' if: + * - md does not end with `*` + * - md does not end with `\n` aka newline + */ + Converter *appendBlank(); + + /*! + * \brief Add an HTML symbol conversion + * \param htmlSymbol The HTML symbol to convert + * \param replacement The replacement string + * \note This is useful for converting HTML entities to their Markdown + * equivalents. For example, you can add a conversion for " " to + * " " (space) or "<" to "<" (less than). + * \note This is not a standard feature of the Converter class, but it can + * be added to the class to allow for more flexibility in the conversion + * process. 
You can use this feature to add custom conversions for any HTML + symbol that you want to convert to a specific Markdown representation. + */ + void addHtmlSymbolConversion(const std::string &htmlSymbol, + const std::string &replacement) { + htmlSymbolConversions_[htmlSymbol] = replacement; + } + + /*! + * \brief Remove an HTML symbol conversion + * \param htmlSymbol The HTML symbol to remove + * \note This is useful for removing custom conversions that you have added + * previously. + */ + void removeHtmlSymbolConversion(const std::string &htmlSymbol) { + htmlSymbolConversions_.erase(htmlSymbol); + } + + /*! + * \brief Clear all HTML symbol conversions + * \note This is useful for clearing the conversion map (it's empty afterwards). + */ + void clearHtmlSymbolConversions() { htmlSymbolConversions_.clear(); } + + /*! + * \brief Checks if everything was closed properly (in the HTML). + * \return Returns false if there is an unclosed tag. + * \note As long as you have not called convert(), it always returns true. + */ + [[nodiscard]] bool ok() const; + + /*! + * \brief Reset the generated Markdown + */ + void reset(); + + /*! + * \brief Checks if the HTML matches and the options are the same. + * \param c The Converter object to compare with + * \return true if the HTML and options match, otherwise false + */ + inline bool operator==(const Converter *c) const { return *this == *c; } + + inline bool operator==(const Converter &c) const { + return html_ == c.html_ && option == c.option; + } + + /*! + * \brief Returns ok(). 
+ */ + inline explicit operator bool() const { return ok(); }; + +private: + // Attributes + static constexpr const char *kAttributeHref = "href"; + static constexpr const char *kAttributeAlt = "alt"; + static constexpr const char *kAttributeTitle = "title"; + static constexpr const char *kAttributeClass = "class"; + static constexpr const char *kAttributeSrc = "src"; + static constexpr const char *kAttrinuteAlign = "align"; + + static constexpr const char *kTagAnchor = "a"; + static constexpr const char *kTagBreak = "br"; + static constexpr const char *kTagCode = "code"; + static constexpr const char *kTagDiv = "div"; + static constexpr const char *kTagHead = "head"; + static constexpr const char *kTagLink = "link"; + static constexpr const char *kTagListItem = "li"; + static constexpr const char *kTagMeta = "meta"; + static constexpr const char *kTagNav = "nav"; + static constexpr const char *kTagNoScript = "noscript"; + static constexpr const char *kTagOption = "option"; + static constexpr const char *kTagOrderedList = "ol"; + static constexpr const char *kTagParagraph = "p"; + static constexpr const char *kTagPre = "pre"; + static constexpr const char *kTagScript = "script"; + static constexpr const char *kTagSpan = "span"; + static constexpr const char *kTagStyle = "style"; + static constexpr const char *kTagTemplate = "template"; + static constexpr const char *kTagTitle = "title"; + static constexpr const char *kTagUnorderedList = "ul"; + static constexpr const char *kTagImg = "img"; + static constexpr const char *kTagSeperator = "hr"; + + // Text format + static constexpr const char *kTagBold = "b"; + static constexpr const char *kTagStrong = "strong"; + static constexpr const char *kTagItalic = "em"; + static constexpr const char *kTagItalic2 = "i"; + static constexpr const char *kTagCitation = "cite"; + static constexpr const char *kTagDefinition = "dfn"; + static constexpr const char *kTagUnderline = "u"; + static constexpr const char *kTagStrighthrought 
= "del"; + static constexpr const char *kTagStrighthrought2 = "s"; + + static constexpr const char *kTagBlockquote = "blockquote"; + + // Header + static constexpr const char *kTagHeader1 = "h1"; + static constexpr const char *kTagHeader2 = "h2"; + static constexpr const char *kTagHeader3 = "h3"; + static constexpr const char *kTagHeader4 = "h4"; + static constexpr const char *kTagHeader5 = "h5"; + static constexpr const char *kTagHeader6 = "h6"; + + // Table + static constexpr const char *kTagTable = "table"; + static constexpr const char *kTagTableRow = "tr"; + static constexpr const char *kTagTableHeader = "th"; + static constexpr const char *kTagTableData = "td"; + + size_t index_ch_in_html_ = 0; + + bool is_closing_tag_ = false; + bool is_in_attribute_value_ = false; + bool is_in_code_ = false; + bool is_in_list_ = false; + bool is_in_p_ = false; + bool is_in_pre_ = false; + bool is_in_table_ = false; + bool is_in_table_row_ = false; + bool is_in_tag_ = false; + bool is_self_closing_tag_ = false; + bool skipping_leading_whitespace_ = true; + + // relevant for <li> only, false = is in unordered list + bool is_in_ordered_list_ = false; + uint8_t index_ol = 0; + + // store the table start + size_t table_start = 0; + + // number of lists + uint8_t index_li = 0; + + uint8_t index_blockquote = 0; + + char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0; + char prev_ch_in_html_ = 'x'; + + std::string html_; + + uint16_t offset_lt_ = 0; + std::string current_tag_; + std::string prev_tag_; + + // Line which separates header from data + std::string tableLine; + + size_t chars_in_curr_line_ = 0; + + std::string md_; + + Options option; + + std::unordered_map<std::string, std::string> htmlSymbolConversions_ = { + {"&quot;", "\""}, {"&lt;", "<"}, {"&gt;", ">"}, + {"&amp;", "&"}, {"&nbsp;", " "}, {"&rarr;", "→"}}; + + // Tag: base class for tag types + struct Tag { + virtual void OnHasLeftOpeningTag(Converter *c) = 0; + virtual void OnHasLeftClosingTag(Converter *c) = 0; + }; + + // Tag types + + // 
tags that are not printed (nav, script, noscript, ...) + struct TagIgnored : Tag { + void OnHasLeftOpeningTag(Converter *c) override {}; + void OnHasLeftClosingTag(Converter *c) override {}; + }; + + struct TagAnchor : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + + std::string current_href_; + std::string current_title_; + }; + + struct TagBold : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagItalic : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagUnderline : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagStrikethrought : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagBreak : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagDiv : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagHeader1 : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagHeader2 : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagHeader3 : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagHeader4 : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagHeader5 : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagHeader6 : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void 
OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagListItem : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagOption : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagOrderedList : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagParagraph : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagPre : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagCode : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagSpan : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagTitle : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagUnorderedList : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagImage : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagSeperator : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagTable : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagTableRow : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagTableHeader : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagTableData : Tag 
{ + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + struct TagBlockquote : Tag { + void OnHasLeftOpeningTag(Converter *c) override; + void OnHasLeftClosingTag(Converter *c) override; + }; + + std::unordered_map<std::string, std::shared_ptr<Tag>> tags_; + + explicit Converter(const std::string *html, struct Options *options); + + void CleanUpMarkdown(); + + // Trim from start (in place) + static void LTrim(std::string *s); + + // Trim from end (in place) + Converter *RTrim(std::string *s, bool trim_only_blank = false); + + // Trim from both ends (in place) + Converter *Trim(std::string *s); + + // 1. trim all lines + // 2. reduce consecutive newlines to maximum 3 + void TidyAllLines(std::string *str); + + std::string ExtractAttributeFromTagLeftOf(const std::string &attr); + + void TurnLineIntoHeader1(); + + void TurnLineIntoHeader2(); + + // Current char: '<' + void OnHasEnteredTag(); + + Converter *UpdatePrevChFromMd(); + + /** + * Handle next char within <...> tag + * + * @param ch current character + * @return continue surrounding iteration? + */ + bool ParseCharInTag(char ch); + + // Current char: '>' + bool OnHasLeftTag(); + + inline static bool TagContainsAttributesToHide(std::string *tag) { + using std::string; + + return (*tag).find(" aria=\"hidden\"") != string::npos || + (*tag).find("display:none") != string::npos || + (*tag).find("visibility:hidden") != string::npos || + (*tag).find("opacity:0") != string::npos || + (*tag).find("Details-content--hidden-not-important") != string::npos; + } + + Converter *ShortenMarkdown(size_t chars = 1); + inline bool shortIfPrevCh(char prev) { + if (prev_ch_in_md_ == prev) { + ShortenMarkdown(); + return true; + } + return false; + }; + + /** + * @param ch + * @return continue iteration surrounding this method's invocation? 
+ */ + bool ParseCharInTagContent(char ch); + + // Replace previous space (if any) in current markdown line by newline + bool ReplacePreviousSpaceInLineByNewline(); + + static inline bool IsIgnoredTag(const std::string &tag) { + return (tag[0] == '-' || kTagTemplate == tag || kTagStyle == tag || + kTagScript == tag || kTagNoScript == tag || kTagNav == tag); + + // meta: not ignored to tolerate if closing is omitted + } + + [[nodiscard]] bool IsInIgnoredTag() const; +}; // Converter + +/*! + * \brief Static wrapper around the Converter class + * \param html The HTML passed to Converter + * \param ok Optional: Pass a reference to a local bool to store the output of + * Converter::ok() \return Returns the by Converter generated Markdown + */ +inline std::string Convert(const std::string &html, bool *ok = nullptr) { + Converter c(html); + auto md = c.convert(); + if (ok != nullptr) + *ok = c.ok(); + return md; +} + +#ifndef PYTHON_BINDINGS +inline std::string Convert(const std::string &&html, bool *ok = nullptr) { + return Convert(html, ok); +} +#endif + +} // namespace html2md + +#endif // HTML2MD_H diff --git a/packages/html/include/html/table.h b/packages/html/include/html/table.h new file mode 100644 index 0000000..9cf3c4b --- /dev/null +++ b/packages/html/include/html/table.h @@ -0,0 +1,11 @@ +// Copyright (c) Tim Gromeyer +// Licensed under the MIT License - https://opensource.org/licenses/MIT + +#ifndef TABLE_H +#define TABLE_H + +#include <string> + +[[nodiscard]] std::string formatMarkdownTable(const std::string &inputTable); + +#endif // TABLE_H diff --git a/packages/html/readme.md b/packages/html/readme.md new file mode 100644 index 0000000..c1b6518 --- /dev/null +++ b/packages/html/readme.md @@ -0,0 +1,101 @@ +# Scraper Request + +## OpenAPI Specification + +```yaml +openapi: 3.0.1 +info: + title: '' + description: '' + version: 1.0.0 +paths: + /api/v1/scraper/request: + post: + summary: Scraper Request + deprecated: false + description: '' + tags: + - 
Scraping API + parameters: [] + requestBody: + content: + application/json: + schema: + type: object + properties: + actor: + type: string + input: + type: object + properties: + url: + type: string + required: + - url + x-apidog-orders: + - url + proxy: + type: object + properties: + country: + type: string + required: + - country + x-apidog-orders: + - country + async: + type: boolean + description: |- + If true, the task will be executed asynchronously. + If false, the task will be executed synchronously. + required: + - actor + - input + - proxy + x-apidog-orders: + - actor + - input + - proxy + - async + example: + actor: scraper.xxx + input: + url: >- + https://www.***.com/shop/us/products/stmicroelectronics/tda7265a-3074457345625542393/ + proxy: + country: US + async: false + responses: + '200': + description: '' + content: + application/json: + schema: + type: object + properties: {} + x-apidog-orders: [] + headers: {} + x-apidog-name: Success + security: + - apikey-header-x-api-token: [] + x-apidog-folder: Scraping API + x-apidog-status: released + x-run-in-apidog: https://app.apidog.com/web/project/745098/apis/api-11949852-run +components: + schemas: {} + securitySchemes: + bearer: + type: bearer + scheme: bearer + description: Bearer token authentication using your Scrapeless API key + apikey-header-x-api-token: + type: apiKey + in: header + name: x-api-token +servers: + - url: https://api.scrapeless.com + description: Prod Env +security: + - apikey-header-x-api-token: [] + +``` \ No newline at end of file diff --git a/packages/html/src/html.cpp b/packages/html/src/html.cpp index 0883059..a13ff89 100644 --- a/packages/html/src/html.cpp +++ b/packages/html/src/html.cpp @@ -3,6 +3,10 @@ #include <lexbor/css/css.h> #include <lexbor/html/html.h> #include <lexbor/selectors/selectors.h> +#include <html/html2md.h> + +#include <algorithm> +#include <cstring> namespace html { @@ -26,6 +30,35 @@ static std::string tag_name(lxb_dom_element_t *el) { return 
std::string(reinterpret_cast<const char *>(name), len); } +static std::string get_element_attr(lxb_dom_element_t *el, const char *attr) { + size_t len = 0; + const lxb_char_t *val = lxb_dom_element_get_attribute( + el, reinterpret_cast<const lxb_char_t *>(attr), strlen(attr), &len); + if (!val) + return {}; + return std::string(reinterpret_cast<const char *>(val), len); +} + +static lxb_html_document_t *parse_doc(const std::string &html_str) { + auto *doc = lxb_html_document_create(); + if (!doc) return nullptr; + auto status = lxb_html_document_parse( + doc, reinterpret_cast<const lxb_char_t *>(html_str.c_str()), + html_str.size()); + if (status != LXB_STATUS_OK) { + lxb_html_document_destroy(doc); + return nullptr; + } + return doc; +} + +// ── Helper: check if a tag name matches a noise element ───────────────────── + +static bool is_noise_tag(const std::string &name) { + return name == "script" || name == "style" || name == "noscript" || + name == "svg" || name == "iframe"; +} + // ── walk tree recursively ─────────────────────────────────────────────────── static void walk(lxb_dom_node_t *node, std::vector<Element> &out) { @@ -45,22 +78,125 @@ static void walk(lxb_dom_node_t *node, std::vector<Element> &out) { } } +// ── Walk for visible text only (skip noise tags) ──────────────────────────── + +static void walk_text(lxb_dom_node_t *node, std::string &out) { + if (!node) return; + + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + auto *el = lxb_dom_interface_element(node); + auto name = tag_name(el); + if (is_noise_tag(name)) return; // Skip noise subtrees entirely + } + + if (node->type == LXB_DOM_NODE_TYPE_TEXT) { + size_t len = 0; + const lxb_char_t *data = lxb_dom_node_text_content(node, &len); + if (data && len > 0) { + std::string chunk(reinterpret_cast<const char *>(data), len); + // Collapse whitespace + bool needSpace = !out.empty() && out.back() != ' ' && out.back() != '\n'; + // Trim leading/trailing whitespace from chunk + size_t start = 
chunk.find_first_not_of(" \t\n\r"); + size_t end = chunk.find_last_not_of(" \t\n\r"); + if (start != std::string::npos) { + if (needSpace) out += ' '; + out += chunk.substr(start, end - start + 1); + } + } + } + + auto *child = node->first_child; + while (child) { + walk_text(child, out); + child = child->next; + } +} + +// ── Walk <head> for meta/title/link ───────────────────────────────────────── + +struct HeadData { + std::string title; + std::string canonical; + std::vector<std::pair<std::string, std::string>> metas; // name/property → content + std::vector<std::string> json_ld; +}; + +static void walk_head(lxb_dom_node_t *node, HeadData &data) { + if (!node) return; + + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + auto *el = lxb_dom_interface_element(node); + auto name = tag_name(el); + + if (name == "title") { + data.title = node_text(node); + } else if (name == "meta") { + auto nameAttr = get_element_attr(el, "name"); + auto propAttr = get_element_attr(el, "property"); + auto content = get_element_attr(el, "content"); + if (!content.empty()) { + if (!nameAttr.empty()) data.metas.emplace_back(nameAttr, content); + if (!propAttr.empty()) data.metas.emplace_back(propAttr, content); + } + } else if (name == "link") { + auto rel = get_element_attr(el, "rel"); + if (rel == "canonical") { + data.canonical = get_element_attr(el, "href"); + } + } else if (name == "script") { + auto type = get_element_attr(el, "type"); + if (type == "application/ld+json") { + auto text = node_text(node); + if (!text.empty()) data.json_ld.push_back(text); + } + } + } + + auto *child = node->first_child; + while (child) { + walk_head(child, data); + child = child->next; + } +} + +// ── Walk <body> for <a> links ─────────────────────────────────────────────── + +static void walk_links(lxb_dom_node_t *node, std::vector<Link> &out) { + if (!node) return; + + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + auto *el = lxb_dom_interface_element(node); + auto name = tag_name(el); + + if 
(name == "a") { + auto href = get_element_attr(el, "href"); + if (!href.empty()) { + Link lk; + lk.href = href; + lk.rel = get_element_attr(el, "rel"); + lk.text = node_text(node); + out.push_back(std::move(lk)); + } + } + } + + auto *child = node->first_child; + while (child) { + walk_links(child, out); + child = child->next; + } +} + // ── public API ────────────────────────────────────────────────────────────── std::vector<Element> parse(const std::string &html_str) { - auto *doc = lxb_html_document_create(); - if (!doc) - return {}; - - auto status = lxb_html_document_parse( - doc, reinterpret_cast<const lxb_char_t *>(html_str.c_str()), - html_str.size()); + auto *doc = parse_doc(html_str); + if (!doc) return {}; std::vector<Element> result; - if (status == LXB_STATUS_OK) { - auto *body = lxb_dom_interface_node(lxb_html_document_body_element(doc)); - walk(body, result); - } + auto *body = lxb_dom_interface_node(lxb_html_document_body_element(doc)); + walk(body, result); lxb_html_document_destroy(doc); return result; @@ -87,20 +223,9 @@ std::vector<std::string> select(const std::string &html_str, const std::string &selector) { std::vector<std::string> result; - // Parse document - auto *doc = lxb_html_document_create(); - if (!doc) - return result; + auto *doc = parse_doc(html_str); + if (!doc) return result; - auto status = lxb_html_document_parse( - doc, reinterpret_cast<const lxb_char_t *>(html_str.c_str()), - html_str.size()); - if (status != LXB_STATUS_OK) { - lxb_html_document_destroy(doc); - return result; - } - - // Set up CSS parser + selectors engine auto *css_parser = lxb_css_parser_create(); lxb_css_parser_init(css_parser, nullptr); @@ -126,4 +251,153 @@ std::vector<std::string> select(const std::string &html_str, return result; } +// ── Enricher extraction helpers ───────────────────────────────────────────── + +std::string get_title(const std::string &html_str) { + auto *doc = parse_doc(html_str); + if (!doc) return {}; + + HeadData data; + auto 
*head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
  walk_head(head, data);

  lxb_html_document_destroy(doc);
  return data.title;
}

// Return the content= of the <meta> whose name= or property= equals `name`,
// or "" when the document fails to parse or no such meta exists.
// walk_head records both name= and property= under the same list, so
// OpenGraph-style property keys resolve through the same lookup.
std::string get_meta(const std::string &html_str, const std::string &name) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  HeadData data;
  auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
  walk_head(head, data);

  // Destroy before the lookup: data.metas owns copies, not lexbor memory.
  lxb_html_document_destroy(doc);

  for (auto &[key, val] : data.metas) {
    if (key == name) return val;
  }
  return {};
}

// Return the href of <link rel="canonical"> from <head>, or "" if absent
// or the document fails to parse.
std::string get_canonical(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  HeadData data;
  auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
  walk_head(head, data);

  lxb_html_document_destroy(doc);
  return data.canonical;
}

// Collect every <a> in <body> that has a non-empty href, preserving
// document order (see walk_links). Returns {} on parse failure.
std::vector<Link> get_links(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  std::vector<Link> links;
  auto *body = lxb_dom_interface_node(lxb_html_document_body_element(doc));
  walk_links(body, links);

  lxb_html_document_destroy(doc);
  return links;
}

// Extract visible text from <body>, skipping noise subtrees (script/style/
// noscript/svg/iframe) and collapsing whitespace — see walk_text.
// Returns "" on parse failure.
std::string get_body_text(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  std::string text;
  auto *body = lxb_dom_interface_node(lxb_html_document_body_element(doc));
  walk_text(body, text);

  lxb_html_document_destroy(doc);
  return text;
}

// Collect the raw text of every <script type="application/ld+json">.
// Returns {} on parse failure; entries are unparsed JSON strings.
std::vector<std::string> get_json_ld(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  HeadData data;
  // JSON-LD can be in head or body — walk entire document
  auto *root = lxb_dom_interface_node(
      lxb_dom_document_element(&doc->dom_document));
  walk_head(root, data);

  lxb_html_document_destroy(doc);
  return data.json_ld;
}

// ── get_attr via CSS selector ─────────────────────────────────────────────

// Context threaded through lxb_selectors_find() into attr_cb (below).
struct AttrCtx {
  std::string attr_name; // attribute to read from matched elements
  std::string result;    // first non-empty value captured
  bool found;            // set once result is captured
};
+static lxb_status_t attr_cb(lxb_dom_node_t *node, + lxb_css_selector_specificity_t spec, void *ctx) { + (void)spec; + auto *actx = static_cast<AttrCtx *>(ctx); + if (actx->found) return LXB_STATUS_OK; + + if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) { + auto *el = lxb_dom_interface_element(node); + auto val = get_element_attr(el, actx->attr_name.c_str()); + if (!val.empty()) { + actx->result = val; + actx->found = true; + } + } + return LXB_STATUS_OK; +} + +std::string get_attr(const std::string &html_str, const std::string &selector, + const std::string &attr_name) { + auto *doc = parse_doc(html_str); + if (!doc) return {}; + + auto *css_parser = lxb_css_parser_create(); + lxb_css_parser_init(css_parser, nullptr); + + auto *selectors = lxb_selectors_create(); + lxb_selectors_init(selectors); + + auto *list = lxb_css_selectors_parse( + css_parser, reinterpret_cast<const lxb_char_t *>(selector.c_str()), + selector.size()); + + std::string result; + if (list) { + AttrCtx ctx{attr_name, {}, false}; + auto *root = lxb_dom_interface_node( + lxb_dom_document_element(&doc->dom_document)); + lxb_selectors_find(selectors, root, list, attr_cb, &ctx); + result = ctx.result; + lxb_css_selector_list_destroy_memory(list); + } + + lxb_selectors_destroy(selectors, true); + lxb_css_parser_destroy(css_parser, true); + lxb_html_document_destroy(doc); + + return result; +} + +std::string to_markdown(const std::string &html_str) { + // Defense-in-depth: hard cap at 2 MB even if the caller forgets. + // The enricher pipeline already caps at 512 KB, but future callers + // may not — prevent OOM / multi-second hangs from html2md. 
+ static constexpr size_t MAX_HTML2MD_INPUT = 2 * 1024 * 1024; + if (html_str.size() > MAX_HTML2MD_INPUT) { + return "*[Content truncated: HTML too large for markdown conversion (" + + std::to_string(html_str.size() / 1024) + " KB)]*\n"; + } + return html2md::Convert(html_str); +} + } // namespace html diff --git a/packages/html/src/html2md.cpp b/packages/html/src/html2md.cpp new file mode 100644 index 0000000..3389313 --- /dev/null +++ b/packages/html/src/html2md.cpp @@ -0,0 +1,1195 @@ +// Copyright (c) Tim Gromeyer +// Licensed under the MIT License - https://opensource.org/licenses/MIT + +#include "html/html2md.h" +#include "html/table.h" + +#include <algorithm> +#include <cctype> +#include <cstring> +#include <memory> +#include <sstream> +#include <vector> + +using std::make_shared; +using std::string; +using std::vector; + +namespace { +bool startsWith(const string &str, const string &prefix) { + return str.size() >= prefix.size() && + 0 == str.compare(0, prefix.size(), prefix); +} + +bool endsWith(const string &str, const string &suffix) { + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +size_t ReplaceAll(string *haystack, const string &needle, + const string &replacement) { + // Get first occurrence + size_t pos = (*haystack).find(needle); + + size_t amount_replaced = 0; + + // Repeat until end is reached + while (pos != string::npos) { + // Replace this occurrence of sub string + (*haystack).replace(pos, needle.size(), replacement); + + // Get the next occurrence from the current position + pos = (*haystack).find(needle, pos + replacement.size()); + + ++amount_replaced; + } + + return amount_replaced; +} + +size_t ReplaceAll(string *haystack, const string &needle, const char c) { + return ReplaceAll(haystack, needle, string({c})); +} + +// Split given string by given character delimiter into vector of strings +vector<string> Split(string const &str, char delimiter) { + vector<string> 
result; + std::stringstream iss(str); + + for (string token; getline(iss, token, delimiter);) + result.push_back(token); + + return result; +} + +string Repeat(const string &str, size_t amount) { + if (amount == 0) + return ""; + else if (amount == 1) + return str; + + // Optimize for single-character strings (common case for blockquotes, etc.) + if (str.size() == 1) + return string(amount, str[0]); + + // For multi-character strings, reserve space upfront + string out; + out.reserve(str.size() * amount); + for (size_t i = 0; i < amount; ++i) + out.append(str); + + return out; +} + +string toLower(const string &str) { + string lower; + lower.reserve(str.size()); + for (char ch : str) { + lower += tolower((unsigned char)ch); + } + return lower; +} + +} // namespace + +namespace html2md { + +Converter::Converter(const string *html, Options *options) : html_(*html) { + if (options) + option = *options; + + md_.reserve(html->size() * 1.2); + tags_.reserve(41); + + // non-printing tags + auto tagIgnored = make_shared<Converter::TagIgnored>(); + tags_[kTagHead] = tagIgnored; + tags_[kTagMeta] = tagIgnored; + tags_[kTagNav] = tagIgnored; + tags_[kTagNoScript] = tagIgnored; + tags_[kTagScript] = tagIgnored; + tags_[kTagStyle] = tagIgnored; + tags_[kTagTemplate] = tagIgnored; + + // printing tags + tags_[kTagAnchor] = make_shared<Converter::TagAnchor>(); + tags_[kTagBreak] = make_shared<Converter::TagBreak>(); + tags_[kTagDiv] = make_shared<Converter::TagDiv>(); + tags_[kTagHeader1] = make_shared<Converter::TagHeader1>(); + tags_[kTagHeader2] = make_shared<Converter::TagHeader2>(); + tags_[kTagHeader3] = make_shared<Converter::TagHeader3>(); + tags_[kTagHeader4] = make_shared<Converter::TagHeader4>(); + tags_[kTagHeader5] = make_shared<Converter::TagHeader5>(); + tags_[kTagHeader6] = make_shared<Converter::TagHeader6>(); + tags_[kTagListItem] = make_shared<Converter::TagListItem>(); + tags_[kTagOption] = make_shared<Converter::TagOption>(); + tags_[kTagOrderedList] = 
make_shared<Converter::TagOrderedList>(); + tags_[kTagPre] = make_shared<Converter::TagPre>(); + tags_[kTagCode] = make_shared<Converter::TagCode>(); + tags_[kTagParagraph] = make_shared<Converter::TagParagraph>(); + tags_[kTagSpan] = make_shared<Converter::TagSpan>(); + tags_[kTagUnorderedList] = make_shared<Converter::TagUnorderedList>(); + tags_[kTagTitle] = make_shared<Converter::TagTitle>(); + tags_[kTagImg] = make_shared<Converter::TagImage>(); + tags_[kTagSeperator] = make_shared<Converter::TagSeperator>(); + + // Text formatting + auto tagBold = make_shared<Converter::TagBold>(); + tags_[kTagBold] = tagBold; + tags_[kTagStrong] = tagBold; + + auto tagItalic = make_shared<Converter::TagItalic>(); + tags_[kTagItalic] = tagItalic; + tags_[kTagItalic2] = tagItalic; + tags_[kTagDefinition] = tagItalic; + tags_[kTagCitation] = tagItalic; + + tags_[kTagUnderline] = make_shared<Converter::TagUnderline>(); + + auto tagStrighthrought = make_shared<Converter::TagStrikethrought>(); + tags_[kTagStrighthrought] = tagStrighthrought; + tags_[kTagStrighthrought2] = tagStrighthrought; + + tags_[kTagBlockquote] = make_shared<Converter::TagBlockquote>(); + + // Tables + tags_[kTagTable] = make_shared<Converter::TagTable>(); + tags_[kTagTableRow] = make_shared<Converter::TagTableRow>(); + tags_[kTagTableHeader] = make_shared<Converter::TagTableHeader>(); + tags_[kTagTableData] = make_shared<Converter::TagTableData>(); +} + +void Converter::CleanUpMarkdown() { + TidyAllLines(&md_); + std::string buffer; + buffer.reserve(md_.size()); + + // Replace HTML symbols during the initial pass unless the user requested + // to keep HTML entities intact (e.g. 
keep ` `) + if (!option.keepHtmlEntities) { + for (size_t i = 0; i < md_.size();) { + bool replaced = false; + + // C++11 compatible iteration over htmlSymbolConversions_ + for (const auto &symbol_replacement : htmlSymbolConversions_) { + const std::string &symbol = symbol_replacement.first; + const std::string &replacement = symbol_replacement.second; + + if (md_.compare(i, symbol.size(), symbol) == 0) { + buffer.append(replacement); + i += symbol.size(); + replaced = true; + break; + } + } + + if (!replaced) { + buffer.push_back(md_[i++]); + } + } + } else { + // Keep entities as-is: copy through without transforming + buffer.append(md_); + } + + // Use swap instead of move assignment for better pre-C++11 compatibility + md_.swap(buffer); + + // Optimized replacement sequence + // Note: Multiple simple passes are faster than one complex pass due to: + // - Better branch prediction + // - Better cache locality + // - Simpler instruction patterns + const char *replacements[][2] = { + {" , ", ", "}, {"\n.\n", ".\n"}, {"\n↵\n", " ↵\n"}, {"\n*\n", "\n"}, + {"\n. 
", ".\n"}, {"\t\t ", "\t\t"}, + }; + + for (const auto &replacement : replacements) { + ReplaceAll(&md_, replacement[0], replacement[1]); + } +} + +Converter *Converter::appendToMd(char ch) { + if (IsInIgnoredTag()) + return this; + + if (index_blockquote != 0 && ch == '\n') { + if (is_in_pre_) { + md_ += ch; + chars_in_curr_line_ = 0; + appendToMd(Repeat("> ", index_blockquote)); + } + + return this; + } + + md_ += ch; + + if (ch == '\n') + chars_in_curr_line_ = 0; + else + ++chars_in_curr_line_; + + return this; +} + +Converter *Converter::appendToMd(const char *str) { + if (IsInIgnoredTag()) + return this; + + md_ += str; + + auto str_len = strlen(str); + + // Efficiently update chars_in_curr_line_ by scanning for last newline + for (size_t i = 0; i < str_len; ++i) { + if (str[i] == '\n') + chars_in_curr_line_ = 0; + else + ++chars_in_curr_line_; + } + + return this; +} + +Converter *Converter::appendBlank() { + UpdatePrevChFromMd(); + + if (prev_ch_in_md_ == '\n' || + (prev_ch_in_md_ == '*' && prev_prev_ch_in_md_ == '*')) + return this; + + return appendToMd(' '); +} + +bool Converter::ok() const { + return !is_in_pre_ && !is_in_list_ && !is_in_p_ && !is_in_table_ && + !is_in_tag_ && index_blockquote == 0 && index_li == 0; +} + +void Converter::LTrim(string *s) { + (*s).erase((*s).begin(), + find_if((*s).begin(), (*s).end(), + [](unsigned char ch) { return !std::isspace((unsigned char)ch); })); +} + +Converter *Converter::RTrim(string *s, bool trim_only_blank) { + (*s).erase(find_if((*s).rbegin(), (*s).rend(), + [trim_only_blank](unsigned char ch) { + if (trim_only_blank) + return !isblank((unsigned char)ch); + + return !isspace((unsigned char)ch); + }) + .base(), + (*s).end()); + + return this; +} + +// NOTE: Pay attention when changing one of the trim functions. It can break the +// output! 
+Converter *Converter::Trim(string *s) { + if (!startsWith(*s, "\t") || option.forceLeftTrim) + LTrim(s); + + if (!(startsWith(*s, " "), endsWith(*s, " "))) + RTrim(s); + + return this; +} + +void Converter::TidyAllLines(string *str) { + if (str->empty()) + return; + + // Ensure input ends with newline to simplify logic + if (str->back() != '\n') { + str->push_back('\n'); + } + + size_t read = 0; + size_t write = 0; + size_t len = str->size(); + + uint8_t amount_newlines = 0; + bool in_code_block = false; + + while (read < len) { + size_t line_start = read; + size_t line_end = read; + + // Find end of line + while (line_end < len && (*str)[line_end] != '\n') { + line_end++; + } + + size_t line_len = line_end - line_start; + + // Check for code block markers + if (line_len >= 3) { + char c1 = (*str)[line_start]; + char c2 = (*str)[line_start + 1]; + char c3 = (*str)[line_start + 2]; + if ((c1 == '`' && c2 == '`' && c3 == '`') || + (c1 == '~' && c2 == '~' && c3 == '~')) { + in_code_block = !in_code_block; + } + } + + if (in_code_block) { + // Copy line as-is + if (write != line_start) { + for (size_t i = 0; i < line_len; ++i) { + (*str)[write + i] = (*str)[line_start + i]; + } + } + write += line_len; + (*str)[write++] = '\n'; + } else { + // Trim logic + size_t trim_start = line_start; + size_t trim_end = line_end; + + // Trim leading whitespace + if (option.forceLeftTrim || + (trim_start < trim_end && (*str)[trim_start] != '\t')) { + while (trim_start < trim_end && + std::isspace((unsigned char)(*str)[trim_start])) { + ++trim_start; + } + } + + // Trim trailing whitespace, preserve " " + bool has_line_break = false; + if (trim_end >= trim_start + 2 && (*str)[trim_end - 1] == ' ' && + (*str)[trim_end - 2] == ' ') { + has_line_break = true; + trim_end -= 2; + } + + while (trim_end > trim_start && + std::isspace((unsigned char)(*str)[trim_end - 1])) { + --trim_end; + } + + if (has_line_break) { + trim_end += 2; + } + + size_t trimmed_len = trim_end - trim_start; + + 
if (trimmed_len == 0) { + // Empty line + if (amount_newlines < 2 && write > 0) { + (*str)[write++] = '\n'; + amount_newlines++; + } + } else { + amount_newlines = 0; + if (write != trim_start) { + for (size_t i = 0; i < trimmed_len; ++i) { + (*str)[write + i] = (*str)[trim_start + i]; + } + } + write += trimmed_len; + (*str)[write++] = '\n'; + } + } + + read = line_end + 1; + } + + str->resize(write); +} + +string Converter::ExtractAttributeFromTagLeftOf(const string &attr) { + // Extract the whole tag from current offset, e.g. from '>', backwards + auto tag = html_.substr(offset_lt_, index_ch_in_html_ - offset_lt_); + string lowerTag = toLower(tag); // Convert tag to lowercase for comparison + + // locate given attribute (case-insensitive) + auto offset_attr = lowerTag.find(attr); + + if (offset_attr == string::npos) + return ""; + + // locate attribute-value pair's '=' + auto offset_equals = tag.find('=', offset_attr); + + if (offset_equals == string::npos) + return ""; + + // locate value's surrounding quotes + auto offset_double_quote = tag.find('"', offset_equals); + auto offset_single_quote = tag.find('\'', offset_equals); + + bool has_double_quote = offset_double_quote != string::npos; + bool has_single_quote = offset_single_quote != string::npos; + + if (!has_double_quote && !has_single_quote) + return ""; + + char wrapping_quote = 0; + + size_t offset_opening_quote = 0; + size_t offset_closing_quote = 0; + + if (has_double_quote) { + if (!has_single_quote) { + wrapping_quote = '"'; + offset_opening_quote = offset_double_quote; + } else { + if (offset_double_quote < offset_single_quote) { + wrapping_quote = '"'; + offset_opening_quote = offset_double_quote; + } else { + wrapping_quote = '\''; + offset_opening_quote = offset_single_quote; + } + } + } else { + // has only single quote + wrapping_quote = '\''; + offset_opening_quote = offset_single_quote; + } + + if (offset_opening_quote == string::npos) + return ""; + + offset_closing_quote = 
tag.find(wrapping_quote, offset_opening_quote + 1); + + if (offset_closing_quote == string::npos) + return ""; + + return tag.substr(offset_opening_quote + 1, + offset_closing_quote - 1 - offset_opening_quote); +} + +void Converter::TurnLineIntoHeader1() { + appendToMd('\n' + Repeat("=", chars_in_curr_line_) + "\n\n"); + + chars_in_curr_line_ = 0; +} + +void Converter::TurnLineIntoHeader2() { + appendToMd('\n' + Repeat("-", chars_in_curr_line_) + "\n\n"); + + chars_in_curr_line_ = 0; +} + +string Converter::convert() { + // We already converted + if (index_ch_in_html_ == html_.size()) + return md_; + + reset(); + + for (char ch : html_) { + ++index_ch_in_html_; + + if (!is_in_tag_ && ch == '<') { + OnHasEnteredTag(); + + continue; + } + + if (is_in_tag_) + ParseCharInTag(ch); + else + ParseCharInTagContent(ch); + } + + CleanUpMarkdown(); + + // Remove trailing double newline if present (keep only single newline) + if (md_.size() >= 2 && md_[md_.size() - 1] == '\n' && md_[md_.size() - 2] == '\n') { + md_.pop_back(); + } + + return md_; +} + +void Converter::OnHasEnteredTag() { + offset_lt_ = index_ch_in_html_; + is_in_tag_ = true; + is_closing_tag_ = false; + prev_tag_ = current_tag_; + current_tag_ = ""; + + if (!md_.empty()) { + UpdatePrevChFromMd(); + } +} + +Converter *Converter::UpdatePrevChFromMd() { + if (!md_.empty()) { + prev_ch_in_md_ = md_[md_.length() - 1]; + + if (md_.length() > 1) + prev_prev_ch_in_md_ = md_[md_.length() - 2]; + } + + return this; +} + +bool Converter::ParseCharInTag(char ch) { + + if (ch == '/' && !is_in_attribute_value_) { + is_closing_tag_ = current_tag_.empty(); + is_self_closing_tag_ = !is_closing_tag_; + skipping_leading_whitespace_ = true; // Reset for next tag + return true; + } + + if (ch == '>') { + // Trim trailing whitespace by removing characters from current_tag_ + while (!current_tag_.empty() && std::isspace(static_cast<unsigned char>(current_tag_.back()))) { + current_tag_.pop_back(); + } + skipping_leading_whitespace_ 
= true; // Reset for next tag + if (!is_self_closing_tag_) + return OnHasLeftTag(); + else { + OnHasLeftTag(); + is_self_closing_tag_ = false; + is_closing_tag_ = true; + return OnHasLeftTag(); + } + } + + if (ch == '"') { + if (is_in_attribute_value_) { + is_in_attribute_value_ = false; + } else { + size_t pos = current_tag_.length(); + while (pos > 0 && isspace((unsigned char)current_tag_[pos - 1])) { + pos--; + } + if (pos > 0 && current_tag_[pos - 1] == '=') { + is_in_attribute_value_ = true; + } + } + skipping_leading_whitespace_ = false; // Stop skipping after attribute + return true; + } + + // Handle whitespace: skip leading whitespace, keep others + if (isspace((unsigned char)ch) && skipping_leading_whitespace_) { + return true; // Ignore leading whitespace + } + + // Once we encounter a non-whitespace character, stop skipping + skipping_leading_whitespace_ = false; + current_tag_ += tolower((unsigned char)ch); + return false; +} + +bool Converter::OnHasLeftTag() { + is_in_tag_ = false; + + UpdatePrevChFromMd(); + + if (!is_closing_tag_) + if (TagContainsAttributesToHide(¤t_tag_)) + return true; + + // Extract tag name without Split() - just find first space + size_t space_pos = current_tag_.find(' '); + if (space_pos != string::npos) { + current_tag_ = current_tag_.substr(0, space_pos); + } + + if (current_tag_.empty()) + return true; + + auto tag = tags_[current_tag_]; + + if (!tag) + return true; + + if (!is_closing_tag_) { + tag->OnHasLeftOpeningTag(this); + } + else { + is_closing_tag_ = false; + + tag->OnHasLeftClosingTag(this); + } + + return true; +} + +Converter *Converter::ShortenMarkdown(size_t chars) { + md_ = md_.substr(0, md_.length() - chars); + + if (chars > chars_in_curr_line_) + chars_in_curr_line_ = 0; + else + chars_in_curr_line_ = chars_in_curr_line_ - chars; + + return this->UpdatePrevChFromMd(); +} + +bool Converter::ParseCharInTagContent(char ch) { + if (is_in_code_) { + md_ += ch; + + if (index_blockquote != 0 && ch == '\n') + 
appendToMd(Repeat("> ", index_blockquote)); + + return true; + } + + if (option.compressWhitespace && !is_in_pre_) { + if (ch == '\t') + ch = ' '; + + if (ch == ' ') { + UpdatePrevChFromMd(); + if (prev_ch_in_md_ == ' ' || prev_ch_in_md_ == '\n') + return true; + } + } + + if (IsInIgnoredTag() || current_tag_ == kTagLink) { + prev_ch_in_html_ = ch; + + return true; + } + + if (ch == '\n') { + if (index_blockquote != 0) { + md_ += '\n'; + chars_in_curr_line_ = 0; + appendToMd(Repeat("> ", index_blockquote)); + } + + return true; + } + + switch (ch) { + case '*': + appendToMd("\\*"); + break; + case '`': + appendToMd("\\`"); + break; + case '\\': + appendToMd("\\\\"); + break; + case '.': { + bool is_ordered_list_start = false; + if (chars_in_curr_line_ > 0) { + size_t start_idx = md_.length() - chars_in_curr_line_; + size_t idx = start_idx; + // Skip spaces + while (idx < md_.length() && isspace((unsigned char)md_[idx])) { + idx++; + } + // Check digits + bool has_digits = false; + while (idx < md_.length() && isdigit((unsigned char)md_[idx])) { + has_digits = true; + idx++; + } + // If we reached the end and had digits, it's a match + if (has_digits && idx == md_.length()) { + is_ordered_list_start = true; + } + } + + if (is_ordered_list_start && option.escapeNumberedList) { + appendToMd("\\."); + } else { + md_ += ch; + ++chars_in_curr_line_; + } + break; + } + default: + md_ += ch; + ++chars_in_curr_line_; + break; + } + + if (chars_in_curr_line_ > option.softBreak && !is_in_table_ && !is_in_list_ && + current_tag_ != kTagImg && current_tag_ != kTagAnchor && + option.splitLines) { + if (ch == ' ') { // If the next char is - it will become a list + md_ += '\n'; + chars_in_curr_line_ = 0; + } else if (chars_in_curr_line_ > option.hardBreak) { + ReplacePreviousSpaceInLineByNewline(); + } + } + + return false; +} + +bool Converter::ReplacePreviousSpaceInLineByNewline() { + if (current_tag_ == kTagParagraph || + is_in_table_ && (prev_tag_ != kTagCode && prev_tag_ != 
kTagPre)) + return false; + + auto offset = md_.length() - 1; + + if (md_.length() == 0) + return true; + + do { + if (md_[offset] == '\n') + return false; + + if (md_[offset] == ' ') { + md_[offset] = '\n'; + chars_in_curr_line_ = md_.length() - offset; + + return true; + } + + --offset; + } while (offset > 0); + + return false; +} + +void Converter::TagAnchor::OnHasLeftOpeningTag(Converter *c) { + if (c->prev_tag_ == kTagImg) + c->appendToMd('\n'); + + current_title_ = c->ExtractAttributeFromTagLeftOf(kAttributeTitle); + + c->appendToMd('['); + current_href_ = c->ExtractAttributeFromTagLeftOf(kAttributeHref); +} + +void Converter::TagAnchor::OnHasLeftClosingTag(Converter *c) { + if (!c->shortIfPrevCh('[')) { + c->appendToMd("](")->appendToMd(current_href_); + + // If title is set append it + if (!current_title_.empty()) { + c->appendToMd(" \"")->appendToMd(current_title_)->appendToMd('"'); + current_title_.clear(); + } + + c->appendToMd(')'); + + if (c->prev_tag_ == kTagImg) + c->appendToMd('\n'); + } +} + +void Converter::TagBold::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("**"); +} + +void Converter::TagBold::OnHasLeftClosingTag(Converter *c) { + c->appendToMd("**"); +} + +void Converter::TagItalic::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd('*'); +} + +void Converter::TagItalic::OnHasLeftClosingTag(Converter *c) { + c->appendToMd('*'); +} + +void Converter::TagUnderline::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("<u>"); +} + +void Converter::TagUnderline::OnHasLeftClosingTag(Converter *c) { + c->appendToMd("</u>"); +} + +void Converter::TagStrikethrought::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd('~'); +} + +void Converter::TagStrikethrought::OnHasLeftClosingTag(Converter *c) { + c->appendToMd('~'); +} + +void Converter::TagBreak::OnHasLeftOpeningTag(Converter *c) { + if (c->is_in_list_) { // When it's in a list, it's not in a paragraph + c->appendToMd(" \n"); + c->appendToMd(Repeat(" ", c->index_li)); + } else if 
(c->is_in_table_) { + c->appendToMd("<br>"); + } else if (!c->md_.empty()) + c->appendToMd(" \n"); +} + +void Converter::TagBreak::OnHasLeftClosingTag(Converter *c) {} + +void Converter::TagDiv::OnHasLeftOpeningTag(Converter *c) { + if (c->prev_ch_in_md_ != '\n') + c->appendToMd('\n'); + + if (c->prev_prev_ch_in_md_ != '\n') + c->appendToMd('\n'); +} + +void Converter::TagDiv::OnHasLeftClosingTag(Converter *c) {} + +void Converter::TagHeader1::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("\n# "); +} + +void Converter::TagHeader1::OnHasLeftClosingTag(Converter *c) { + if (c->prev_prev_ch_in_md_ != ' ') + c->appendToMd('\n'); +} + +void Converter::TagHeader2::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("\n## "); +} + +void Converter::TagHeader2::OnHasLeftClosingTag(Converter *c) { + if (c->prev_prev_ch_in_md_ != ' ') + c->appendToMd('\n'); +} + +void Converter::TagHeader3::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("\n### "); +} + +void Converter::TagHeader3::OnHasLeftClosingTag(Converter *c) { + if (c->prev_prev_ch_in_md_ != ' ') + c->appendToMd('\n'); +} + +void Converter::TagHeader4::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("\n#### "); +} + +void Converter::TagHeader4::OnHasLeftClosingTag(Converter *c) { + if (c->prev_prev_ch_in_md_ != ' ') + c->appendToMd('\n'); +} + +void Converter::TagHeader5::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("\n##### "); +} + +void Converter::TagHeader5::OnHasLeftClosingTag(Converter *c) { + if (c->prev_prev_ch_in_md_ != ' ') + c->appendToMd('\n'); +} + +void Converter::TagHeader6::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("\n###### "); +} + +void Converter::TagHeader6::OnHasLeftClosingTag(Converter *c) { + if (c->prev_prev_ch_in_md_ != ' ') + c->appendToMd('\n'); +} + +void Converter::TagListItem::OnHasLeftOpeningTag(Converter *c) { + if (c->is_in_table_) + return; + + if (!c->is_in_ordered_list_) { + c->appendToMd(string({c->option.unorderedList, ' '})); + return; + } + + 
++c->index_ol; + + string num = std::to_string(c->index_ol); + num.append({c->option.orderedList, ' '}); + c->appendToMd(num); +} + +void Converter::TagListItem::OnHasLeftClosingTag(Converter *c) { + if (c->is_in_table_) + return; + + if (c->prev_ch_in_md_ != '\n') + c->appendToMd('\n'); +} + +void Converter::TagOption::OnHasLeftOpeningTag(Converter *c) {} + +void Converter::TagOption::OnHasLeftClosingTag(Converter *c) { + if (c->md_.length() > 0) + c->appendToMd(" \n"); +} + +void Converter::TagOrderedList::OnHasLeftOpeningTag(Converter *c) { + if (c->is_in_table_) + return; + + c->is_in_list_ = true; + c->is_in_ordered_list_ = true; + c->index_ol = 0; + + ++c->index_li; + + c->ReplacePreviousSpaceInLineByNewline(); + + c->appendToMd('\n'); +} + +void Converter::TagOrderedList::OnHasLeftClosingTag(Converter *c) { + if (c->is_in_table_) + return; + + c->is_in_ordered_list_ = false; + + if (c->index_li != 0) + --c->index_li; + + c->is_in_list_ = c->index_li != 0; + + c->appendToMd('\n'); +} + +void Converter::TagParagraph::OnHasLeftOpeningTag(Converter *c) { + c->is_in_p_ = true; + + if (c->is_in_list_ && c->prev_tag_ == kTagParagraph) + c->appendToMd("\n\t"); + else if (!c->is_in_list_) + c->appendToMd('\n'); +} + +void Converter::TagParagraph::OnHasLeftClosingTag(Converter *c) { + c->is_in_p_ = false; + + if (!c->md_.empty()) + c->appendToMd("\n"); // Workaround \n restriction for blockquotes + + if (c->index_blockquote != 0) + c->appendToMd(Repeat("> ", c->index_blockquote)); +} + +void Converter::TagPre::OnHasLeftOpeningTag(Converter *c) { + c->is_in_pre_ = true; + + if (c->prev_ch_in_md_ != '\n') + c->appendToMd('\n'); + + if (c->prev_prev_ch_in_md_ != '\n') + c->appendToMd('\n'); + + if (c->is_in_list_ && c->prev_tag_ != kTagParagraph) + c->ShortenMarkdown(2); + + if (c->is_in_list_) + c->appendToMd("\t\t"); + else + c->appendToMd("```"); +} + +void Converter::TagPre::OnHasLeftClosingTag(Converter *c) { + c->is_in_pre_ = false; + + if (c->is_in_list_) + 
return; + + c->appendToMd("```"); + c->appendToMd('\n'); // Don't combine because of blockquote +} + +void Converter::TagCode::OnHasLeftOpeningTag(Converter *c) { + c->is_in_code_ = true; + + if (c->is_in_pre_) { + if (c->is_in_list_) + return; + + auto code = c->ExtractAttributeFromTagLeftOf(kAttributeClass); + if (!code.empty()) { + if (startsWith(code, "language-")) + code.erase(0, 9); // remove language- + c->appendToMd(code); + } + c->appendToMd('\n'); + } else + c->appendToMd('`'); +} + +void Converter::TagCode::OnHasLeftClosingTag(Converter *c) { + c->is_in_code_ = false; + + if (c->is_in_pre_) + return; + + c->appendToMd('`'); +} + +void Converter::TagSpan::OnHasLeftOpeningTag(Converter *c) {} + +void Converter::TagSpan::OnHasLeftClosingTag(Converter *c) {} + +void Converter::TagTitle::OnHasLeftOpeningTag(Converter *c) {} + +void Converter::TagTitle::OnHasLeftClosingTag(Converter *c) { + c->TurnLineIntoHeader1(); +} + +void Converter::TagUnorderedList::OnHasLeftOpeningTag(Converter *c) { + if (c->is_in_list_ || c->is_in_table_) + return; + + c->is_in_list_ = true; + + ++c->index_li; + + c->appendToMd('\n'); +} + +void Converter::TagUnorderedList::OnHasLeftClosingTag(Converter *c) { + if (c->is_in_table_) + return; + + if (c->index_li != 0) + --c->index_li; + + c->is_in_list_ = c->index_li != 0; + + if (c->prev_prev_ch_in_md_ == '\n' && c->prev_ch_in_md_ == '\n') + c->ShortenMarkdown(); + else if (c->prev_ch_in_md_ != '\n') + c->appendToMd('\n'); +} + +void Converter::TagImage::OnHasLeftOpeningTag(Converter *c) { + if (c->prev_tag_ != kTagAnchor && c->prev_ch_in_md_ != '\n') + c->appendToMd('\n'); + + c->appendToMd("![") + ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeAlt)) + ->appendToMd("](") + ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeSrc)); + + auto title = c->ExtractAttributeFromTagLeftOf(kAttributeTitle); + if (!title.empty()) { + c->appendToMd(" \"")->appendToMd(title)->appendToMd('"'); + } + + c->appendToMd(")"); +} + +void 
Converter::TagImage::OnHasLeftClosingTag(Converter *c) { + if (c->prev_tag_ == kTagAnchor) + c->appendToMd('\n'); +} + +void Converter::TagSeperator::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("\n---\n"); // NOTE: We can make this an option +} + +void Converter::TagSeperator::OnHasLeftClosingTag(Converter *c) {} + +void Converter::TagTable::OnHasLeftOpeningTag(Converter *c) { + c->is_in_table_ = true; + c->appendToMd('\n'); + c->table_start = c->md_.length(); // Set start AFTER the newline +} + +void Converter::TagTable::OnHasLeftClosingTag(Converter *c) { + c->is_in_table_ = false; + c->appendToMd('\n'); + + if (!c->option.formatTable) + return; + + string table = c->md_.substr(c->table_start); + table = formatMarkdownTable(table); + c->ShortenMarkdown(c->md_.size() - c->table_start); + c->appendToMd(table); +} + +void Converter::TagTableRow::OnHasLeftOpeningTag(Converter *c) { + // Don't add newline here - it creates empty rows + // The newline is added by the closing tag of the previous row +} + +void Converter::TagTableRow::OnHasLeftClosingTag(Converter *c) { + c->UpdatePrevChFromMd(); + + // Always close the row with a pipe and space, then newline + if (c->prev_ch_in_md_ != '|') { + c->appendToMd(" |"); + } + c->appendToMd('\n'); + + if (!c->tableLine.empty()) { + c->tableLine.append("|\n"); + c->appendToMd(c->tableLine); + c->tableLine.clear(); + } +} + + +void Converter::TagTableHeader::OnHasLeftOpeningTag(Converter *c) { + auto align = c->ExtractAttributeFromTagLeftOf(kAttrinuteAlign); + + string line = "| "; + + if (align == "left" || align == "center") + line += ':'; + + line += '-'; + + if (align == "right" || align == "center") + line += ": "; + else + line += ' '; + + c->tableLine.append(line); + + c->appendToMd("| "); +} + +void Converter::TagTableHeader::OnHasLeftClosingTag(Converter *c) { + c->appendToMd(" "); +} + + +void Converter::TagTableData::OnHasLeftOpeningTag(Converter *c) { + c->appendToMd("| "); +} + + +void 
Converter::TagTableData::OnHasLeftClosingTag(Converter *c) { + c->appendToMd(" "); +} + + +void Converter::TagBlockquote::OnHasLeftOpeningTag(Converter *c) { + ++c->index_blockquote; + c->appendToMd("\n"); + c->appendToMd(Repeat("> ", c->index_blockquote)); +} + +void Converter::TagBlockquote::OnHasLeftClosingTag(Converter *c) { + --c->index_blockquote; + // Only shorten if a "> " was added (i.e., a newline was processed in the blockquote) + if (!c->md_.empty() && c->md_.length() >= 2 && + c->md_.substr(c->md_.length() - 2) == "> ") { + c->ShortenMarkdown(2); // Remove the '> ' only if it exists + } +} + +void Converter::reset() { + md_.clear(); + prev_ch_in_md_ = 0; + prev_prev_ch_in_md_ = 0; + index_ch_in_html_ = 0; +} + +bool Converter::IsInIgnoredTag() const { + if (current_tag_ == kTagTitle && !option.includeTitle) + return true; + + return IsIgnoredTag(current_tag_); +} +} // namespace html2md diff --git a/packages/html/src/table.cpp b/packages/html/src/table.cpp new file mode 100644 index 0000000..9a1ccdc --- /dev/null +++ b/packages/html/src/table.cpp @@ -0,0 +1,106 @@ +// Copyright (c) Tim Gromeyer +// Licensed under the MIT License - https://opensource.org/licenses/MIT + +#include "html/table.h" + +#include <iomanip> +#include <iostream> +#include <sstream> +#include <vector> + +using std::string; +using std::vector; + +const size_t MIN_LINE_LENGTH = 3; // Minimum length of line + +void removeLeadingTrailingSpaces(string &str) { + size_t firstNonSpace = str.find_first_not_of(' '); + if (firstNonSpace == string::npos) { + str.clear(); // Entire string is spaces + return; + } + + size_t lastNonSpace = str.find_last_not_of(' '); + str = str.substr(firstNonSpace, lastNonSpace - firstNonSpace + 1); +} + +string enlargeTableHeaderLine(const string &str, size_t length) { + if (str.empty() || length < MIN_LINE_LENGTH) + return ""; + + size_t first = str.find_first_of(':'); + size_t last = str.find_last_of(':'); + + if (first == 0 && first == last) + last = 
string::npos; + + string line = string(length, '-'); + + if (first == 0) + line[0] = ':'; + if (last == str.length() - 1) + line[length - 1] = ':'; + + return line; +} + +string formatMarkdownTable(const string &inputTable) { + std::istringstream iss(inputTable); + string line; + vector<vector<string>> tableData; + + // Parse the input table into a 2D vector + while (std::getline(iss, line)) { + std::istringstream lineStream(line); + string cell; + vector<string> rowData; + + while (std::getline(lineStream, cell, '|')) { + removeLeadingTrailingSpaces(cell); // Trim first + if (!cell.empty()) { // Then check if empty + rowData.push_back(cell); + } + } + + if (!rowData.empty()) { + tableData.push_back(std::move(rowData)); // Move rowData to avoid copying + } + } + + if (tableData.empty()) { + return ""; + } + + // Determine maximum width of each column + vector<size_t> columnWidths(tableData[0].size(), 0); + for (const auto &row : tableData) { + if (columnWidths.size() < row.size()) { + columnWidths.resize(row.size(), 0); + } + + for (size_t i = 0; i < row.size(); ++i) { + columnWidths[i] = std::max(columnWidths[i], row[i].size()); + } + } + + // Build the formatted table + std::ostringstream formattedTable; + for (size_t rowNumber = 0; rowNumber < tableData.size(); ++rowNumber) { + const auto &row = tableData[rowNumber]; + + formattedTable << "|"; + + for (size_t i = 0; i < row.size(); ++i) { + if (rowNumber == 1) { + formattedTable << enlargeTableHeaderLine(row[i], columnWidths[i] + 2) + << "|"; + continue; + } + formattedTable << " " << std::setw(columnWidths[i]) << std::left << row[i] + << " |"; + } + formattedTable << "\n"; + } + + return formattedTable.str(); +} diff --git a/packages/http/CMakeLists.txt b/packages/http/CMakeLists.txt index afcd299..9b170f5 100644 --- a/packages/http/CMakeLists.txt +++ b/packages/http/CMakeLists.txt @@ -14,9 +14,14 @@ set(BUILD_CURL_EXE OFF CACHE BOOL "" FORCE) set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE) set(BUILD_TESTING OFF 
CACHE BOOL "" FORCE) -# TLS backend: Windows native SChannel -set(CURL_USE_OPENSSL OFF CACHE BOOL "" FORCE) -set(CURL_USE_SCHANNEL ON CACHE BOOL "" FORCE) +# TLS backend: platform-appropriate +if(WIN32) + set(CURL_USE_OPENSSL OFF CACHE BOOL "" FORCE) + set(CURL_USE_SCHANNEL ON CACHE BOOL "" FORCE) +else() + set(CURL_USE_SCHANNEL OFF CACHE BOOL "" FORCE) + set(CURL_USE_OPENSSL ON CACHE BOOL "" FORCE) +endif() # Disable optional compression/protocol deps set(CURL_ZLIB OFF CACHE BOOL "" FORCE) diff --git a/packages/http/include/http/http.h b/packages/http/include/http/http.h index 3f4f764..1f848d6 100644 --- a/packages/http/include/http/http.h +++ b/packages/http/include/http/http.h @@ -9,11 +9,32 @@ struct Response { std::string body; }; +/// Options for customisable HTTP GET requests. +struct GetOptions { + std::string user_agent = "Mozilla/5.0 (compatible; PolymechBot/1.0)"; + int timeout_ms = 10000; + bool follow_redirects = true; +}; + /// Perform an HTTP GET request. Returns the response body and status code. Response get(const std::string &url); +/// Perform an HTTP GET request with custom options. +Response get(const std::string &url, const GetOptions &opts); + /// Perform an HTTP POST request with a body. Returns the response and status. Response post(const std::string &url, const std::string &body, const std::string &content_type = "application/json"); +/// Options for customisable HTTP POST requests. +struct PostOptions { + std::string content_type = "application/json"; + std::string bearer_token; // Authorization: Bearer <token> + int timeout_ms = 30000; +}; + +/// Perform an HTTP POST request with custom options. 
+Response post(const std::string &url, const std::string &body, + const PostOptions &opts); + } // namespace http diff --git a/packages/http/src/http.cpp b/packages/http/src/http.cpp index da848de..f0c3fe7 100644 --- a/packages/http/src/http.cpp +++ b/packages/http/src/http.cpp @@ -1,9 +1,53 @@ #include "http/http.h" #include <curl/curl.h> +#include <mutex> +#include <chrono> namespace http { +static std::once_flag curl_init_flag; +static void ensure_curl_init() { + std::call_once(curl_init_flag, []() { + curl_global_init(CURL_GLOBAL_ALL); + }); +} + +struct ThreadLocalCurl { + CURL *handle; + ThreadLocalCurl() { + ensure_curl_init(); + handle = curl_easy_init(); + } + ~ThreadLocalCurl() { + if (handle) curl_easy_cleanup(handle); + } + CURL *get() { + if (handle) curl_easy_reset(handle); + return handle; + } +}; + +thread_local ThreadLocalCurl tl_curl; + +struct ProgressData { + std::chrono::steady_clock::time_point start_time; + int timeout_ms; +}; + +static int progress_cb(void *clientp, curl_off_t dltotal, curl_off_t dlnow, + curl_off_t ultotal, curl_off_t ulnow) { + auto *pd = static_cast<ProgressData *>(clientp); + if (pd->timeout_ms <= 0) return 0; + + auto now = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - pd->start_time).count(); + if (elapsed > pd->timeout_ms) { + return 1; // Return non-zero to abort the transfer + } + return 0; // Continue +} + static size_t write_cb(void *contents, size_t size, size_t nmemb, void *userp) { auto *out = static_cast<std::string *>(userp); out->append(static_cast<char *>(contents), size * nmemb); @@ -11,20 +55,50 @@ static size_t write_cb(void *contents, size_t size, size_t nmemb, void *userp) { } Response get(const std::string &url) { + return get(url, GetOptions{}); +} + +Response get(const std::string &url, const GetOptions &opts) { Response resp{}; - CURL *curl = curl_easy_init(); + CURL *curl = tl_curl.get(); if (!curl) { resp.status_code = -1; - 
resp.body = "curl_easy_init failed"; + resp.body = "curl_easy_init (thread_local) failed"; return resp; } curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, opts.follow_redirects ? 1L : 0L); + + ProgressData prog_data; + if (opts.timeout_ms > 0) { + curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, static_cast<long>(opts.timeout_ms)); + prog_data.start_time = std::chrono::steady_clock::now(); + prog_data.timeout_ms = opts.timeout_ms + 1000; + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb); + curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data); + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + } + + // Fail fast on dead sites (TCP SYN timeout) + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT_MS, 5000L); + + // Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L); + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L); + + // Prevent signal handlers from breaking in multithreaded environments + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L); + + if (!opts.user_agent.empty()) { + curl_easy_setopt(curl, CURLOPT_USERAGENT, opts.user_agent.c_str()); + } + + // Accept-Encoding for compressed responses + curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, ""); CURLcode res = curl_easy_perform(curl); if (res != CURLE_OK) { @@ -34,7 +108,6 @@ Response get(const std::string &url) { curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code); } - curl_easy_cleanup(curl); return resp; } @@ -42,7 +115,7 @@ Response post(const std::string &url, const std::string &body, const std::string &content_type) { Response resp{}; - CURL *curl = curl_easy_init(); + CURL *curl = tl_curl.get(); if (!curl) { resp.status_code = -1; resp.body = 
"curl_easy_init failed"; @@ -61,6 +134,73 @@ Response post(const std::string &url, const std::string &body, curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L); + ProgressData prog_data; + prog_data.start_time = std::chrono::steady_clock::now(); + prog_data.timeout_ms = 11000; + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb); + curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data); + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + + // Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L); + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L); + + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + resp.status_code = -1; + resp.body = curl_easy_strerror(res); + } else { + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code); + } + + curl_slist_free_all(headers); + return resp; +} + +Response post(const std::string &url, const std::string &body, + const PostOptions &opts) { + Response resp{}; + + CURL *curl = tl_curl.get(); + if (!curl) { + resp.status_code = -1; + resp.body = "curl_easy_init failed"; + return resp; + } + + struct curl_slist *headers = nullptr; + headers = + curl_slist_append(headers, ("Content-Type: " + opts.content_type).c_str()); + if (!opts.bearer_token.empty()) { + headers = curl_slist_append( + headers, ("Authorization: Bearer " + opts.bearer_token).c_str()); + headers = curl_slist_append( + headers, ("x-api-token: " + opts.bearer_token).c_str()); + } + + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + + ProgressData prog_data; + if 
(opts.timeout_ms > 0) { + curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, static_cast<long>(opts.timeout_ms)); + prog_data.start_time = std::chrono::steady_clock::now(); + prog_data.timeout_ms = opts.timeout_ms + 1000; + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb); + curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data); + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + } + + // Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L); + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L); + CURLcode res = curl_easy_perform(curl); if (res != CURLE_OK) { resp.status_code = -1; @@ -70,7 +210,6 @@ Response post(const std::string &url, const std::string &body, } curl_slist_free_all(headers); - curl_easy_cleanup(curl); return resp; } diff --git a/packages/logger/include/logger/logger.h b/packages/logger/include/logger/logger.h index 3132ada..0d1840a 100644 --- a/packages/logger/include/logger/logger.h +++ b/packages/logger/include/logger/logger.h @@ -5,10 +5,10 @@ namespace logger { /// Initialize the default logger (call once at startup). -void init(const std::string &app_name = "polymech"); +void init(const std::string &app_name = "polymech", const std::string &log_level = "info"); /// Initialize logger with stderr sink (use in worker/IPC mode). -void init_stderr(const std::string &app_name = "polymech-worker"); +void init_stderr(const std::string &app_name = "polymech-worker", const std::string &log_level = "info"); /// Log at various levels. 
void info(const std::string &msg); diff --git a/packages/logger/src/logger.cpp b/packages/logger/src/logger.cpp index fa53180..522ad63 100644 --- a/packages/logger/src/logger.cpp +++ b/packages/logger/src/logger.cpp @@ -6,17 +6,24 @@ namespace logger { -void init(const std::string &app_name) { +static void apply_log_level(const std::string& level) { + if (level == "debug") spdlog::set_level(spdlog::level::debug); + else if (level == "warn") spdlog::set_level(spdlog::level::warn); + else if (level == "error") spdlog::set_level(spdlog::level::err); + else spdlog::set_level(spdlog::level::info); +} + +void init(const std::string &app_name, const std::string &log_level) { auto console = spdlog::stdout_color_mt(app_name); spdlog::set_default_logger(console); - spdlog::set_level(spdlog::level::debug); + apply_log_level(log_level); spdlog::set_pattern("[%H:%M:%S] [%^%l%$] %v"); } -void init_stderr(const std::string &app_name) { +void init_stderr(const std::string &app_name, const std::string &log_level) { auto console = spdlog::stderr_color_mt(app_name); spdlog::set_default_logger(console); - spdlog::set_level(spdlog::level::debug); + apply_log_level(log_level); spdlog::set_pattern("[%H:%M:%S] [%^%l%$] %v"); } diff --git a/packages/postgres/include/postgres/postgres.h b/packages/postgres/include/postgres/postgres.h index 79e089f..fe84d04 100644 --- a/packages/postgres/include/postgres/postgres.h +++ b/packages/postgres/include/postgres/postgres.h @@ -31,4 +31,16 @@ std::string query(const std::string &table, const std::string &select = "*", /// Returns the created row as JSON. std::string insert(const std::string &table, const std::string &json_body); +/// Upsert a row into a table. Body is a JSON array or object string. +/// Returns the upserted array as JSON. +std::string upsert(const std::string &table, const std::string &json_body, const std::string &on_conflict = ""); + +/// Update rows in a table. Body is a JSON object string. +/// Returns the updated rows as JSON. 
+std::string update(const std::string &table, const std::string &json_body, const std::string &filter); + +/// Delete rows from a table. +/// Returns the deleted rows as JSON. +std::string del(const std::string &table, const std::string &filter); + } // namespace postgres diff --git a/packages/postgres/src/postgres.cpp b/packages/postgres/src/postgres.cpp index e0d513d..dfa4021 100644 --- a/packages/postgres/src/postgres.cpp +++ b/packages/postgres/src/postgres.cpp @@ -82,9 +82,11 @@ static http::Response supabase_get(const std::string &url) { return resp; } -/// Make an authenticated POST request. -static http::Response supabase_post(const std::string &url, - const std::string &body) { +/// Make an authenticated request with a JSON body (POST, PATCH, DELETE). +static http::Response supabase_request(const std::string &method, + const std::string &url, + const std::string &body, + const std::string &prefer_header) { CURL *curl = curl_easy_init(); http::Response resp{}; if (!curl) { @@ -94,8 +96,12 @@ static http::Response supabase_post(const std::string &url, } struct curl_slist *headers = nullptr; - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, "Prefer: return=representation"); + if (!body.empty()) { + headers = curl_slist_append(headers, "Content-Type: application/json"); + } + if (!prefer_header.empty()) { + headers = curl_slist_append(headers, ("Prefer: " + prefer_header).c_str()); + } headers = curl_slist_append(headers, ("apikey: " + s_config.supabase_key).c_str()); headers = curl_slist_append( @@ -108,8 +114,11 @@ static http::Response supabase_post(const std::string &url, }; curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method.c_str()); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str()); + if (!body.empty()) { + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str()); + } 
curl_easy_setopt( curl, CURLOPT_WRITEFUNCTION, static_cast<size_t (*)(void *, size_t, size_t, void *)>(+write_cb)); @@ -164,7 +173,7 @@ std::string insert(const std::string &table, const std::string &json_body) { auto url = s_config.supabase_url + "/rest/v1/" + table; logger::debug("postgres::insert → " + url); - auto resp = supabase_post(url, json_body); + auto resp = supabase_request("POST", url, json_body, "return=representation"); if (resp.status_code >= 200 && resp.status_code < 300) { return resp.body; } @@ -173,4 +182,55 @@ std::string insert(const std::string &table, const std::string &json_body) { return resp.body; } +std::string upsert(const std::string &table, const std::string &json_body, const std::string &on_conflict) { + ensure_init(); + auto url = s_config.supabase_url + "/rest/v1/" + table; + if (!on_conflict.empty()) { + url += "?on_conflict=" + on_conflict; + } + logger::debug("postgres::upsert → " + url); + + auto resp = supabase_request("POST", url, json_body, "return=minimal, resolution=merge-duplicates"); + if (resp.status_code >= 200 && resp.status_code < 300) { + return resp.body; + } + logger::error("postgres::upsert → HTTP " + std::to_string(resp.status_code) + + ": " + resp.body); + return resp.body; +} + +std::string update(const std::string &table, const std::string &json_body, const std::string &filter) { + ensure_init(); + auto url = s_config.supabase_url + "/rest/v1/" + table; + if (!filter.empty()) { + url += "?" 
+ filter; + } + logger::debug("postgres::update → " + url); + + auto resp = supabase_request("PATCH", url, json_body, "return=representation"); + if (resp.status_code >= 200 && resp.status_code < 300) { + return resp.body; + } + logger::error("postgres::update → HTTP " + std::to_string(resp.status_code) + + ": " + resp.body); + return resp.body; +} + +std::string del(const std::string &table, const std::string &filter) { + ensure_init(); + auto url = s_config.supabase_url + "/rest/v1/" + table; + if (!filter.empty()) { + url += "?" + filter; + } + logger::debug("postgres::del → " + url); + + auto resp = supabase_request("DELETE", url, "", "return=representation"); + if (resp.status_code >= 200 && resp.status_code < 300) { + return resp.body; + } + logger::error("postgres::del → HTTP " + std::to_string(resp.status_code) + + ": " + resp.body); + return resp.body; +} + } // namespace postgres diff --git a/packages/search/include/search/search.h b/packages/search/include/search/search.h index 1723118..3a0e136 100644 --- a/packages/search/include/search/search.h +++ b/packages/search/include/search/search.h @@ -25,6 +25,8 @@ struct MapResult { int reviews = 0; GpsCoordinates gps; std::string thumbnail; + std::string raw_json; + std::string geo_json; }; struct SearchResult { @@ -35,17 +37,35 @@ struct SearchResult { // ── Config ────────────────────────────────────────────────────────────────── +struct SystemTuningOptions { + int executor_threads = 0; // 0 = hardware concurrency + int max_concurrent_jobs_per_user = 10; + int http_concurrency_throttle = 50; + int queue_depth_max = 10000; + int bulk_dequeue_size = 1; + int ipc_timeout_ms = 300000; + int max_ipc_connections = 100; + int buffer_size_max = 50 * 1024 * 1024; +}; + struct Config { + SystemTuningOptions system; std::string serpapi_key; std::string geocoder_key; std::string bigdata_key; + std::string scrapeless_key; std::string postgres_url; std::string supabase_url; std::string supabase_service_key; + // 
[enricher] + std::string enricher_meta_scraper; + int enricher_meta_concurrency = 5; + int enricher_meta_idle_timeout = 60; + int enricher_location_concurrency = 1; }; /// Load config from a TOML file (e.g. config/postgres.toml) -Config load_config(const std::string& path = "config/postgres.toml"); +Config load_config(const std::string &path = "config/postgres.toml"); // ── Search API ────────────────────────────────────────────────────────────── @@ -61,6 +81,13 @@ struct SearchOptions { }; /// Execute a SerpAPI Google Maps search. Handles pagination up to opts.limit. -SearchResult search_google_maps(const Config& cfg, const SearchOptions& opts); +SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts); + +/// Resolve geo coordinate to place info +std::string resolve_geo(double lat, double lng, const std::string &key, + int timeout_ms = 3000); + +void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key, + int concurrency = 10, int timeout_ms = 3000); } // namespace search diff --git a/packages/search/src/search.cpp b/packages/search/src/search.cpp index 62656fe..93d1ee1 100644 --- a/packages/search/src/search.cpp +++ b/packages/search/src/search.cpp @@ -6,6 +6,12 @@ #include <sstream> #include <cstdio> +#include <iostream> +#include <rapidjson/stringbuffer.h> +#include <rapidjson/writer.h> +#include <thread> +#include <mutex> +#include <atomic> namespace search { @@ -15,7 +21,7 @@ static std::string url_encode(const std::string& val) { std::string result; result.reserve(val.size() * 2); for (unsigned char c : val) { - if (isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') { + if (isalnum(static_cast<unsigned char>(c)) || c == '-' || c == '_' || c == '.' 
|| c == '~') { result += static_cast<char>(c); } else { char buf[4]; @@ -44,10 +50,26 @@ Config load_config(const std::string& path) { if (auto v = tbl["services"]["SERPAPI_KEY"].value<std::string>()) cfg.serpapi_key = *v; if (auto v = tbl["services"]["GEO_CODER_KEY"].value<std::string>()) cfg.geocoder_key = *v; if (auto v = tbl["services"]["BIG_DATA_KEY"].value<std::string>()) cfg.bigdata_key = *v; + if (auto v = tbl["services"]["SCRAPELESS_KEY"].value<std::string>()) cfg.scrapeless_key = *v; + + // [enricher] + if (auto v = tbl["enricher"]["ENRICHER_META_SCRAPER"].value<std::string>()) cfg.enricher_meta_scraper = *v; + if (auto v = tbl["enricher"]["ENRICHER_META_CONCURRENCY"].value<int>()) cfg.enricher_meta_concurrency = *v; + if (auto v = tbl["enricher"]["ENRICHER_META_IDLE_TIMEOUT"].value<int>()) cfg.enricher_meta_idle_timeout = *v; + if (auto v = tbl["enricher"]["ENRICHER_LOCATION_CONCURRENCY"].value<int>()) cfg.enricher_location_concurrency = *v; + + // [system] + if (auto v = tbl["system"]["executor_threads"].value<int>()) cfg.system.executor_threads = *v; + if (auto v = tbl["system"]["max_concurrent_jobs_per_user"].value<int>()) cfg.system.max_concurrent_jobs_per_user = *v; + if (auto v = tbl["system"]["http_concurrency_throttle"].value<int>()) cfg.system.http_concurrency_throttle = *v; + if (auto v = tbl["system"]["queue_depth_max"].value<int>()) cfg.system.queue_depth_max = *v; + if (auto v = tbl["system"]["bulk_dequeue_size"].value<int>()) cfg.system.bulk_dequeue_size = *v; + if (auto v = tbl["system"]["ipc_timeout_ms"].value<int>()) cfg.system.ipc_timeout_ms = *v; + if (auto v = tbl["system"]["max_ipc_connections"].value<int>()) cfg.system.max_ipc_connections = *v; + if (auto v = tbl["system"]["buffer_size_max"].value<int>()) cfg.system.buffer_size_max = *v; } catch (const toml::parse_error& err) { - // Config file missing or malformed — caller should check empty keys - (void)err; + std::cerr << "[config] TOML parse error in " << path << ": " << 
err.what() << "\n"; } return cfg; } @@ -86,6 +108,13 @@ static void parse_results(const rapidjson::Value& arr, std::vector<MapResult>& o if (!obj.IsObject()) continue; MapResult r; + + // Capture raw JSON string + rapidjson::StringBuffer buf; + rapidjson::Writer<rapidjson::StringBuffer> writer(buf); + obj.Accept(writer); + r.raw_json = std::string(buf.GetString(), buf.GetSize()); + if (obj.HasMember("title") && obj["title"].IsString()) r.title = obj["title"].GetString(); if (obj.HasMember("place_id") && obj["place_id"].IsString()) @@ -196,4 +225,49 @@ SearchResult search_google_maps(const Config& cfg, const SearchOptions& opts) { return result; } +// ── Geo enrichment ────────────────────────────────────────────────────────── + +std::string resolve_geo(double lat, double lng, const std::string& key, int timeout_ms) { + if (key.empty()) return "{}"; + char url[512]; + snprintf(url, sizeof(url), + "https://api.bigdatacloud.net/data/reverse-geocode?latitude=%.7f&longitude=%.7f&localityLanguage=en&key=%s", + lat, lng, key.c_str()); + + http::GetOptions opts; + opts.timeout_ms = timeout_ms; + auto resp = http::get(url, opts); + if (resp.status_code == 200 && !resp.body.empty()) { + return resp.body; + } + return "{}"; +} + +void resolve_geo_batch(std::vector<MapResult>& results, const std::string& key, int concurrency, int timeout_ms) { + if (key.empty() || results.empty()) return; + + std::atomic<size_t> current_idx{0}; + std::vector<std::thread> threads; + + int num_threads = std::min<int>(concurrency, static_cast<int>(results.size())); + + for (int i = 0; i < num_threads; ++i) { + threads.emplace_back([&]() { + while (true) { + size_t idx = current_idx.fetch_add(1); + if (idx >= results.size()) break; + + auto& r = results[idx]; + if (r.gps.lat != 0 || r.gps.lng != 0) { + r.geo_json = resolve_geo(r.gps.lat, r.gps.lng, key, timeout_ms); + } + } + }); + } + + for (auto& t : threads) { + if (t.joinable()) t.join(); + } +} + } // namespace search diff --git 
a/docs/polymech.md b/polymech.md similarity index 67% rename from docs/polymech.md rename to polymech.md index 0ac2563..87f4252 100644 --- a/docs/polymech.md +++ b/polymech.md @@ -15,8 +15,8 @@ Port the [gridsearch-worker.ts](../src/products/locations/gridsearch-worker.ts) | `grid` | ✅ Done | 13 | 105 | | `search` | ✅ Done | 8 | 13 | | CLI `gridsearch` | ✅ Done | — | dry-run verified (3ms) | -| IPC `gridsearch` | 🔧 Stub | — | routes msg, TODO: parse payload | -| **Total** | | **62** | **248** | +| IPC `gridsearch` | ✅ Done | 1 | 30 | +| **Total** | | **63** | **278** | --- @@ -44,7 +44,7 @@ GADM Resolve → Grid Generate → SerpAPI Search → Enrich → Supabase Upsert | **1. GADM Resolve** | GID list + target level | `GridFeature[]` (GeoJSON polygons with GHS props) | Read pre-cached JSON files from `cache/gadm/boundary_{GID}_{LEVEL}.json` | | **2. Grid Generate** | `GridFeature[]` + settings | `GridSearchHop[]` (waypoints: lat/lng/radius) | Centroid, bbox, distance, area, point-in-polygon, cell sorting | | **3. Search** | Waypoints + query + SerpAPI key | Place results (JSON) | HTTP calls to `serpapi.com`, per-waypoint caching | -| **4. Enrich** | Place results | Enriched data (emails, pages) | HTTP scraping — **defer to Phase 2** | +| **4. Enrich** | Place results | Enriched data (emails, pages) | HTTP scraping | | **5. Persist** | Enriched places | Supabase `places` + `grid_search_runs` | PostgREST upsert | --- @@ -168,14 +168,16 @@ Reads `[services].SERPAPI_KEY`, `GEO_CODER_KEY`, `BIG_DATA_KEY` from `config/pos --- -## CLI Subcommand: `gridsearch` ✅ +## CLI Subcommands ✅ + +### 1. `gridsearch` (One-shot execution) ``` polymech-cli gridsearch <GID> <QUERY> [OPTIONS] Positionals: - GID GADM GID (e.g. ESP.1.1_1) - QUERY Search query (e.g. 'mecanizado cnc') + GID GADM GID (e.g. 
ESP.1.1_1) — ignored when --settings is used + QUERY Search query — ignored when --settings is used Options: -l, --level INT Target GADM level (default: 0) @@ -186,9 +188,25 @@ Options: --sort TEXT Path order: snake|zigzag|spiral-out|spiral-in|shortest -c, --config TEXT TOML config path (default: config/postgres.toml) --cache-dir TEXT GADM cache directory (default: cache/gadm) + --settings TEXT JSON settings file (matches TypeScript GuidedPreset shape) + --enrich Run enrichment pipeline (meta + email) after search + --persistence-postgres Persist run data natively via Postgres + -o, --output TEXT Output JSON file (default: gridsearch-HH-MM.json in cwd) --dry-run Generate grid only, skip SerpAPI search ``` +### 2. `worker` (IPC Daemon execution) + +``` +polymech-cli worker [OPTIONS] + +Options: + --daemon Run persistent daemon pool (tier-based) + -c, --config TEXT TOML config path (default: config/postgres.toml) + --user-uid TEXT User ID to bind this daemon to (needed for place owner) + --uds TEXT Run over Unix Domain Socket / Named Pipe (TCP on Windows) at the given path +``` + ### Execution flow ``` @@ -210,7 +228,27 @@ polymech-cli gridsearch ABW "recycling" --dry-run ### IPC worker mode -The `worker` subcommand routes `gridsearch` message type (currently echoes payload — TODO: wire full pipeline from parsed JSON). +The `worker` subcommand natively routes multiplexed asynchronous `gridsearch` payloads. When launched via `--uds <path>`, it provisions a high-performance Asio streaming server (AF_UNIX sockets on POSIX, TCP sockets on Windows). Event frames (`grid-ready`, `waypoint-start`, `location`, `node`, etc) emit bi-directionally utilizing the IPC bridging protocol, dropping locking blockades completely. + +--- + +## Exposed Configuration / Tuning Parameters + +As we integrate deeper with the core business logic, the Node orchestrator and internal services should configure and enforce limits on the underlying C++ concurrent engine. 
Relevant configuration surfaces we need to expose for the primary ecosystem libraries include: + +### 1. Taskflow (`https://github.com/taskflow/taskflow`) +- **`executor_threads` (`num_workers`)**: The size of the `tf::Executor` thread pool. As Gridsearch is heavily I/O network bound (HTTP calls for search/enrichment), setting this significantly higher than `std::thread::hardware_concurrency()` may aggressively improve HTTP ingestion throughput globally. +- **`max_concurrent_jobs_per_user`**: A structural limit dictating how many concurrent gridsearch invocation graphs a single tenant/user can enqueue and run actively to prevent monopolization. +- **`http_concurrency_throttle`**: Task limits enforced upon node scraping or SerpAPI requests per-pipeline graph to avoid widespread `429 Too Many Requests` bans. + +### 2. Moodycamel ConcurrentQueue (`https://github.com/cameron314/concurrentqueue`) +- **`queue_depth_max` / `backpressure`**: Since Moodycamel queue memory allocates dynamically and lock-free to any capacity, we must mandate a hard software ceiling/backpressure limit over the Node-to-C++ IPC layer. If Node blindly streams jobs faster than Taskflow can execute them, the daemon will eventually OOM. +- **`bulk_dequeue_size`**: Exposing tuning parameters for the dispatch thread on how many concurrent IPC tasks should be sucked out of the queue simultaneously. + +### 3. Boost.Asio (`https://github.com/chriskohlhoff/asio`) +- **`ipc_timeout_ms` (Read/Write)**: Mandatory timeouts for the IPC socket layer. If the orchestrator stalls, crashes, or goes silent, Asio must reap the connection and automatically GC the in-flight tasks to prevent Zombie worker processes. +- **`max_ipc_connections`**: Absolute limit on simultaneous orchestration pipelines dialing into a single Worker Pod. +- **`buffer_size_max`**: Soft constraints on async payload allocations so a malformed 200MB JSON frame from Node.js doesn't memory-spike the `asio::read` operations abruptly. 
--- @@ -260,7 +298,15 @@ All packages depend on `logger` and `json` implicitly. ### Integration test (Node.js) - Existing `orchestrator/test-ipc.mjs` validates spawn/lifecycle/ping/job -- TODO: `test-gridsearch.mjs` for full pipeline via IPC +- `orchestrator/test-gridsearch-ipc.mjs` validates full pipeline via IPC (8 event types + job result) +- `orchestrator/test-gridsearch-ipc-uds.mjs` validates high-throughput Unix Domain Sockets mapping, backpressure boundaries, and soft cancellation injections utilizing `action: cancel` frames mid-flight. + +--- + +## IPC Cancellation & Dynamic Job Tuning + +The high-performance UDS daemon now natively tracks and intercepts JSON `action: cancel` frames referencing specific `jobId`s to gracefully exit Taskflow jobs mid-flight. +Dynamic tuning limits, such as memory buffering boundaries or threading capacities, are inherently validated and bound by hard ceilings established inside the `[system]` constraint block of `config/postgres.toml`. --- @@ -268,10 +314,7 @@ All packages depend on `logger` and `json` implicitly. 
| Item | Reason | |------|--------| -| Enrichment (email scraping) | Complex + browser-dependent; keep in Node.js | | SerpAPI response caching | State store managed by orchestrator for now | | Protobuf framing | JSON IPC sufficient for current throughput | | Multi-threaded search | Sequential is fine for SerpAPI rate limits | | GEOS integration | Custom geo is sufficient for grid math | -| IPC gridsearch payload parser | Currently a stub; wire full pipeline from JSON | -| Supabase upsert in CLI | Use postgres package for batch insert | diff --git a/src/main.cpp b/src/main.cpp index be9dee8..8724e95 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,351 +1,265 @@ -#include <iostream> -#include <string> -#include <chrono> - -#include <CLI/CLI.hpp> -#include <toml++/toml.hpp> - -#include "html/html.h" -#include "http/http.h" -#include "ipc/ipc.h" -#include "logger/logger.h" -#include "postgres/postgres.h" -#include "json/json.h" -#include "gadm_reader/gadm_reader.h" -#include "grid/grid.h" -#include "search/search.h" - -#ifndef PROJECT_VERSION -#define PROJECT_VERSION "0.1.0" -#endif - -int main(int argc, char *argv[]) { - CLI::App app{"polymech-cli — Polymech C++ CLI", "polymech-cli"}; - app.set_version_flag("-v,--version", PROJECT_VERSION); - - // Subcommand: parse HTML - std::string html_input; - auto *parse_cmd = app.add_subcommand("parse", "Parse HTML and list elements"); - parse_cmd->add_option("html", html_input, "HTML string to parse")->required(); - - // Subcommand: select from HTML - std::string select_input; - std::string selector; - auto *select_cmd = - app.add_subcommand("select", "CSS-select elements from HTML"); - select_cmd->add_option("html", select_input, "HTML string")->required(); - select_cmd->add_option("selector", selector, "CSS selector")->required(); - - // Subcommand: config — read a TOML file - std::string config_path; - auto *config_cmd = - app.add_subcommand("config", "Read and display a TOML config file"); - config_cmd->add_option("file", 
config_path, "Path to TOML file")->required(); - - // Subcommand: fetch — HTTP GET a URL - std::string fetch_url; - auto *fetch_cmd = - app.add_subcommand("fetch", "HTTP GET a URL and print the response"); - fetch_cmd->add_option("url", fetch_url, "URL to fetch")->required(); - - // Subcommand: json — prettify JSON - std::string json_input; - auto *json_cmd = app.add_subcommand("json", "Prettify a JSON string"); - json_cmd->add_option("input", json_input, "JSON string")->required(); - - // Subcommand: db — connect to Supabase and query - std::string db_config_path = "config/postgres.toml"; - std::string db_table; - int db_limit = 10; - auto *db_cmd = - app.add_subcommand("db", "Connect to Supabase and query a table"); - db_cmd->add_option("-c,--config", db_config_path, "TOML config path") - ->default_val("config/postgres.toml"); - db_cmd->add_option("table", db_table, "Table to query (optional)"); - db_cmd->add_option("-l,--limit", db_limit, "Row limit")->default_val(10); - - // Subcommand: worker — IPC mode (spawned by Node.js orchestrator) - auto *worker_cmd = app.add_subcommand( - "worker", "Run as IPC worker (stdin/stdout length-prefixed JSON)"); - - // Subcommand: gridsearch — Run a full gridsearch pipeline - std::string gs_gid; - int gs_level = 0; - std::string gs_query; - std::string gs_grid_mode = "hex"; - double gs_cell_size = 5.0; - int gs_limit = 20; - int gs_zoom = 13; - std::string gs_sort = "snake"; - std::string gs_config_path = "config/postgres.toml"; - std::string gs_cache_dir = "cache/gadm"; - bool gs_dry_run = false; - auto *gs_cmd = app.add_subcommand("gridsearch", "Run a full gridsearch pipeline (enumerate → grid → search)"); - gs_cmd->add_option("gid", gs_gid, "GADM GID (e.g. ESP.1.1_1)")->required(); - gs_cmd->add_option("query", gs_query, "Search query (e.g. 
'mecanizado cnc')")->required(); - gs_cmd->add_option("-l,--level", gs_level, "Target GADM level")->default_val(0); - gs_cmd->add_option("-m,--mode", gs_grid_mode, "Grid mode: hex|square|admin|centers")->default_val("hex"); - gs_cmd->add_option("-s,--cell-size", gs_cell_size, "Cell size in km")->default_val(5.0); - gs_cmd->add_option("--limit", gs_limit, "Max results per area")->default_val(20); - gs_cmd->add_option("-z,--zoom", gs_zoom, "Google Maps zoom")->default_val(13); - gs_cmd->add_option("--sort", gs_sort, "Path order: snake|zigzag|spiral-out|spiral-in|shortest")->default_val("snake"); - gs_cmd->add_option("-c,--config", gs_config_path, "TOML config path")->default_val("config/postgres.toml"); - gs_cmd->add_option("--cache-dir", gs_cache_dir, "GADM cache directory")->default_val("cache/gadm"); - gs_cmd->add_flag("--dry-run", gs_dry_run, "Generate grid only, skip SerpAPI search"); - - CLI11_PARSE(app, argc, argv); - - // Worker mode uses stderr for logs to keep stdout clean for IPC frames - if (worker_cmd->parsed()) { - logger::init_stderr("polymech-worker"); - } else { - logger::init("polymech-cli"); - } - - // ── worker mode ───────────────────────────────────────────────────────── - if (worker_cmd->parsed()) { - logger::info("Worker mode: listening on stdin"); - - // Send a "ready" message so the orchestrator knows we're alive - ipc::write_message({"0", "ready", "{}"}); - - while (true) { - ipc::Message req; - if (!ipc::read_message(req)) { - logger::info("Worker: stdin closed, exiting"); - break; - } - - logger::debug("Worker recv: type=" + req.type + " id=" + req.id); - - if (req.type == "ping") { - ipc::write_message({req.id, "pong", "{}"}); - - } else if (req.type == "gridsearch") { - // Parse gridsearch job from payload - logger::info("Worker: gridsearch job received"); - // TODO: parse req.payload JSON into gs options, run pipeline, emit progress - ipc::write_message({req.id, "job_result", req.payload}); - - } else if (req.type == "job") { - // 
Stub: echo the payload back as job_result - ipc::write_message({req.id, "job_result", req.payload}); - - } else if (req.type == "shutdown") { - ipc::write_message({req.id, "shutdown_ack", "{}"}); - logger::info("Worker: shutdown requested, exiting"); - break; - - } else { - // Unknown type — respond with error - ipc::write_message( - {req.id, "error", - "{\"message\":\"unknown type: " + req.type + "\"}"}); - } - } - - return 0; - } - - // ── existing subcommands ──────────────────────────────────────────────── - if (parse_cmd->parsed()) { - auto elements = html::parse(html_input); - logger::info("Parsed " + std::to_string(elements.size()) + " elements"); - for (const auto &el : elements) { - std::cout << "<" << el.tag << "> " << el.text << "\n"; - } - return 0; - } - - if (select_cmd->parsed()) { - auto matches = html::select(select_input, selector); - logger::info("Matched " + std::to_string(matches.size()) + " elements"); - for (const auto &m : matches) { - std::cout << m << "\n"; - } - return 0; - } - - if (config_cmd->parsed()) { - try { - auto tbl = toml::parse_file(config_path); - logger::info("Loaded config: " + config_path); - std::cout << tbl << "\n"; - } catch (const toml::parse_error &err) { - logger::error("TOML parse error: " + std::string(err.what())); - return 1; - } - return 0; - } - - if (fetch_cmd->parsed()) { - auto resp = http::get(fetch_url); - logger::info("HTTP " + std::to_string(resp.status_code) + " from " + - fetch_url); - if (json::is_valid(resp.body)) { - std::cout << json::prettify(resp.body) << "\n"; - } else { - std::cout << resp.body << "\n"; - } - return 0; - } - - if (json_cmd->parsed()) { - if (!json::is_valid(json_input)) { - logger::error("Invalid JSON input"); - return 1; - } - std::cout << json::prettify(json_input) << "\n"; - return 0; - } - - if (db_cmd->parsed()) { - try { - auto cfg = toml::parse_file(db_config_path); - postgres::Config pg_cfg; - pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string("")); - 
pg_cfg.supabase_key = - cfg["supabase"]["publishable_key"].value_or(std::string("")); - postgres::init(pg_cfg); - - auto status = postgres::ping(); - logger::info("Supabase: " + status); - - if (!db_table.empty()) { - auto result = postgres::query(db_table, "*", "", db_limit); - if (json::is_valid(result)) { - std::cout << json::prettify(result) << "\n"; - } else { - std::cout << result << "\n"; - } - } - } catch (const std::exception &e) { - logger::error(std::string("db error: ") + e.what()); - return 1; - } - return 0; - } - - // ── gridsearch subcommand ────────────────────────────────────────────── - if (gs_cmd->parsed()) { - logger::info("Gridsearch: gid=" + gs_gid + " query=\"" + gs_query + "\" mode=" + gs_grid_mode); - - auto t0 = std::chrono::steady_clock::now(); - - // 1. Load config - auto cfg = search::load_config(gs_config_path); - if (cfg.serpapi_key.empty() && !gs_dry_run) { - logger::error("No SERPAPI_KEY in " + gs_config_path); - return 1; - } - - // 2. Resolve GADM boundaries - logger::info("Loading boundary for " + gs_gid + " level=" + std::to_string(gs_level)); - auto boundary = gadm::load_boundary(gs_gid, gs_level, gs_cache_dir); - if (!boundary.error.empty()) { - logger::error("Boundary error: " + boundary.error); - return 1; - } - logger::info("Resolved " + std::to_string(boundary.features.size()) + " features"); - - // 3. 
Generate grid - grid::GridOptions grid_opts; - grid_opts.gridMode = gs_grid_mode; - grid_opts.cellSize = gs_cell_size; - grid_opts.cellOverlap = 0; - grid_opts.centroidOverlap = 0; - grid_opts.maxCellsLimit = 10000; - grid_opts.maxElevation = 0; - grid_opts.minDensity = 0; - grid_opts.minGhsPop = 0; - grid_opts.minGhsBuilt = 0; - grid_opts.ghsFilterMode = "OR"; - grid_opts.allowMissingGhs = true; - grid_opts.bypassFilters = false; - grid_opts.pathOrder = gs_sort; - grid_opts.groupByRegion = false; - - auto grid_result = grid::generate(boundary.features, grid_opts); - if (!grid_result.error.empty()) { - logger::error("Grid error: " + grid_result.error); - return 1; - } - logger::info("Grid: " + std::to_string(grid_result.waypoints.size()) + " waypoints, " - + std::to_string(grid_result.skippedCells) + " skipped"); - - if (gs_dry_run) { - // Output waypoints as JSON array - std::cout << "["; - for (size_t i = 0; i < grid_result.waypoints.size(); ++i) { - const auto& wp = grid_result.waypoints[i]; - if (i > 0) std::cout << ","; - char buf[256]; - snprintf(buf, sizeof(buf), - "{\"step\":%d,\"lat\":%.6f,\"lng\":%.6f,\"radius_km\":%.3f}", - wp.step, wp.lat, wp.lng, wp.radius_km); - std::cout << buf; - } - std::cout << "]\n"; - - auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>( - std::chrono::steady_clock::now() - t0).count(); - logger::info("Dry-run complete in " + std::to_string(elapsed) + "ms"); - return 0; - } - - // 4. 
Search each waypoint via SerpAPI - logger::info("Starting SerpAPI search for " + std::to_string(grid_result.waypoints.size()) + " waypoints"); - - int totalResults = 0; - int totalApiCalls = 0; - - std::cout << "{\"waypoints\":["; - for (size_t i = 0; i < grid_result.waypoints.size(); ++i) { - const auto& wp = grid_result.waypoints[i]; - - search::SearchOptions sopts; - sopts.query = gs_query; - sopts.lat = wp.lat; - sopts.lng = wp.lng; - sopts.zoom = gs_zoom; - sopts.limit = gs_limit; - - auto sr = search::search_google_maps(cfg, sopts); - totalResults += static_cast<int>(sr.results.size()); - totalApiCalls += sr.apiCalls; - - if (i > 0) std::cout << ","; - char hdr[256]; - snprintf(hdr, sizeof(hdr), - "{\"step\":%d,\"lat\":%.6f,\"lng\":%.6f,\"results\":%zu,\"apiCalls\":%d}", - wp.step, wp.lat, wp.lng, sr.results.size(), sr.apiCalls); - std::cout << hdr; - - // Log progress - logger::info("Waypoint " + std::to_string(i + 1) + "/" + - std::to_string(grid_result.waypoints.size()) + - " → " + std::to_string(sr.results.size()) + " results"); - } - std::cout << "],"; - - auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>( - std::chrono::steady_clock::now() - t0).count(); - - char summary[512]; - snprintf(summary, sizeof(summary), - "\"summary\":{\"waypoints\":%zu,\"totalResults\":%d," - "\"totalApiCalls\":%d,\"elapsedMs\":%lld}}", - grid_result.waypoints.size(), totalResults, totalApiCalls, - static_cast<long long>(elapsed)); - std::cout << summary << "\n"; - - logger::info("Gridsearch done: " + std::to_string(totalResults) + - " results, " + std::to_string(totalApiCalls) + - " API calls, " + std::to_string(elapsed) + "ms"); - return 0; - } - - // No subcommand — show help - std::cout << app.help() << "\n"; - return 0; -} +#include <iostream> +#include <fstream> +#include <string> +#include <chrono> +#include <set> +#include <ctime> +#include <iomanip> +#include <sstream> +#include <rapidjson/document.h> + +#include <CLI/CLI.hpp> +#include 
<toml++/toml.hpp> + +#include "html/html.h" +#include "http/http.h" +#include "ipc/ipc.h" +#include "logger/logger.h" +#include "postgres/postgres.h" +#include "json/json.h" +#include "gadm_reader/gadm_reader.h" +#include "grid/grid.h" +#include "search/search.h" +#include "enrichers/enrichers.h" +#include "cmd_gridsearch.h" + +#ifndef PROJECT_VERSION +#define PROJECT_VERSION "0.1.0" +#endif + +int main(int argc, char *argv[]) { + CLI::App app{"polymech-cli — Polymech C++ CLI", "polymech-cli"}; + app.set_version_flag("-v,--version", PROJECT_VERSION); + + std::string log_level = "info"; + app.add_option("--log-level", log_level, "Set log level (debug/info/warn/error)")->default_val("info"); + + // Subcommand: parse HTML + std::string html_input; + auto *parse_cmd = app.add_subcommand("parse", "Parse HTML and list elements"); + parse_cmd->add_option("html", html_input, "HTML string to parse")->required(); + + // Subcommand: select from HTML + std::string select_input; + std::string selector; + auto *select_cmd = + app.add_subcommand("select", "CSS-select elements from HTML"); + select_cmd->add_option("html", select_input, "HTML string")->required(); + select_cmd->add_option("selector", selector, "CSS selector")->required(); + + // Subcommand: config — read a TOML file + std::string config_path; + auto *config_cmd = + app.add_subcommand("config", "Read and display a TOML config file"); + config_cmd->add_option("file", config_path, "Path to TOML file")->required(); + + // Subcommand: fetch — HTTP GET a URL + std::string fetch_url; + auto *fetch_cmd = + app.add_subcommand("fetch", "HTTP GET a URL and print the response"); + fetch_cmd->add_option("url", fetch_url, "URL to fetch")->required(); + + // Subcommand: json — prettify JSON + std::string json_input; + auto *json_cmd = app.add_subcommand("json", "Prettify a JSON string"); + json_cmd->add_option("input", json_input, "JSON string")->required(); + + // Subcommand: db — connect to Supabase and query + std::string 
db_config_path = "config/postgres.toml"; + std::string db_table; + int db_limit = 10; + auto *db_cmd = + app.add_subcommand("db", "Connect to Supabase and query a table"); + db_cmd->add_option("-c,--config", db_config_path, "TOML config path") + ->default_val("config/postgres.toml"); + db_cmd->add_option("table", db_table, "Table to query (optional)"); + db_cmd->add_option("-l,--limit", db_limit, "Row limit")->default_val(10); + + // Subcommand: worker — IPC mode (spawned by Node.js orchestrator) + bool daemon_mode = false; + std::string daemon_uid; + std::string worker_config = "config/postgres.toml"; + std::string uds_path; + + auto *worker_cmd = app.add_subcommand( + "worker", "Run as IPC worker (stdin/stdout length-prefixed JSON)"); + worker_cmd->add_flag("--daemon", daemon_mode, "Run persistent daemon pool (tier-based)"); + worker_cmd->add_option("-c,--config", worker_config, "TOML config path")->default_val("config/postgres.toml"); + worker_cmd->add_option("--user-uid", daemon_uid, "User ID to bind this daemon to (needed for place owner)"); + worker_cmd->add_option("--uds", uds_path, "Run over Unix Domain Socket / Named Pipe at the given path"); + + // Subcommand: gridsearch — Run a full gridsearch pipeline + auto* gs_cmd = polymech::setup_cmd_gridsearch(app); + + CLI11_PARSE(app, argc, argv); + + // Worker mode uses stderr for logs to keep stdout clean for IPC frames + if (worker_cmd->parsed()) { + logger::init_stderr("polymech-worker", log_level); + } else { + logger::init("polymech-cli", log_level); + } + + // ── worker mode ───────────────────────────────────────────────────────── + if (worker_cmd->parsed()) { + logger::info("Worker mode: listening on stdin"); + + if (daemon_mode) { + logger::info("Daemon mode enabled. Pre-initializing Postgres pool and binding to User: " + (daemon_uid.empty() ? 
"None" : daemon_uid)); + auto cfg = search::load_config(worker_config); + postgres::Config pcfg; + pcfg.supabase_url = cfg.supabase_url; + pcfg.supabase_key = cfg.supabase_service_key; + postgres::init(pcfg); + } + + if (!uds_path.empty()) { + logger::info("Worker mode: UDS Server active on " + uds_path); + int rc = polymech::run_cmd_gridsearch_uds(uds_path, daemon_mode, daemon_uid); + return rc; + } + + // Send a "ready" message so the orchestrator knows we're alive + ipc::write_message({"0", "ready", "{}"}); + + while (true) { + ipc::Message req; + if (!ipc::read_message(req)) { + logger::info("Worker: stdin closed, exiting"); + break; + } + + logger::debug("Worker recv: type=" + req.type + " id=" + req.id); + + if (req.type == "ping") { + ipc::write_message({req.id, "pong", "{}"}); + + } else if (req.type == "gridsearch") { + logger::info("Worker: gridsearch job received"); + + // Build callbacks that emit IPC events. + // Progress events use id "0" (unmatched → event for orchestrator). + // The final job_result uses the original req.id so the promise resolves. 
+ std::string req_id = req.id; + polymech::GridsearchCallbacks cb; + cb.onEvent = [&req_id](const std::string& type, const std::string& json) { + if (type == "job_result") { + ipc::write_message({req_id, "job_result", json}); + } else { + ipc::write_message({"0", type, json}); + } + }; + + int rc = polymech::run_cmd_gridsearch_ipc(req.payload, req.id, cb, daemon_mode, daemon_uid); + if (rc != 0) { + ipc::write_message({req.id, "error", "{\"message\":\"gridsearch pipeline failed\"}"}); + } + + } else if (req.type == "job") { + // Stub: echo the payload back as job_result + ipc::write_message({req.id, "job_result", req.payload}); + + } else if (req.type == "shutdown") { + ipc::write_message({req.id, "shutdown_ack", "{}"}); + logger::info("Worker: shutdown requested, exiting"); + break; + + } else { + // Unknown type — respond with error + ipc::write_message( + {req.id, "error", + "{\"message\":\"unknown type: " + req.type + "\"}"}); + } + } + + return 0; + } + + // ── existing subcommands ──────────────────────────────────────────────── + if (parse_cmd->parsed()) { + auto elements = html::parse(html_input); + logger::info("Parsed " + std::to_string(elements.size()) + " elements"); + for (const auto &el : elements) { + std::cout << "<" << el.tag << "> " << el.text << "\n"; + } + return 0; + } + + if (select_cmd->parsed()) { + auto matches = html::select(select_input, selector); + logger::info("Matched " + std::to_string(matches.size()) + " elements"); + for (const auto &m : matches) { + std::cout << m << "\n"; + } + return 0; + } + + if (config_cmd->parsed()) { + try { + auto tbl = toml::parse_file(config_path); + logger::info("Loaded config: " + config_path); + std::cout << tbl << "\n"; + } catch (const toml::parse_error &err) { + logger::error("TOML parse error: " + std::string(err.what())); + return 1; + } + return 0; + } + + if (fetch_cmd->parsed()) { + auto resp = http::get(fetch_url); + logger::info("HTTP " + std::to_string(resp.status_code) + " from " + + 
fetch_url); + if (json::is_valid(resp.body)) { + std::cout << json::prettify(resp.body) << "\n"; + } else { + std::cout << resp.body << "\n"; + } + return 0; + } + + if (json_cmd->parsed()) { + if (!json::is_valid(json_input)) { + logger::error("Invalid JSON input"); + return 1; + } + std::cout << json::prettify(json_input) << "\n"; + return 0; + } + + if (db_cmd->parsed()) { + try { + auto cfg = toml::parse_file(db_config_path); + postgres::Config pg_cfg; + pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string("")); + pg_cfg.supabase_key = + cfg["supabase"]["publishable_key"].value_or(std::string("")); + postgres::init(pg_cfg); + + auto status = postgres::ping(); + logger::info("Supabase: " + status); + + if (!db_table.empty()) { + auto result = postgres::query(db_table, "*", "", db_limit); + if (json::is_valid(result)) { + std::cout << json::prettify(result) << "\n"; + } else { + std::cout << result << "\n"; + } + } + } catch (const std::exception &e) { + logger::error(std::string("db error: ") + e.what()); + return 1; + } + return 0; + } + + // ── gridsearch subcommand ────────────────────────────────────────────── + if (gs_cmd->parsed()) { + return polymech::run_cmd_gridsearch(); + } + + // No subcommand — show help + std::cout << app.help() << "\n"; + return 0; +} diff --git a/src/sys_metrics.cpp b/src/sys_metrics.cpp new file mode 100644 index 0000000..e31d255 --- /dev/null +++ b/src/sys_metrics.cpp @@ -0,0 +1,36 @@ +#include "sys_metrics.h" + +#ifdef _WIN32 +#define NOMINMAX +#include <windows.h> +#include <psapi.h> +#pragma comment(lib, "psapi.lib") + +namespace polymech { +size_t get_current_rss_mb() { + PROCESS_MEMORY_COUNTERS info; + if (GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info))) { + return (size_t)(info.WorkingSetSize) / (1024 * 1024); + } + return 0; +} + +uint64_t get_cpu_time_ms() { + FILETIME creationTime, exitTime, kernelTime, userTime; + if (GetProcessTimes(GetCurrentProcess(), &creationTime, &exitTime, 
&kernelTime, &userTime)) { + ULARGE_INTEGER kernel, user; + kernel.LowPart = kernelTime.dwLowDateTime; + kernel.HighPart = kernelTime.dwHighDateTime; + user.LowPart = userTime.dwLowDateTime; + user.HighPart = userTime.dwHighDateTime; + return (kernel.QuadPart + user.QuadPart) / 10000; + } + return 0; +} +} +#else +namespace polymech { +size_t get_current_rss_mb() { return 0; } +uint64_t get_cpu_time_ms() { return 0; } +} +#endif diff --git a/src/sys_metrics.h b/src/sys_metrics.h new file mode 100644 index 0000000..871ab6e --- /dev/null +++ b/src/sys_metrics.h @@ -0,0 +1,8 @@ +#pragma once +#include <cstddef> +#include <cstdint> + +namespace polymech { +size_t get_current_rss_mb(); +uint64_t get_cpu_time_ms(); +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 40bcfad..1254016 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,67 +1,74 @@ -# ── Test targets ────────────────────────────────────────────────────────────── -include(CTest) -include(Catch) - -# Unit tests — one per package -add_executable(test_logger unit/test_logger.cpp) -target_link_libraries(test_logger PRIVATE Catch2::Catch2WithMain logger) -catch_discover_tests(test_logger) - -add_executable(test_html unit/test_html.cpp) -target_link_libraries(test_html PRIVATE Catch2::Catch2WithMain html) -catch_discover_tests(test_html) - -add_executable(test_postgres unit/test_postgres.cpp) -target_link_libraries(test_postgres PRIVATE Catch2::Catch2WithMain postgres) -catch_discover_tests(test_postgres) - -add_executable(test_json unit/test_json.cpp) -target_link_libraries(test_json PRIVATE Catch2::Catch2WithMain json) -catch_discover_tests(test_json) - -add_executable(test_http unit/test_http.cpp) -target_link_libraries(test_http PRIVATE Catch2::Catch2WithMain http) -catch_discover_tests(test_http) - -# Functional test — end-to-end CLI -add_executable(test_functional functional/test_cli.cpp) -target_link_libraries(test_functional PRIVATE Catch2::Catch2WithMain CLI11::CLI11 
# ── Test targets ──────────────────────────────────────────────────────────────
include(CTest)
include(Catch)

# pthread is required on Linux for Catch2 tests
find_package(Threads REQUIRED)

# Helper: declare one Catch2 test executable.
#   polymech_add_test(<target> <source> [IN_SOURCE_DIR] [LIBS <lib>...])
# Every target links Catch2::Catch2WithMain and Threads::Threads; LIBS adds
# the package libraries under test. IN_SOURCE_DIR registers the discovered
# tests with WORKING_DIRECTORY at the repo root — needed by tests that read
# config/ or data files relative to the source tree.
function(polymech_add_test target source)
  cmake_parse_arguments(PAT "IN_SOURCE_DIR" "" "LIBS" ${ARGN})
  add_executable(${target} ${source})
  target_link_libraries(${target} PRIVATE
    Catch2::Catch2WithMain ${PAT_LIBS} Threads::Threads)
  if(PAT_IN_SOURCE_DIR)
    catch_discover_tests(${target} WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
  else()
    catch_discover_tests(${target})
  endif()
endfunction()

# Unit tests — one per package
polymech_add_test(test_logger unit/test_logger.cpp LIBS logger)
polymech_add_test(test_html unit/test_html.cpp LIBS html)
polymech_add_test(test_postgres unit/test_postgres.cpp LIBS postgres)
polymech_add_test(test_json unit/test_json.cpp LIBS json)
polymech_add_test(test_http unit/test_http.cpp LIBS http)

# Functional test — end-to-end CLI
polymech_add_test(test_functional functional/test_cli.cpp
  LIBS CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json)

# E2E test — real Supabase connection (requires config/postgres.toml + network)
polymech_add_test(test_supabase e2e/test_supabase.cpp IN_SOURCE_DIR
  LIBS tomlplusplus::tomlplusplus logger postgres json)

polymech_add_test(test_postgres_live functional/test_postgres_live.cpp IN_SOURCE_DIR
  LIBS postgres search json logger tomlplusplus::tomlplusplus)

polymech_add_test(test_polymech unit/test_polymech.cpp LIBS polymech postgres)

# E2E test — polymech fetch_pages from live Supabase
polymech_add_test(test_polymech_e2e e2e/test_polymech_e2e.cpp IN_SOURCE_DIR
  LIBS tomlplusplus::tomlplusplus logger postgres polymech json)

polymech_add_test(test_ipc unit/test_ipc.cpp LIBS ipc)
polymech_add_test(test_geo unit/test_geo.cpp LIBS geo)
polymech_add_test(test_gadm_reader unit/test_gadm_reader.cpp IN_SOURCE_DIR LIBS gadm_reader)
polymech_add_test(test_grid unit/test_grid.cpp IN_SOURCE_DIR LIBS grid)
polymech_add_test(test_search unit/test_search.cpp IN_SOURCE_DIR LIBS search)
polymech_add_test(test_enrichers unit/test_enrichers.cpp LIBS enrichers)
#include <catch2/catch_test_macros.hpp>
#include "postgres/postgres.h"
#include "search/search.h"
#include "json/json.h"
#include "logger/logger.h"

#include <toml++/toml.h>

// Live CRUD round-trip against the real Supabase instance configured in
// config/postgres.toml. Targets the `grid_search_runs` table with a fixed
// dummy uuid so the test can clean up after itself.
// Skips itself (WARN + return) when the config file or credentials are absent.
// NOTE(review): this test reads `service_key` while the CLI reads
// `publishable_key` — presumably the elevated key is required for
// insert/update/delete; confirm against the Supabase project settings.
// DO NOT RUN UNLESS CONFIGURED.

TEST_CASE("Postgres Live Operations", "[postgres_live]") {
    // Load config — missing/invalid config means skip, not fail.
    std::string supabase_url;
    std::string supabase_key;
    try {
        auto config = toml::parse_file("config/postgres.toml");
        supabase_url = config["supabase"]["url"].value_or("");
        supabase_key = config["supabase"]["service_key"].value_or("");
    } catch (const std::exception &e) {
        WARN("Skipping postgres live tests. Config missing or invalid: " << e.what());
        return;
    }

    if (supabase_url.empty() || supabase_key.empty()) {
        WARN("Skipping postgres live tests. Supabase credentials missing.");
        return;
    }

    postgres::Config pg_cfg;
    pg_cfg.supabase_url = supabase_url;
    pg_cfg.supabase_key = supabase_key;
    postgres::init(pg_cfg);

    REQUIRE(postgres::ping() == "ok");

    // Fixed ids: a recognizable dummy uuid for the row, and an existing user id
    // (grid_search_runs presumably has a user_id FK — verify against schema).
    std::string test_id = "00000000-0000-0000-0000-0000000000cc";
    std::string user_id = "3bb4cfbf-318b-44d3-a9d3-35680e738421";

    SECTION("Insert, Query, Update, Upsert, Delete") {
        // 1. Clean up first just in case a previous run aborted mid-test
        postgres::del("grid_search_runs", "id=eq." + test_id);

        // 2. Insert
        std::string insert_body = R"({"id": ")" + test_id + R"(", "user_id": ")" + user_id + R"(", "run_id": "test_run", "status": "searching", "request": {}})";
        std::string res1 = postgres::insert("grid_search_runs", insert_body);

        // 3. Query — the inserted row must come back as valid JSON
        std::string res2 = postgres::query("grid_search_runs", "*", "id=eq." + test_id);
        WARN("Insert Result: " << res1);
        WARN("Query Result: " << res2);
        REQUIRE(json::is_valid(res2));
        REQUIRE(res2.find("test_run") != std::string::npos);

        // 4. Update a single column via a PostgREST filter
        std::string update_body = R"({"status": "enriching"})";
        std::string res3 = postgres::update("grid_search_runs", update_body, "id=eq." + test_id);
        REQUIRE(json::is_valid(res3));
        REQUIRE(res3.find("error") == std::string::npos);

        // 5. Upsert on the `id` conflict column — replaces the row
        std::string upsert_body = R"({"id": ")" + test_id + R"(", "user_id": ")" + user_id + R"(", "run_id": "upsert_run", "status": "complete", "request": {}})";
        std::string res4 = postgres::upsert("grid_search_runs", upsert_body, "id");
        REQUIRE(res4.find("error") == std::string::npos);

        // Query again to verify upsert took effect
        std::string res5 = postgres::query("grid_search_runs", "*", "id=eq." + test_id);
        REQUIRE(res5.find("upsert_run") != std::string::npos);

        // 6. Delete the test row
        std::string res6 = postgres::del("grid_search_runs", "id=eq." + test_id);
        REQUIRE(json::is_valid(res6));

        // Verify deleted — PostgREST returns an empty JSON array
        std::string res7 = postgres::query("grid_search_runs", "*", "id=eq." + test_id);
        REQUIRE(res7 == "[]");
    }
}
#include <catch2/catch_test_macros.hpp>
#include "enrichers/enrichers.h"

using namespace enrichers;

// ── is_likely_email ─────────────────────────────────────────────────────────

TEST_CASE("is_likely_email: valid emails", "[enrichers]") {
    CHECK(is_likely_email("info@example.com"));
    CHECK(is_likely_email("john.doe@company.co.uk"));
    CHECK(is_likely_email("contact@recycling-firm.de"));
    CHECK(is_likely_email("hello@my-domain.org"));
}

TEST_CASE("is_likely_email: rejects non-emails", "[enrichers]") {
    CHECK_FALSE(is_likely_email(""));
    CHECK_FALSE(is_likely_email("not-an-email"));
    CHECK_FALSE(is_likely_email("@no-user.com"));
    CHECK_FALSE(is_likely_email("user@"));
}

// Scraped pages often contain asset filenames that regex-match as emails
// (e.g. "logo@2x.png" patterns); these must be filtered by extension.
TEST_CASE("is_likely_email: rejects asset extensions", "[enrichers]") {
    CHECK_FALSE(is_likely_email("logo@site.png"));
    CHECK_FALSE(is_likely_email("icon@site.svg"));
    CHECK_FALSE(is_likely_email("style@site.css"));
    CHECK_FALSE(is_likely_email("script@site.js"));
    CHECK_FALSE(is_likely_email("photo@site.jpg"));
    CHECK_FALSE(is_likely_email("photo@site.webp"));
}

// NOTE(review): "info@example.com" is accepted above while "user@example.com"
// is rejected here — so the placeholder filter appears to key on generic
// local-parts (user/test/your/email/name) and hash-like local-parts, not on
// the domain alone. Confirm against the is_likely_email implementation.
TEST_CASE("is_likely_email: rejects placeholder/hash patterns", "[enrichers]") {
    CHECK_FALSE(is_likely_email("user@example.com"));
    CHECK_FALSE(is_likely_email("test@test.com"));
    CHECK_FALSE(is_likely_email("a3f2b@hash.com"));
    CHECK_FALSE(is_likely_email("your@email.com"));
    CHECK_FALSE(is_likely_email("email@email.com"));
    CHECK_FALSE(is_likely_email("name@domain.com"));
}

// ── extract_emails ──────────────────────────────────────────────────────────

TEST_CASE("extract_emails: finds emails in text", "[enrichers]") {
    auto emails = extract_emails("Contact us at info@example.org or sales@company.com");
    CHECK(emails.size() >= 2);

    bool found_info = false, found_sales = false;
    for (auto& e : emails) {
        if (e == "info@example.org") found_info = true;
        if (e == "sales@company.com") found_sales = true;
    }
    CHECK(found_info);
    CHECK(found_sales);
}

TEST_CASE("extract_emails: deduplicates", "[enrichers]") {
    auto emails = extract_emails("info@acme.org info@acme.org info@acme.org");
    CHECK(emails.size() == 1);
}

TEST_CASE("extract_emails: empty text returns empty", "[enrichers]") {
    auto emails = extract_emails("");
    CHECK(emails.empty());
}

// extract_emails must apply the same asset-extension filter as is_likely_email.
TEST_CASE("extract_emails: filters out asset emails", "[enrichers]") {
    auto emails = extract_emails("logo@site.png info@real-company.de");
    CHECK(emails.size() == 1);
    CHECK(emails[0] == "info@real-company.de");
}

// ── resolve_url ─────────────────────────────────────────────────────────────

TEST_CASE("resolve_url: absolute stays absolute", "[enrichers]") {
    CHECK(resolve_url("https://example.com", "https://other.com/page") == "https://other.com/page");
}

TEST_CASE("resolve_url: relative path", "[enrichers]") {
    auto r = resolve_url("https://example.com/page", "/contact");
    CHECK(r == "https://example.com/contact");
}

// "//host/path" keeps the base URL's scheme (RFC 3986 network-path reference).
TEST_CASE("resolve_url: protocol-relative", "[enrichers]") {
    auto r = resolve_url("https://example.com", "//other.com/foo");
    CHECK(r == "https://other.com/foo");
}

// A bare relative path resolves against the base's directory, dropping the
// last path segment ("page").
TEST_CASE("resolve_url: relative without slash", "[enrichers]") {
    auto r = resolve_url("https://example.com/dir/page", "about.html");
    CHECK(r == "https://example.com/dir/about.html");
}

// ── status_string ───────────────────────────────────────────────────────────

TEST_CASE("status_string: covers all statuses", "[enrichers]") {
    CHECK(std::string(status_string(EnrichStatus::OK)) == "OK");
    CHECK(std::string(status_string(EnrichStatus::NO_EMAIL)) == "NO_EMAIL");
    CHECK(std::string(status_string(EnrichStatus::META_TIMEOUT)) == "META_TIMEOUT");
    CHECK(std::string(status_string(EnrichStatus::EMAIL_TIMEOUT)) == "EMAIL_TIMEOUT");
    CHECK(std::string(status_string(EnrichStatus::FETCH_ERROR)) == "FETCH_ERROR");
    CHECK(std::string(status_string(EnrichStatus::NO_PAGES)) == "NO_PAGES");
    CHECK(std::string(status_string(EnrichStatus::ERROR)) == "ERROR");
}

// ── EnrichConfig defaults ───────────────────────────────────────────────────

// Pins the documented default budgets so silent changes fail the build.
TEST_CASE("EnrichConfig: default values", "[enrichers]") {
    EnrichConfig cfg;
    CHECK(cfg.meta_timeout_ms == 20000);
    CHECK(cfg.email_timeout_ms == 30000);
    CHECK(cfg.email_page_timeout_ms == 10000);
    CHECK(cfg.email_max_pages == 8);
    CHECK(cfg.email_abort_after == 1);
    CHECK_FALSE(cfg.contact_patterns.empty());
    CHECK_FALSE(cfg.probe_paths.empty());
}
// ═══════════════════════════════════════════════════════
// html2md — conversion & large-chunk robustness
// ═══════════════════════════════════════════════════════

TEST_CASE("html2md basic conversion", "[html2md]") {
    std::string md = html2md::Convert("<h1>Hello</h1><p>World</p>");
    CHECK(md.find("Hello") != std::string::npos);
    CHECK(md.find("World") != std::string::npos);
}

TEST_CASE("html2md empty input", "[html2md]") {
    std::string md = html2md::Convert("");
    CHECK(md.empty());
}

TEST_CASE("html2md whitespace-only input", "[html2md]") {
    std::string md = html2md::Convert("   \n\t  ");
    // Should return empty or whitespace — must not crash.
    // 20 is a loose upper bound: any "real" output would be longer.
    CHECK(md.size() < 20);
}

// ---------- large payload stress tests ----------

// Builds `count` numbered <p> elements; the last one is asserted on below to
// prove the converter processed the whole document, not just a prefix.
static std::string make_paragraphs(size_t count) {
    std::string html;
    html.reserve(count * 40);
    for (size_t i = 0; i < count; ++i) {
        html += "<p>Paragraph number ";
        html += std::to_string(i);
        html += " with some filler text.</p>\n";
    }
    return html;
}

static std::string make_large_html(size_t target_bytes) {
    // Build a chunk of roughly target_bytes by repeating a row
    const std::string row = "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor.</p>\n";
    std::string html;
    html.reserve(target_bytes + 256);
    html += "<html><body>";
    while (html.size() < target_bytes) {
        html += row;
    }
    html += "</body></html>";
    return html;
}

TEST_CASE("html2md handles 64KB HTML", "[html2md][large]") {
    auto html = make_large_html(64 * 1024);
    REQUIRE(html.size() >= 64 * 1024);
    std::string md = html2md::Convert(html);
    CHECK(!md.empty());
    CHECK(md.find("Lorem ipsum") != std::string::npos);
}

TEST_CASE("html2md handles 512KB HTML", "[html2md][large]") {
    auto html = make_large_html(512 * 1024);
    std::string md = html2md::Convert(html);
    CHECK(!md.empty());
}

TEST_CASE("html2md handles 1MB HTML", "[html2md][large]") {
    auto html = make_large_html(1024 * 1024);
    std::string md = html2md::Convert(html);
    CHECK(!md.empty());
}

TEST_CASE("html2md 10K paragraphs", "[html2md][large]") {
    auto html = make_paragraphs(10000);
    std::string md = html2md::Convert(html);
    CHECK(!md.empty());
    // Last paragraph present => whole document was converted
    CHECK(md.find("Paragraph number 9999") != std::string::npos);
}

// ---------- deeply nested HTML ----------

TEST_CASE("html2md deeply nested divs (500 levels)", "[html2md][large]") {
    const int depth = 500;
    std::string html;
    for (int i = 0; i < depth; ++i) html += "<div>";
    html += "deep content";
    for (int i = 0; i < depth; ++i) html += "</div>";

    std::string md = html2md::Convert(html);
    CHECK(md.find("deep content") != std::string::npos);
}

// ---------- wide table ----------

TEST_CASE("html2md wide table (200 columns)", "[html2md][large]") {
    std::string html = "<table><tr>";
    for (int i = 0; i < 200; ++i) {
        html += "<td>C" + std::to_string(i) + "</td>";
    }
    html += "</tr></table>";

    std::string md = html2md::Convert(html);
    CHECK(!md.empty());
    CHECK(md.find("C0") != std::string::npos);
    CHECK(md.find("C199") != std::string::npos);
}

// ---------- concurrent conversion ----------

// Each thread converts its own copy of a shared input; thread-safety of
// html2md::Convert itself is what is under test here.
TEST_CASE("html2md concurrent conversions are thread-safe", "[html2md][threads]") {
    const int num_threads = 8;
    const std::string html = make_large_html(32 * 1024); // 32KB each
    std::vector<std::string> results(num_threads);
    std::vector<std::thread> threads;

    for (int i = 0; i < num_threads; ++i) {
        threads.emplace_back([&results, &html, i]() {
            results[i] = html2md::Convert(html);
        });
    }

    for (auto &t : threads) t.join();

    for (int i = 0; i < num_threads; ++i) {
        CHECK(!results[i].empty());
        CHECK(results[i].find("Lorem ipsum") != std::string::npos);
    }
}

// ═══════════════════════════════════════════════════════
// html2md — malformed / faulty HTML robustness
// ═══════════════════════════════════════════════════════

TEST_CASE("html2md unclosed tags", "[html2md][faulty]") {
    std::string md = html2md::Convert("<p>Hello <b>bold <i>italic");
    CHECK(md.find("Hello") != std::string::npos);
    CHECK(md.find("bold") != std::string::npos);
}

TEST_CASE("html2md mismatched/overlapping tags", "[html2md][faulty]") {
    std::string md = html2md::Convert("<b>bold <i>both</b> italic</i>");
    CHECK(md.find("bold") != std::string::npos);
}

TEST_CASE("html2md broken attributes", "[html2md][faulty]") {
    std::string md = html2md::Convert(R"(<a href="http://example.com class="bad>Link</a>)");
    // must not crash — output may vary
    (void)md;
}

TEST_CASE("html2md bare text (no tags)", "[html2md][faulty]") {
    std::string md = html2md::Convert("Just plain text, no HTML at all.");
    CHECK(md.find("Just plain text") != std::string::npos);
}

TEST_CASE("html2md random binary noise", "[html2md][faulty]") {
    // Full 0-255 byte range — previously crashed on MSVC debug builds due to
    // signed char passed to isspace() without unsigned cast. Fixed in html2md.cpp.
    std::string noise(4096, '\0');
    for (size_t i = 0; i < noise.size(); ++i) {
        // deterministic pseudo-random byte pattern covering all 256 values
        noise[i] = static_cast<char>((i * 131 + 17) % 256);
    }
    std::string md = html2md::Convert(noise);
    // No assertion on content — just survival
    (void)md;
}

TEST_CASE("html2md truncated document", "[html2md][faulty]") {
    std::string html = "<html><body><table><tr><td>Cell1</td><td>Cell2";
    // abruptly ends mid-table
    std::string md = html2md::Convert(html);
    CHECK(md.find("Cell1") != std::string::npos);
}

TEST_CASE("html2md script and style tags", "[html2md][faulty]") {
    std::string html = R"(
        <p>Before</p>
        <script>alert('xss');</script>
        <style>.foo { color: red; }</style>
        <p>After</p>
    )";
    std::string md = html2md::Convert(html);
    CHECK(md.find("Before") != std::string::npos);
    CHECK(md.find("After") != std::string::npos);
    // script/style content should be stripped
    CHECK(md.find("alert") == std::string::npos);
}

TEST_CASE("html2md null bytes in input", "[html2md][faulty]") {
    std::string html = "<p>Hello";
    html += '\0';
    html += "World</p>";
    // html2md may stop at null or handle it — must not crash
    std::string md = html2md::Convert(html);
    (void)md;
}

// ═══════════════════════════════════════════════════════
// html2md — web scraper real-world edge cases
// ═══════════════════════════════════════════════════════

TEST_CASE("html2md UTF-8 multibyte (CJK, Arabic, emoji)", "[html2md][scraper]") {
    std::string html =
        "<h1>日本語テスト</h1>"
        "<p>مرحبا بالعالم</p>"
        "<p>Ñoño señor über straße</p>"
        "<p>Emoji: 🚀🔥💀👻 and 中文混合English</p>";
    std::string md = html2md::Convert(html);
    CHECK(md.find("Emoji") != std::string::npos);
}

TEST_CASE("html2md BOM prefix", "[html2md][scraper]") {
    // UTF-8 BOM (EF BB BF) prepended — common from Windows-origin pages
    std::string html = "\xEF\xBB\xBF<html><body><p>Content after BOM</p></body></html>";
    std::string md = html2md::Convert(html);
    CHECK(md.find("Content after BOM") != std::string::npos);
}

TEST_CASE("html2md entity soup", "[html2md][scraper]") {
    // Mix of named, numeric, and deliberately invalid entities
    std::string html =
        "<p>Price: &euro;10 &amp; &lt;20&gt; items</p>"
        "<p>&nbsp;&nbsp;indented &mdash; dashes &ndash; more</p>"
        "<p>Bad entity: &notreal; and &#xD83D; and &#xZZZZ;</p>";
    std::string md = html2md::Convert(html);
    CHECK(md.find("Price") != std::string::npos);
}

TEST_CASE("html2md CDATA and comments", "[html2md][scraper]") {
    std::string html =
        "<p>Before</p>"
        "<!-- <script>alert('xss')</script> -->"
        "<![CDATA[This is raw <data> & stuff]]>"
        "<!-- multi\nline\ncomment -->"
        "<p>After</p>";
    std::string md = html2md::Convert(html);
    CHECK(md.find("Before") != std::string::npos);
    CHECK(md.find("After") != std::string::npos);
}

TEST_CASE("html2md deeply nested inline tags", "[html2md][scraper]") {
    // Real pages sometimes have insanely nested spans from WYSIWYG editors
    std::string html = "<p>";
    for (int i = 0; i < 100; ++i) html += "<span><b><i><em><strong>";
    html += "deep text";
    for (int i = 0; i < 100; ++i) html += "</strong></em></i></b></span>";
    html += "</p>";
    std::string md = html2md::Convert(html);
    // 100 layers of bold/italic produce tons of ** and * markers —
    // just verify no crash and non-empty output
    CHECK(!md.empty());
}

TEST_CASE("html2md huge single line (no newlines)", "[html2md][scraper]") {
    // Minified HTML — one giant line, 200KB
    std::string html;
    html.reserve(200 * 1024);
    html += "<html><body>";
    for (int i = 0; i < 5000; ++i) {
        html += "<div><span class=\"c" + std::to_string(i) + "\">item" +
                std::to_string(i) + "</span></div>";
    }
    html += "</body></html>";
    std::string md = html2md::Convert(html);
    CHECK(md.find("item0") != std::string::npos);
    CHECK(md.find("item4999") != std::string::npos);
}

TEST_CASE("html2md data URI in img src", "[html2md][scraper]") {
    std::string html =
        "<p>Before image</p>"
        "<img src=\"data:image/png;base64,iVBORw0KGgoAAAANSU"
        "hEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwAD"
        "hgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"pixel\">"
        "<p>After image</p>";
    std::string md = html2md::Convert(html);
    CHECK(md.find("Before image") != std::string::npos);
    CHECK(md.find("After image") != std::string::npos);
}

TEST_CASE("html2md mixed Latin-1 and UTF-8 bytes", "[html2md][scraper]") {
    // Latin-1 encoded chars (0x80-0xFF) that are NOT valid UTF-8
    // Common when scraping pages with wrong charset declaration
    std::string html = "<p>caf\xe9 na\xefve r\xe9sum\xe9</p>"; // café naïve résumé in Latin-1
    std::string md = html2md::Convert(html);
    CHECK(md.find("caf") != std::string::npos);
}

TEST_CASE("html2md HTML with HTTP headers prepended", "[html2md][scraper]") {
    // Sometimes raw HTTP responses leak into scraper output
    std::string html =
        "HTTP/1.1 200 OK\r\n"
        "Content-Type: text/html; charset=utf-8\r\n"
        "Content-Length: 42\r\n"
        "\r\n"
        "<html><body><p>Real content</p></body></html>";
    std::string md = html2md::Convert(html);
    CHECK(md.find("Real content") != std::string::npos);
}

TEST_CASE("html2md Google Maps / Places markup soup", "[html2md][scraper]") {
    // Simplified version of real Google Places HTML with data attributes,
    // inline styles, aria labels, and deeply nested structure
    std::string html = R"(
        <div class="section-result" data-result-index="0" jsaction="pane.resultSection.click">
            <div class="section-result-title">
                <span><span>Müller's Büro &amp; Café</span></span>
            </div>
            <div class="section-result-details">
                <span class="section-result-location">Königstraße 42, München</span>
                <span class="section-result-rating">
                    <span aria-label="4.5 stars">★★★★☆</span>
                    <span>(1,234)</span>
                </span>
            </div>
            <div style="display:none" aria-hidden="true">
                <script type="application/ld+json">{"@type":"LocalBusiness","name":"test"}</script>
            </div>
        </div>
    )";
    std::string md = html2md::Convert(html);
    CHECK(md.find("Café") != std::string::npos);
    CHECK(md.find("München") != std::string::npos);
}

// ═══════════════════════════════════════════════════════
// html2md — output amplification & pathological input
// ═══════════════════════════════════════════════════════

TEST_CASE("html2md nested blockquotes (output amplification)", "[html2md][amplification]") {
    // Each <blockquote> nesting adds a ">" prefix per line in markdown.
    // 50 deep = each line gets 50 ">" prefixes — tests that output doesn't
    // explode exponentially.
    std::string html;
    for (int i = 0; i < 50; ++i) html += "<blockquote>";
    html += "<p>deep quote</p>";
    for (int i = 0; i < 50; ++i) html += "</blockquote>";
    auto md = html2md::Convert(html);
    // Output size should be reasonable — not exponential.
    // 50 levels * "> " prefix = ~100 chars + text < 1 KB
    CHECK(md.size() < 4096);
    CHECK(!md.empty());
}

TEST_CASE("html2md very long attribute value", "[html2md][amplification]") {
    // 1 MB href — tests ExtractAttributeFromTagLeftOf won't choke
    std::string long_url(1024 * 1024, 'A');
    std::string html = "<a href=\"" + long_url + "\">Click</a>";
    auto md = html2md::Convert(html);
    // Must survive without crash
    CHECK(!md.empty());
}

TEST_CASE("html2md 10K unclosed p tags", "[html2md][amplification]") {
    // Each unclosed <p> generates "\n\n" — tests that md_ doesn't
    // grow beyond reasonable bounds
    std::string html;
    html.reserve(50000);
    for (int i = 0; i < 10000; ++i) html += "<p>text";
    auto md = html2md::Convert(html);
    CHECK(!md.empty());
    // Should contain the text, output gets big but not catastrophic
    CHECK(md.find("text") != std::string::npos);
}

TEST_CASE("html2md output-to-input ratio check", "[html2md][amplification]") {
    // Verify that for normal, representative HTML, output is smaller
    // than input (html2md strips tags, so markdown should be leaner)
    std::string html;
    html.reserve(100 * 1024);
    html += "<html><body>";
    for (int i = 0; i < 1000; ++i) {
        html += "<div class=\"wrapper\"><p class=\"content\">Paragraph " +
                std::to_string(i) + " with some text.</p></div>\n";
    }
    html += "</body></html>";
    auto md = html2md::Convert(html);
    // Markdown should be smaller than HTML (we stripped all the divs/classes)
    CHECK(md.size() < html.size());
    CHECK(md.size() > 0);
}

TEST_CASE("html2md pathological repeated angle brackets", "[html2md][amplification]") {
    // Incomplete tags: lots of "<" without closing ">" — stresses tag parser
    std::string html(8192, '<');
    auto md = html2md::Convert(html);
    // Must not infinite-loop — just survive
    (void)md;
}