#include #include #include #include #include "html/html.h" #include "html/html2md.h" // ═══════════════════════════════════════════════════════ // html::parse / html::select (existing) // ═══════════════════════════════════════════════════════ TEST_CASE("html::parse returns elements from valid HTML", "[html]") { auto elements = html::parse("

Title

Body

"); REQUIRE(elements.size() >= 2); bool found_h1 = false; bool found_p = false; for (const auto &el : elements) { if (el.tag == "h1" && el.text == "Title") found_h1 = true; if (el.tag == "p" && el.text == "Body") found_p = true; } CHECK(found_h1); CHECK(found_p); } TEST_CASE("html::parse returns empty for empty input", "[html]") { auto elements = html::parse(""); REQUIRE(elements.empty()); } TEST_CASE("html::parse handles nested elements", "[html]") { auto elements = html::parse("

Nested

"); bool found_span = false; for (const auto &el : elements) { if (el.tag == "span" && el.text == "Nested") { found_span = true; } } CHECK(found_span); } TEST_CASE("html::select finds elements by CSS selector", "[html][select]") { auto matches = html::select("

", "li"); REQUIRE(matches.size() == 3); CHECK(matches[0] == "A"); CHECK(matches[1] == "B"); CHECK(matches[2] == "C"); } TEST_CASE("html::select returns empty for no matches", "[html][select]") { auto matches = html::select("

Hello

", "h1"); REQUIRE(matches.empty()); } TEST_CASE("html::select works with class selector", "[html][select]") { auto matches = html::select( R"(

)", ".a"); REQUIRE(matches.size() == 1); CHECK(matches[0] == "X"); } // ═══════════════════════════════════════════════════════ // html2md — conversion & large-chunk robustness // ═══════════════════════════════════════════════════════ TEST_CASE("html2md basic conversion", "[html2md]") { std::string md = html2md::Convert("

Hello

World

"); CHECK(md.find("Hello") != std::string::npos); CHECK(md.find("World") != std::string::npos); } TEST_CASE("html2md empty input", "[html2md]") { std::string md = html2md::Convert(""); CHECK(md.empty()); } TEST_CASE("html2md whitespace-only input", "[html2md]") { std::string md = html2md::Convert(" \n\t "); // Should return empty or whitespace — must not crash CHECK(md.size() < 20); } // ---------- large payload stress tests ---------- static std::string make_paragraphs(size_t count) { std::string html; html.reserve(count * 40); for (size_t i = 0; i < count; ++i) { html += "

Paragraph number "; html += std::to_string(i); html += " with some filler text.

\n"; } return html; } static std::string make_large_html(size_t target_bytes) { // Build a chunk of roughly target_bytes by repeating a row const std::string row = "

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor.

\n"; std::string html; html.reserve(target_bytes + 256); html += ""; while (html.size() < target_bytes) { html += row; } html += ""; return html; } TEST_CASE("html2md handles 64KB HTML", "[html2md][large]") { auto html = make_large_html(64 * 1024); REQUIRE(html.size() >= 64 * 1024); std::string md = html2md::Convert(html); CHECK(!md.empty()); CHECK(md.find("Lorem ipsum") != std::string::npos); } TEST_CASE("html2md handles 512KB HTML", "[html2md][large]") { auto html = make_large_html(512 * 1024); std::string md = html2md::Convert(html); CHECK(!md.empty()); } TEST_CASE("html2md handles 1MB HTML", "[html2md][large]") { auto html = make_large_html(1024 * 1024); std::string md = html2md::Convert(html); CHECK(!md.empty()); } TEST_CASE("html2md 10K paragraphs", "[html2md][large]") { auto html = make_paragraphs(10000); std::string md = html2md::Convert(html); CHECK(!md.empty()); CHECK(md.find("Paragraph number 9999") != std::string::npos); } // ---------- deeply nested HTML ---------- TEST_CASE("html2md deeply nested divs (500 levels)", "[html2md][large]") { const int depth = 500; std::string html; for (int i = 0; i < depth; ++i) html += "

"; html += "deep content"; for (int i = 0; i < depth; ++i) html += "

"; std::string md = html2md::Convert(html); CHECK(md.find("deep content") != std::string::npos); } // ---------- wide table ---------- TEST_CASE("html2md wide table (200 columns)", "[html2md][large]") { std::string html = ""; for (int i = 0; i < 200; ++i) { html += ""; } html += "

C" + std::to_string(i) + "

"; std::string md = html2md::Convert(html); CHECK(!md.empty()); CHECK(md.find("C0") != std::string::npos); CHECK(md.find("C199") != std::string::npos); } // ---------- concurrent conversion ---------- TEST_CASE("html2md concurrent conversions are thread-safe", "[html2md][threads]") { const int num_threads = 8; const std::string html = make_large_html(32 * 1024); // 32KB each std::vector results(num_threads); std::vector threads; for (int i = 0; i < num_threads; ++i) { threads.emplace_back([&results, &html, i]() { results[i] = html2md::Convert(html); }); } for (auto &t : threads) t.join(); for (int i = 0; i < num_threads; ++i) { CHECK(!results[i].empty()); CHECK(results[i].find("Lorem ipsum") != std::string::npos); } } // ═══════════════════════════════════════════════════════ // html2md — malformed / faulty HTML robustness // ═══════════════════════════════════════════════════════ TEST_CASE("html2md unclosed tags", "[html2md][faulty]") { std::string md = html2md::Convert("

Hello bold italic"); CHECK(md.find("Hello") != std::string::npos); CHECK(md.find("bold") != std::string::npos); } TEST_CASE("html2md mismatched/overlapping tags", "[html2md][faulty]") { std::string md = html2md::Convert("bold both italic"); CHECK(md.find("bold") != std::string::npos); } TEST_CASE("html2md broken attributes", "[html2md][faulty]") { std::string md = html2md::Convert(R"(Link)"); // must not crash — output may vary (void)md; } TEST_CASE("html2md bare text (no tags)", "[html2md][faulty]") { std::string md = html2md::Convert("Just plain text, no HTML at all."); CHECK(md.find("Just plain text") != std::string::npos); } TEST_CASE("html2md random binary noise", "[html2md][faulty]") { // Full 0-255 byte range — previously crashed on MSVC debug builds due to // signed char passed to isspace() without unsigned cast. Fixed in html2md.cpp. std::string noise(4096, '\0'); for (size_t i = 0; i < noise.size(); ++i) { noise[i] = static_cast((i * 131 + 17) % 256); } std::string md = html2md::Convert(noise); // No assertion on content — just survival (void)md; } TEST_CASE("html2md truncated document", "[html2md][faulty]") { std::string html = "
Cell1 Cell2"; // abruptly ends mid-table std::string md = html2md::Convert(html); CHECK(md.find("Cell1") != std::string::npos); } TEST_CASE("html2md script and style tags", "[html2md][faulty]") { std::string html = R"(
Before

After
)"; std::string md = html2md::Convert(html); CHECK(md.find("Before") != std::string::npos); CHECK(md.find("After") != std::string::npos); // script/style content should be stripped CHECK(md.find("alert") == std::string::npos); } TEST_CASE("html2md null bytes in input", "[html2md][faulty]") { std::string html = "
Hello"; html += '\0'; html += "World
"; // html2md may stop at null or handle it — must not crash std::string md = html2md::Convert(html); (void)md; } // ═══════════════════════════════════════════════════════ // html2md — web scraper real-world edge cases // ═══════════════════════════════════════════════════════ TEST_CASE("html2md UTF-8 multibyte (CJK, Arabic, emoji)", "[html2md][scraper]") { std::string html = "
日本語テスト
" "
مرحبا بالعالم
" "
Ñoño señor über straße
" "
Emoji: 🚀🔥💀👻 and 中文混合English
"; std::string md = html2md::Convert(html); CHECK(md.find("Emoji") != std::string::npos); } TEST_CASE("html2md BOM prefix", "[html2md][scraper]") { // UTF-8 BOM (EF BB BF) prepended — common from Windows-origin pages std::string html = "\xEF\xBB\xBF
Content after BOM
"; std::string md = html2md::Convert(html); CHECK(md.find("Content after BOM") != std::string::npos); } TEST_CASE("html2md entity soup", "[html2md][scraper]") { std::string html = "
Price: €10 & <20> items
" "
indented — dashes – more
" "
Bad entity: ¬real; and 󴈿 and &#xZZZZ;
"; std::string md = html2md::Convert(html); CHECK(md.find("Price") != std::string::npos); } TEST_CASE("html2md CDATA and comments", "[html2md][scraper]") { std::string html = "
Before
" "" " & stuff]]>" "" "
After
"; std::string md = html2md::Convert(html); CHECK(md.find("Before") != std::string::npos); CHECK(md.find("After") != std::string::npos); } TEST_CASE("html2md deeply nested inline tags", "[html2md][scraper]") { // Real pages sometimes have insanely nested spans from WYSIWYG editors std::string html = "
"; for (int i = 0; i < 100; ++i) html += ""; html += "deep text"; for (int i = 0; i < 100; ++i) html += ""; html += "
"; std::string md = html2md::Convert(html); // 100 layers of bold/italic produce tons of ** and * markers — // just verify no crash and non-empty output CHECK(!md.empty()); } TEST_CASE("html2md huge single line (no newlines)", "[html2md][scraper]") { // Minified HTML — one giant line, 200KB std::string html; html.reserve(200 * 1024); html += ""; for (int i = 0; i < 5000; ++i) { html += "
item" + std::to_string(i) + "
"; } html += ""; std::string md = html2md::Convert(html); CHECK(md.find("item0") != std::string::npos); CHECK(md.find("item4999") != std::string::npos); } TEST_CASE("html2md data URI in img src", "[html2md][scraper]") { std::string html = "
Before image
" " $\"pixel\"$ " "
After image
"; std::string md = html2md::Convert(html); CHECK(md.find("Before image") != std::string::npos); CHECK(md.find("After image") != std::string::npos); } TEST_CASE("html2md mixed Latin-1 and UTF-8 bytes", "[html2md][scraper]") { // Latin-1 encoded chars (0x80-0xFF) that are NOT valid UTF-8 // Common when scraping pages with wrong charset declaration std::string html = "
caf\xe9 na\xefve r\xe9sum\xe9
"; // café naïve résumé in Latin-1 std::string md = html2md::Convert(html); CHECK(md.find("caf") != std::string::npos); } TEST_CASE("html2md HTML with HTTP headers prepended", "[html2md][scraper]") { // Sometimes raw HTTP responses leak into scraper output std::string html = "HTTP/1.1 200 OK\r\n" "Content-Type: text/html; charset=utf-8\r\n" "Content-Length: 42\r\n" "\r\n" "
Real content
"; std::string md = html2md::Convert(html); CHECK(md.find("Real content") != std::string::npos); } TEST_CASE("html2md Google Maps / Places markup soup", "[html2md][scraper]") { // Simplified version of real Google Places HTML with data attributes, // inline styles, aria labels, and deeply nested structure std::string html = R"(

Müller's Büro & Café

Königstraße 42, München ★★★★☆ (1,234)

)"; std::string md = html2md::Convert(html); CHECK(md.find("Café") != std::string::npos); CHECK(md.find("München") != std::string::npos); } // ═══════════════════════════════════════════════════════ // html2md — output amplification & pathological input // ═══════════════════════════════════════════════════════ TEST_CASE("html2md nested blockquotes (output amplification)", "[html2md][amplification]") { // Each
nesting adds a ">" prefix per line in markdown. // 50 deep = each line gets 50 ">" prefixes — tests that output doesn't // explode exponentially. std::string html; for (int i = 0; i < 50; ++i) html += "
"; html += "
deep quote
"; for (int i = 0; i < 50; ++i) html += "
"; auto md = html2md::Convert(html); // Output size should be reasonable — not exponential. // 50 levels * "> " prefix = ~100 chars + text < 1 KB CHECK(md.size() < 4096); CHECK(!md.empty()); } TEST_CASE("html2md very long attribute value", "[html2md][amplification]") { // 1 MB href — tests ExtractAttributeFromTagLeftOf won't choke std::string long_url(1024 * 1024, 'A'); std::string html = "Click"; auto md = html2md::Convert(html); // Must survive without crash CHECK(!md.empty()); } TEST_CASE("html2md 10K unclosed p tags", "[html2md][amplification]") { // Each unclosed
generates "\n\n" — tests that md_ doesn't // grow beyond reasonable bounds std::string html; html.reserve(50000); for (int i = 0; i < 10000; ++i) html += "
text"; auto md = html2md::Convert(html); CHECK(!md.empty()); // Should contain the text, output gets big but not catastrophic CHECK(md.find("text") != std::string::npos); } TEST_CASE("html2md output-to-input ratio check", "[html2md][amplification]") { // Verify that for normal, representative HTML, output is smaller // than input (html2md strips tags, so markdown should be leaner) std::string html; html.reserve(100 * 1024); html += ""; for (int i = 0; i < 1000; ++i) { html += "
Paragraph " + std::to_string(i) + " with some text.
\n"; } html += ""; auto md = html2md::Convert(html); // Markdown should be smaller than HTML (we stripped all the divs/classes) CHECK(md.size() < html.size()); CHECK(md.size() > 0); } TEST_CASE("html2md pathological repeated angle brackets", "[html2md][amplification]") { // Incomplete tags: lots of "<" without closing ">" — stresses tag parser std::string html(8192, '<'); auto md = html2md::Convert(html); // Must not infinite-loop — just survive (void)md; }