453 lines
17 KiB
C++
453 lines
17 KiB
C++
#include <catch2/catch_test_macros.hpp>
|
|
#include <string>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#include "html/html.h"
|
|
#include "html/html2md.h"
|
|
|
|
// ═══════════════════════════════════════════════════════
|
|
// html::parse / html::select (existing)
|
|
// ═══════════════════════════════════════════════════════
|
|
|
|
TEST_CASE("html::parse returns elements from valid HTML", "[html]") {
  // Minimal document holding one heading and one paragraph.
  auto nodes =
      html::parse("<html><body><h1>Title</h1><p>Body</p></body></html>");

  REQUIRE(nodes.size() >= 2);

  // Walk every returned element and remember whether each expected
  // tag/text pair was seen.
  bool saw_heading = false;
  bool saw_paragraph = false;
  for (const auto &node : nodes) {
    saw_heading = saw_heading || (node.tag == "h1" && node.text == "Title");
    saw_paragraph = saw_paragraph || (node.tag == "p" && node.text == "Body");
  }
  CHECK(saw_heading);
  CHECK(saw_paragraph);
}
|
|
|
|
TEST_CASE("html::parse returns empty for empty input", "[html]") {
  // An empty document must yield no elements.
  auto nodes = html::parse("");
  REQUIRE(nodes.empty());
}
|
|
|
|
TEST_CASE("html::parse handles nested elements", "[html]") {
  auto nodes = html::parse("<div><span>Nested</span></div>");

  // The inner <span> must be reported together with its text.
  bool saw_span = false;
  for (const auto &node : nodes) {
    saw_span = saw_span || (node.tag == "span" && node.text == "Nested");
  }
  CHECK(saw_span);
}
|
|
|
|
TEST_CASE("html::select finds elements by CSS selector", "[html][select]") {
  auto items = html::select("<ul><li>A</li><li>B</li><li>C</li></ul>", "li");

  REQUIRE(items.size() == 3);

  // Each list item's text is expected, in document order.
  const char *const expected[] = {"A", "B", "C"};
  for (int idx = 0; idx < 3; ++idx) {
    CHECK(items[idx] == expected[idx]);
  }
}
|
|
|
|
TEST_CASE("html::select returns empty for no matches", "[html][select]") {
  // The document has no <h1>, so the selector matches nothing.
  auto hits = html::select("<p>Hello</p>", "h1");
  REQUIRE(hits.empty());
}
|
|
|
|
TEST_CASE("html::select works with class selector", "[html][select]") {
  // Two sibling spans; only the first carries class "a".
  auto hits = html::select(
      R"(<div><span class="a">X</span><span class="b">Y</span></div>)", ".a");

  REQUIRE(hits.size() == 1);
  CHECK(hits[0] == "X");
}
|
|
|
|
// ═══════════════════════════════════════════════════════
|
|
// html2md — conversion & large-chunk robustness
|
|
// ═══════════════════════════════════════════════════════
|
|
|
|
TEST_CASE("html2md basic conversion", "[html2md]") {
  // Heading and paragraph text must both survive the conversion.
  const std::string markdown = html2md::Convert("<h1>Hello</h1><p>World</p>");
  CHECK(markdown.find("Hello") != std::string::npos);
  CHECK(markdown.find("World") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md empty input", "[html2md]") {
  // Empty HTML is expected to produce empty markdown.
  const std::string markdown = html2md::Convert("");
  CHECK(markdown.empty());
}
|
|
|
|
TEST_CASE("html2md whitespace-only input", "[html2md]") {
  const std::string markdown = html2md::Convert(" \n\t ");
  // The result may be empty or a little whitespace; the point is that the
  // call returns and the output stays tiny.
  CHECK(markdown.size() < 20);
}
|
|
|
|
// ---------- large payload stress tests ----------
|
|
|
|
/// Builds `count` numbered `<p>` elements, one per line.
///
/// Paragraph `i` (0-based) is
/// `<p>Paragraph number i with some filler text.</p>` followed by '\n'.
///
/// @param count Number of paragraphs to generate; 0 yields an empty string.
/// @return The concatenated HTML fragment.
static std::string make_paragraphs(size_t count) {
  const std::string opening = "<p>Paragraph number ";
  const std::string closing = " with some filler text.</p>\n";

  // Every index is below `count`, so it never has more decimal digits than
  // `count` itself. Reserving for that width gives a true upper bound and
  // the buffer is allocated once.
  const size_t index_width = std::to_string(count).size();

  std::string html;
  html.reserve(count * (opening.size() + index_width + closing.size()));
  for (size_t i = 0; i < count; ++i) {
    html += opening;
    html += std::to_string(i);
    html += closing;
  }
  return html;
}
|
|
|
|
/// Produces an HTML document whose body is a repeated filler paragraph.
///
/// Filler rows are appended after the opening `<html><body>` until the
/// buffer holds at least `target_bytes` bytes; the closing tags are added
/// afterwards, so the result is always at least that long.
///
/// @param target_bytes Minimum size of the document before the closing tags.
/// @return The generated document.
static std::string make_large_html(size_t target_bytes) {
  const std::string filler = "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor.</p>\n";

  std::string doc;
  // Slack covers the final row that overshoots the target plus the closer.
  doc.reserve(target_bytes + 256);
  doc.append("<html><body>");
  for (; doc.size() < target_bytes;) {
    doc.append(filler);
  }
  doc.append("</body></html>");
  return doc;
}
|
|
|
|
TEST_CASE("html2md handles 64KB HTML", "[html2md][large]") {
  const auto page = make_large_html(64 * 1024);
  // Guard the fixture itself before converting it.
  REQUIRE(page.size() >= 64 * 1024);

  const std::string markdown = html2md::Convert(page);
  CHECK(!markdown.empty());
  CHECK(markdown.find("Lorem ipsum") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md handles 512KB HTML", "[html2md][large]") {
  // Half-megabyte fixture; only non-empty output is required.
  const auto page = make_large_html(512 * 1024);
  const std::string markdown = html2md::Convert(page);
  CHECK(!markdown.empty());
}
|
|
|
|
TEST_CASE("html2md handles 1MB HTML", "[html2md][large]") {
  // One-megabyte fixture; only non-empty output is required.
  const auto page = make_large_html(1024 * 1024);
  const std::string markdown = html2md::Convert(page);
  CHECK(!markdown.empty());
}
|
|
|
|
TEST_CASE("html2md 10K paragraphs", "[html2md][large]") {
  const auto page = make_paragraphs(10000);
  const std::string markdown = html2md::Convert(page);

  CHECK(!markdown.empty());
  // The last generated paragraph (index 9999) must be present in the output.
  CHECK(markdown.find("Paragraph number 9999") != std::string::npos);
}
|
|
|
|
// ---------- deeply nested HTML ----------
|
|
|
|
TEST_CASE("html2md deeply nested divs (500 levels)", "[html2md][large]") {
  const int depth = 500;

  // Open `depth` divs, place the payload, then close them all again.
  std::string page;
  for (int level = 0; level < depth; ++level) {
    page += "<div>";
  }
  page += "deep content";
  for (int level = 0; level < depth; ++level) {
    page += "</div>";
  }

  const std::string markdown = html2md::Convert(page);
  CHECK(markdown.find("deep content") != std::string::npos);
}
|
|
|
|
// ---------- wide table ----------
|
|
|
|
TEST_CASE("html2md wide table (200 columns)", "[html2md][large]") {
  // One row with cells C0 .. C199.
  std::string page = "<table><tr>";
  for (int col = 0; col < 200; ++col) {
    page += "<td>C" + std::to_string(col) + "</td>";
  }
  page += "</tr></table>";

  const std::string markdown = html2md::Convert(page);

  CHECK(!markdown.empty());
  // First and last cell must both be present.
  CHECK(markdown.find("C0") != std::string::npos);
  CHECK(markdown.find("C199") != std::string::npos);
}
|
|
|
|
// ---------- concurrent conversion ----------
|
|
|
|
TEST_CASE("html2md concurrent conversions are thread-safe", "[html2md][threads]") {
  const int num_threads = 8;
  const std::string html = make_large_html(32 * 1024); // 32KB each

  // Single-threaded reference output. Every concurrent conversion of the
  // same input must reproduce it exactly; a substring check alone would not
  // notice output corrupted by a race.
  const std::string expected = html2md::Convert(html);

  // One pre-sized slot per worker: each thread writes only its own element,
  // so the results vector itself needs no locking.
  std::vector<std::string> results(num_threads);
  std::vector<std::thread> threads;
  threads.reserve(num_threads);

  for (int i = 0; i < num_threads; ++i) {
    threads.emplace_back([&results, &html, i]() {
      results[i] = html2md::Convert(html);
    });
  }

  for (auto &t : threads) t.join();

  // All assertions run on the test thread, after the workers have joined.
  for (int i = 0; i < num_threads; ++i) {
    CHECK(!results[i].empty());
    CHECK(results[i].find("Lorem ipsum") != std::string::npos);
    // Compare through a bool so a failure does not dump two 32KB strings.
    const bool matches_reference = (results[i] == expected);
    CHECK(matches_reference);
  }
}
|
|
|
|
// ═══════════════════════════════════════════════════════
|
|
// html2md — malformed / faulty HTML robustness
|
|
// ═══════════════════════════════════════════════════════
|
|
|
|
TEST_CASE("html2md unclosed tags", "[html2md][faulty]") {
  // <p>, <b> and <i> are opened but never closed.
  const std::string markdown = html2md::Convert("<p>Hello <b>bold <i>italic");
  CHECK(markdown.find("Hello") != std::string::npos);
  CHECK(markdown.find("bold") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md mismatched/overlapping tags", "[html2md][faulty]") {
  // </b> arrives while <i> is still open.
  const std::string markdown =
      html2md::Convert("<b>bold <i>both</b> italic</i>");
  CHECK(markdown.find("bold") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md broken attributes", "[html2md][faulty]") {
  // The href value is missing its closing quote and the class value is
  // unterminated as well.
  const std::string markdown =
      html2md::Convert(R"(<a href="http://example.com class="bad>Link</a>)");
  // Output is unspecified; returning at all is the pass condition.
  static_cast<void>(markdown);
}
|
|
|
|
TEST_CASE("html2md bare text (no tags)", "[html2md][faulty]") {
  // Input without any markup must still come through.
  const std::string markdown =
      html2md::Convert("Just plain text, no HTML at all.");
  CHECK(markdown.find("Just plain text") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md random binary noise", "[html2md][faulty]") {
  // Covers the whole 0-255 byte range. This used to crash MSVC debug builds
  // because a signed char reached isspace() without an unsigned cast; the
  // fix lives in html2md.cpp.
  std::string junk(4096, '\0');
  for (size_t pos = 0; pos < junk.size(); ++pos) {
    junk[pos] = static_cast<char>((pos * 131 + 17) % 256);
  }

  const std::string markdown = html2md::Convert(junk);
  // Content is not inspected; surviving the call is the whole test.
  static_cast<void>(markdown);
}
|
|
|
|
TEST_CASE("html2md truncated document", "[html2md][faulty]") {
  // The document stops in the middle of the second table cell.
  const std::string page = "<html><body><table><tr><td>Cell1</td><td>Cell2";
  const std::string markdown = html2md::Convert(page);
  CHECK(markdown.find("Cell1") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md script and style tags", "[html2md][faulty]") {
  const std::string page = R"(
    <p>Before</p>
    <script>alert('xss');</script>
    <style>.foo { color: red; }</style>
    <p>After</p>
  )";
  const std::string markdown = html2md::Convert(page);

  // Visible paragraphs on both sides of the script/style pair are kept.
  CHECK(markdown.find("Before") != std::string::npos);
  CHECK(markdown.find("After") != std::string::npos);
  // The script body must not leak into the markdown.
  CHECK(markdown.find("alert") == std::string::npos);
}
|
|
|
|
TEST_CASE("html2md null bytes in input", "[html2md][faulty]") {
  // Embed a NUL between "Hello" and "World".
  std::string page = "<p>Hello";
  page.push_back('\0');
  page.append("World</p>");

  // The converter may stop at the NUL or process past it; it only has to
  // return.
  const std::string markdown = html2md::Convert(page);
  static_cast<void>(markdown);
}
|
|
|
|
// ═══════════════════════════════════════════════════════
|
|
// html2md — web scraper real-world edge cases
|
|
// ═══════════════════════════════════════════════════════
|
|
|
|
TEST_CASE("html2md UTF-8 multibyte (CJK, Arabic, emoji)", "[html2md][scraper]") {
  // Japanese, Arabic, accented Latin, emoji and mixed CJK/ASCII text.
  const std::string page =
      "<h1>日本語テスト</h1>"
      "<p>مرحبا بالعالم</p>"
      "<p>Ñoño señor über straße</p>"
      "<p>Emoji: 🚀🔥💀👻 and 中文混合English</p>";

  const std::string markdown = html2md::Convert(page);
  CHECK(markdown.find("Emoji") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md BOM prefix", "[html2md][scraper]") {
  // The document starts with a UTF-8 byte-order mark (EF BB BF), as often
  // seen on pages that originate from Windows tooling.
  const std::string page =
      "\xEF\xBB\xBF<html><body><p>Content after BOM</p></body></html>";
  const std::string markdown = html2md::Convert(page);
  CHECK(markdown.find("Content after BOM") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md entity soup", "[html2md][scraper]") {
  // The input must contain entity *references*, not their decoded
  // characters, otherwise entity handling is never exercised:
  //  - valid named references (&euro; &amp; &lt; &gt; &nbsp; &mdash; &ndash;)
  //  - an unknown name (&notreal;)
  //  - a large numeric reference (&#999999; == U+F423F)
  //  - a malformed hex reference (&#xZZZZ;)
  std::string html =
      "<p>Price: &euro;10 &amp; &lt;20&gt; items</p>"
      "<p>&nbsp;indented &mdash; dashes &ndash; more</p>"
      "<p>Bad entity: &notreal; and &#999999; and &#xZZZZ;</p>";
  std::string md = html2md::Convert(html);
  // How each reference is rendered is left open; the plain text around
  // them must survive.
  CHECK(md.find("Price") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md CDATA and comments", "[html2md][scraper]") {
  // Two paragraphs with a comment wrapping a script, a CDATA section and a
  // multi-line comment between them.
  const std::string page =
      "<p>Before</p>"
      "<!-- <script>alert('xss')</script> -->"
      "<![CDATA[This is raw <data> & stuff]]>"
      "<!-- multi\nline\ncomment -->"
      "<p>After</p>";

  const std::string markdown = html2md::Convert(page);
  CHECK(markdown.find("Before") != std::string::npos);
  CHECK(markdown.find("After") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md deeply nested inline tags", "[html2md][scraper]") {
  // WYSIWYG editors on real pages can emit absurdly deep inline nesting.
  std::string page = "<p>";
  for (int layer = 0; layer < 100; ++layer) {
    page += "<span><b><i><em><strong>";
  }
  page += "deep text";
  for (int layer = 0; layer < 100; ++layer) {
    page += "</strong></em></i></b></span>";
  }
  page += "</p>";

  const std::string markdown = html2md::Convert(page);
  // A hundred bold/italic layers yield a flood of ** and * markers, so only
  // "did not crash and produced something" is asserted.
  CHECK(!markdown.empty());
}
|
|
|
|
TEST_CASE("html2md huge single line (no newlines)", "[html2md][scraper]") {
  // Minified-style HTML: one giant line of roughly 200KB.
  std::string page;
  page.reserve(200 * 1024);
  page += "<html><body>";
  for (int n = 0; n < 5000; ++n) {
    const std::string id = std::to_string(n);
    page += "<div><span class=\"c" + id + "\">item" + id + "</span></div>";
  }
  page += "</body></html>";

  const std::string markdown = html2md::Convert(page);
  // First and last generated items must both be present.
  CHECK(markdown.find("item0") != std::string::npos);
  CHECK(markdown.find("item4999") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md data URI in img src", "[html2md][scraper]") {
  // An inline base64 PNG sits between two ordinary paragraphs.
  const std::string page =
      "<p>Before image</p>"
      "<img src=\"data:image/png;base64,iVBORw0KGgoAAAANSU"
      "hEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwAD"
      "hgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"pixel\">"
      "<p>After image</p>";

  const std::string markdown = html2md::Convert(page);
  CHECK(markdown.find("Before image") != std::string::npos);
  CHECK(markdown.find("After image") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md mixed Latin-1 and UTF-8 bytes", "[html2md][scraper]") {
  // Bytes in 0x80-0xFF that are Latin-1, not valid UTF-8. This shows up when
  // a scraped page declares the wrong charset.
  // The literal spells "café naïve résumé" in Latin-1.
  const std::string page = "<p>caf\xe9 na\xefve r\xe9sum\xe9</p>";
  const std::string markdown = html2md::Convert(page);
  CHECK(markdown.find("caf") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md HTML with HTTP headers prepended", "[html2md][scraper]") {
  // A raw HTTP response (status line and headers) occasionally leaks into
  // scraper output ahead of the actual document.
  const std::string page =
      "HTTP/1.1 200 OK\r\n"
      "Content-Type: text/html; charset=utf-8\r\n"
      "Content-Length: 42\r\n"
      "\r\n"
      "<html><body><p>Real content</p></body></html>";

  const std::string markdown = html2md::Convert(page);
  CHECK(markdown.find("Real content") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md Google Maps / Places markup soup", "[html2md][scraper]") {
  // Cut-down Google Places style markup: data attributes, inline styles,
  // aria labels, nested spans and an embedded JSON-LD script.
  const std::string page = R"(
    <div class="section-result" data-result-index="0" jsaction="pane.resultSection.click">
      <div class="section-result-title">
        <span><span>Müller's Büro & Café</span></span>
      </div>
      <div class="section-result-details">
        <span class="section-result-location">Königstraße 42, München</span>
        <span class="section-result-rating">
          <span aria-label="4.5 stars">★★★★☆</span>
          <span>(1,234)</span>
        </span>
      </div>
      <div style="display:none" aria-hidden="true">
        <script type="application/ld+json">{"@type":"LocalBusiness","name":"test"}</script>
      </div>
    </div>
  )";

  const std::string markdown = html2md::Convert(page);
  // Non-ASCII text from the title and the address must survive.
  CHECK(markdown.find("Café") != std::string::npos);
  CHECK(markdown.find("München") != std::string::npos);
}
|
|
|
|
// ═══════════════════════════════════════════════════════
|
|
// html2md — output amplification & pathological input
|
|
// ═══════════════════════════════════════════════════════
|
|
|
|
TEST_CASE("html2md nested blockquotes (output amplification)", "[html2md][amplification]") {
  // Every <blockquote> level contributes a ">" prefix to each markdown
  // line. At 50 levels a line carries 50 prefixes; the test guards against
  // output that grows exponentially with depth.
  std::string page;
  for (int level = 0; level < 50; ++level) {
    page += "<blockquote>";
  }
  page += "<p>deep quote</p>";
  for (int level = 0; level < 50; ++level) {
    page += "</blockquote>";
  }

  const auto markdown = html2md::Convert(page);
  // Expected size is modest: 50 levels of "> " is about 100 characters plus
  // the text, i.e. well under 1 KB.
  CHECK(markdown.size() < 4096);
  CHECK(!markdown.empty());
}
|
|
|
|
TEST_CASE("html2md very long attribute value", "[html2md][amplification]") {
  // A 1 MB href, to check that ExtractAttributeFromTagLeftOf copes with it.
  const std::string huge_url(1024 * 1024, 'A');

  std::string page = "<a href=\"";
  page += huge_url;
  page += "\">Click</a>";

  const auto markdown = html2md::Convert(page);
  // Surviving the call with some output is all that is required.
  CHECK(!markdown.empty());
}
|
|
|
|
TEST_CASE("html2md 10K unclosed p tags", "[html2md][amplification]") {
  // Each unclosed <p> generates "\n\n" — tests that md_ doesn't
  // grow beyond reasonable bounds
  const int tag_count = 10000;
  const std::string fragment = "<p>text";

  std::string html;
  // Reserve exactly what the loop appends (10000 * 7 = 70000 bytes), so the
  // buffer is allocated once.
  html.reserve(fragment.size() * tag_count);
  for (int i = 0; i < tag_count; ++i) html += fragment;

  auto md = html2md::Convert(html);
  CHECK(!md.empty());
  // Should contain the text, output gets big but not catastrophic
  CHECK(md.find("text") != std::string::npos);
}
|
|
|
|
TEST_CASE("html2md output-to-input ratio check", "[html2md][amplification]") {
  // For ordinary, representative HTML the markdown should come out smaller
  // than the input, because the tags are stripped.
  std::string page;
  page.reserve(100 * 1024);
  page += "<html><body>";
  for (int n = 0; n < 1000; ++n) {
    page += "<div class=\"wrapper\"><p class=\"content\">Paragraph ";
    page += std::to_string(n);
    page += " with some text.</p></div>\n";
  }
  page += "</body></html>";

  const auto markdown = html2md::Convert(page);
  // All the div/class wrappers are gone, so the output must be shorter,
  // yet not empty.
  CHECK(markdown.size() < page.size());
  CHECK(markdown.size() > 0);
}
|
|
|
|
TEST_CASE("html2md pathological repeated angle brackets", "[html2md][amplification]") {
  // 8192 "<" characters and not a single ">": every tag is incomplete,
  // which stresses the tag parser.
  const std::string page(8192, '<');
  const auto markdown = html2md::Convert(page);
  // Returning at all (no infinite loop) is the pass condition.
  static_cast<void>(markdown);
}
|