From 93010bf75ba908ac365d4c83eb89bb59fdc68852 Mon Sep 17 00:00:00 2001 From: xj Date: Sun, 1 Mar 2026 15:27:08 -0800 Subject: [PATCH] fix(web-fetch): wire html2md feature dependency --- Cargo.lock | 199 ++++++++++++++++++++++++++++++++++++++++++++++++----- Cargo.toml | 5 +- 2 files changed, 184 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba77ba558..a4b9d7ee3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -427,6 +427,19 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "auto_encoder" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6364e11e0270035ec392151a54f1476e6b3612ef9f4fe09d35e72a8cebcb65" +dependencies = [ + "chardetng", + "encoding_rs", + "percent-encoding", + "phf 0.11.3", + "phf_codegen 0.11.3", +] + [[package]] name = "autocfg" version = "1.5.0" @@ -768,7 +781,7 @@ dependencies = [ "cap-primitives", "cap-std", "io-lifetimes", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -797,7 +810,7 @@ dependencies = [ "maybe-owned", "rustix 1.1.4", "rustix-linux-procfs", - "windows-sys 0.52.0", + "windows-sys 0.59.0", "winx", ] @@ -917,6 +930,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + [[package]] name = "chrono" version = "0.4.44" @@ -1468,6 +1492,29 @@ dependencies = [ "typenum", ] +[[package]] +name = "cssparser" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.13.1", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.117", +] + [[package]] name = "csv" version = "1.4.0" @@ -1845,6 +1892,21 @@ dependencies = [ "litrs", ] +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + [[package]] name = "dunce" version = "1.0.5" @@ -2005,7 +2067,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -2170,6 +2232,21 @@ dependencies = [ "webdriver", ] +[[package]] +name = "fast_html2md" +version = "0.0.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af3a0122fee1bcf6bb9f3d73782e911cce69d95b76a5e29e930af92cd4a8e4e3" +dependencies = [ + "auto_encoder", + "futures-util", + "lazy_static", + "lol_html", + "percent-encoding", + "regex", + "url", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -2184,7 +2261,7 @@ checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" dependencies = [ "cfg-if", "rustix 1.1.4", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2243,6 +2320,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -2260,7 +2343,7 @@ checksum = "94e7099f6313ecacbe1256e8ff9d617b75d1bcb16a6fddef94866d225a01a14a" dependencies = [ "io-lifetimes", "rustix 1.1.4", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2562,7 +2645,7 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash", + "foldhash 0.1.5", "serde", ] @@ -2571,6 +2654,11 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "hashify" @@ -3184,7 +3272,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2285ddfe3054097ef4b2fe909ef8c3bcd1ea52a8f0d274416caebeef39f04a65" dependencies = [ "io-lifetimes", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3471,6 +3559,25 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lol_html" +version = "2.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ff94cb6aef6ee52afd2c69331e9109906d855e82bd241f3110dfdf6185899ab" +dependencies = [ + "bitflags 2.11.0", + "cfg-if", + "cssparser", + "encoding_rs", + "foldhash 0.2.0", + "hashbrown 0.16.1", + "memchr", + "mime", + "precomputed-hash", + "selectors", + "thiserror 2.0.18", +] + [[package]] name = "lopdf" version = "0.38.0" @@ -4592,6 +4699,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ + "phf_macros 0.11.3", "phf_shared 0.11.3", ] @@ -4610,6 +4718,7 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ + "phf_macros 0.13.1", "phf_shared 0.13.1", "serde", ] @@ -4654,6 +4763,32 @@ dependencies = [ "phf_shared 0.13.1", ] +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "phf_shared" version = "0.11.3" @@ -5051,7 +5186,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools 0.10.5", + "itertools 0.14.0", "log", "multimap", "petgraph", @@ -5068,7 +5203,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.117", @@ -5081,7 +5216,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.117", @@ -5246,7 +5381,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.60.2", ] [[package]] @@ -5830,7 +5965,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.4.15", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5843,7 +5978,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.12.1", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -6079,6 +6214,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7" +dependencies = [ + "bitflags 2.11.0", + "cssparser", + "derive_more 2.1.1", + "log", + "new_debug_unreachable", + "phf 0.13.1", + "phf_codegen 0.13.1", + "precomputed-hash", + "rustc-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "self_cell" version = "1.2.2" @@ -6279,6 +6433,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "servo_arc" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sha1" version = "0.10.6" @@ -6425,7 +6588,6 @@ dependencies = [ "cfg-if", "libc", "psm", - "windows-sys 0.52.0", "windows-sys 0.59.0", ] @@ -6574,7 +6736,7 @@ dependencies = [ "fd-lock", "io-lifetimes", "rustix 0.38.44", - "windows-sys 0.52.0", + "windows-sys 0.59.0", "winx", ] @@ -6606,7 +6768,7 @@ dependencies = [ "getrandom 0.4.1", "once_cell", "rustix 1.1.4", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -8528,7 +8690,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -8812,7 +8974,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f3fd376f71958b862e7afb20cfe5a22830e1963462f3a17f49d82a6c1d1f42d" dependencies = [ "bitflags 2.11.0", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -9075,6 +9237,7 @@ dependencies = [ "dialoguer", "directories", "fantoccini", + "fast_html2md", "futures-util", "glob", "hex", diff --git a/Cargo.toml b/Cargo.toml index 20cee6612..c401bbb45 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,8 +58,9 @@ image = { version = "0.25", default-features = false, features = ["jpeg", "png"] # URL encoding for web search urlencoding = "2.1" -# HTML to plain text conversion (web_fetch tool) +# HTML to plain text / markdown conversion (web_fetch tool) nanohtml2text = "0.2" +html2md = { package = "fast_html2md", version = "0.0.58", optional = true } # Zip archive extraction zip = { version = "8.1", default-features = false, features = ["deflate"] } @@ -240,7 +241,7 @@ whatsapp-web = ["dep:wa-rs", "dep:wa-rs-core", "dep:wa-rs-binary", "dep:wa-rs-pr # Optional provider feature flags used by cfg(feature = "...") guards. # Keep disabled by default to preserve current runtime behavior. firecrawl = [] -web-fetch-html2md = [] +web-fetch-html2md = ["dep:html2md"] [profile.release] opt-level = "z" # Optimize for size