From 09ac65771f4d7aee273bfeba8af4ca052ed0c5b1 Mon Sep 17 00:00:00 2001 From: Chris Emerson Date: Sun, 19 Apr 2026 15:31:42 +0100 Subject: [PATCH 1/3] Add `Config::parse_xml()`. This makes it possible to support XHTML in the few cases where it differs from HTML5. --- Cargo.lock | 11 +++++++++++ Cargo.toml | 2 ++ src/lib.rs | 15 +++++++++++++++ src/tests.rs | 41 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 68 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 9c34e12..bae330c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -241,6 +241,7 @@ dependencies = [ "termion", "thiserror", "unicode-width", + "xml5ever", ] [[package]] @@ -891,6 +892,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "xml5ever" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ab627f34ff61b80d756180d556f9c68801d836d271b3b8c094504ceca69d221" +dependencies = [ + "log", + "markup5ever", +] + [[package]] name = "yaml-rust" version = "0.4.5" diff --git a/Cargo.toml b/Cargo.toml index 0a2a01b..b55ee9d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ backtrace = { version = "0.3", optional=true } thiserror = "2.0.0" log = { version = "0.4.20", optional = true } nom = { version = "8.0.0", optional = true } +xml5ever = { version = "0.39.0", optional = true } [features] html_trace = ["dep:log"] @@ -40,6 +41,7 @@ html_trace_bt = ["html_trace", "dep:backtrace"] default = [] css = [ "dep:nom" ] css_ext = ["css"] +xml = ["dep:xml5ever"] [[example]] name = "html2term" diff --git a/src/lib.rs b/src/lib.rs index b01cb6f..8eb8eac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2962,6 +2962,21 @@ pub mod config { .read_from(&mut input)?) } + #[cfg(feature = "xml")] + /// Parse document as XML into a DOM structure. 
+ pub fn parse_xml(&self, mut input: R) -> Result { + use ::xml5ever::{ + driver::{ + parse_document, + }, + tendril::TendrilSink, + }; + let opts = Default::default(); + Ok(parse_document(super::RcDom::default(), opts) + .from_utf8() + .read_from(&mut input)?) + } + /// Convert an HTML DOM into a RenderTree. pub fn dom_to_render_tree(&self, dom: &super::RcDom) -> Result { Ok(RenderTree( diff --git a/src/tests.rs b/src/tests.rs index 880399f..6054ea7 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -3,7 +3,7 @@ use std::str; use crate::config::Config; use crate::render::TaggedLineElement; use crate::render::text_renderer::{PlainDecorator, TaggedString}; -use crate::{Error, config}; +use crate::{config, Error}; use super::render::text_renderer::{RichAnnotation, RichDecorator, TaggedLine, TrivialDecorator}; use super::{TextDecorator, from_read, from_read_with_decorator, parse}; @@ -194,6 +194,20 @@ where assert_eq_str!(rt.to_string(), expected.to_string()); } +#[track_caller] +fn test_xml(input: &[u8], expected: &str, width: usize) { + let conf = config::plain(); + let dom = conf + .parse_xml(input) + .expect("Failed to parse XHTML"); + let rt = conf.dom_to_render_tree(&dom) + .expect("To Render Tree"); + let output = conf.render_to_string(rt, width) + .expect("Render to string"); + + assert_eq_str!(output, expected); +} + #[test] fn test_table() { test_html( @@ -3474,6 +3488,31 @@ fn test_issue_252() { ); } +#[test] +fn test_xml1() { + let doc = br#" + + +Testing, testing + + +

+

Not Heading

+ +"#; + // Parsing XHTML as HTML - expect wrong output. + test_html(doc, r"# Not Heading +", + 20, + ); + // Parsing XHTML as XHTML - expect wrong output. + test_xml(doc, + r"Not Heading +", + 20, + ); +} + #[cfg(feature = "css_ext")] mod css_ext_tests { use super::test_html_conf_rendertree; From ca5e28b3d19ecc1a699ec5da710746cbc07f0ad1 Mon Sep 17 00:00:00 2001 From: Chris Emerson Date: Sun, 19 Apr 2026 17:32:14 +0100 Subject: [PATCH 2/3] Add XHTML handling to the Config API. The default is to auto-detect. --- CHANGELOG.md | 5 ++++ Cargo.lock | 4 +-- Cargo.toml | 2 +- README.md | 23 +++++++++++++++ html2text-cli/Cargo.toml | 1 + src/lib.rs | 62 ++++++++++++++++++++++++++++++++++++---- src/tests.rs | 41 +++++++++++++++++++------- 7 files changed, 118 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5d7b3b..a96d17c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,11 @@ Possible log types: - `[fixed]` for any bug fixes. - `[security]` to invite users to upgrade in case of vulnerabilities. +### 0.17.1 + +- [added] Add support for XHTML (for the cases where it doesn't quite behave + like HTML). 
+ ### 0.17.0 - [changed] Split `html2text` example into `html2text-cli` crate diff --git a/Cargo.lock b/Cargo.lock index bae330c..cb79c48 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -229,7 +229,7 @@ checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" [[package]] name = "html2text" -version = "0.17.0" +version = "0.17.1" dependencies = [ "argparse", "backtrace", @@ -246,7 +246,7 @@ dependencies = [ [[package]] name = "html2text-cli" -version = "0.17.0" +version = "0.17.1" dependencies = [ "argparse", "html2text", diff --git a/Cargo.toml b/Cargo.toml index b55ee9d..6cc2c02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ members = [ ] [workspace.package] -version = "0.17.0" +version = "0.17.1" repository = "https://github.com/jugglerchris/rust-html2text/" license = "MIT" authors = ["Chris Emerson "] diff --git a/README.md b/README.md index 0052c4b..dd1d567 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,7 @@ read keys from stdin. |-------|------------| |css | Limited handling of CSS, adding Coloured nodes to the render tree. | |css\_ext| Some CSS extensions (see below for details) | +|xml | Support XHTML (for cases where a document may be parsed differently than as HTML). | |html\_trace| Add verbose internal logging (not recommended) | |html\_trace\_bt| Add backtraces to the verbose internal logging | @@ -144,3 +145,25 @@ The following CSS extensions are implemented: * `display: x-raw-dom;` - Show the HTML elements instead of rendering them. (Useful for debugging, along with something like `:nth-child(...)` to select a particular node) + +### XML/XHTML support + +In some rare cases, parsing an XHTML document as HTML behaves differently. For example: + +```xml + + + +

+

Not Heading

+ + + +``` + +HTML does not have the self-closing `

` form, so this would parse as an +unclosed `<h1>` element, containing the following `

`, but in XHTML this is parsed as an empty `<h1>` followed by a `

`. + +With the `xml` Cargo feature enabled, by default documents are parsed as XHTML +if they start with the `` declaration and HTML otherwise. This can be +configured with `Config::xml_mode()`. diff --git a/html2text-cli/Cargo.toml b/html2text-cli/Cargo.toml index 7d5e298..94b1ab1 100644 --- a/html2text-cli/Cargo.toml +++ b/html2text-cli/Cargo.toml @@ -30,3 +30,4 @@ css = ["html2text/css"] css_ext = ["html2text/css_ext"] html_trace = ["html2text/html_trace"] html_trace_btr = ["html2text/html_trace_bt"] +xml = ["html2text/xml"] diff --git a/src/lib.rs b/src/lib.rs index 8eb8eac..ad70e6f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1500,6 +1500,9 @@ struct HtmlContext { use_unicode_strikeout: bool, image_mode: config::ImageRenderMode, + #[cfg(feature = "xml")] + xml_mode: config::XmlMode, + #[cfg(feature = "css_ext")] syntax_highlighters: HighlighterMap, } @@ -2884,6 +2887,21 @@ pub mod config { Filename, } + #[cfg(feature = "xml")] + /// Specify HTML vs XHTML handling + #[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] + #[non_exhaustive] + pub enum XmlMode { + /// Treat as HTML unless the document starts with an XML declaration + /// (``). + #[default] + Auto, + /// Always treat as HTML + Html, + /// Always treat as XHTML + Xhtml, + } + /// Configure the HTML processing. 
pub struct Config { decorator: D, @@ -2905,6 +2923,9 @@ pub mod config { use_unicode_strikeout: bool, image_mode: ImageRenderMode, + #[cfg(feature = "xml")] + xml_mode: XmlMode, + #[cfg(feature = "css_ext")] syntax_highlighters: HighlighterMap, } @@ -2928,6 +2949,9 @@ pub mod config { use_unicode_strikeout: self.use_unicode_strikeout, image_mode: self.image_mode, + #[cfg(feature = "xml")] + xml_mode: self.xml_mode, + #[cfg(feature = "css_ext")] syntax_highlighters: self.syntax_highlighters.clone(), } @@ -2937,7 +2961,29 @@ pub mod config { where R: io::Read, { + #[cfg(feature = "xml")] + let dom = { + match context.xml_mode { + XmlMode::Html => self.parse_html(input)?, + XmlMode::Xhtml => self.parse_xml(input)?, + XmlMode::Auto => { + const XML_CHECK: &[u8] = b"(&self, mut input: R) -> Result { - use ::xml5ever::{ - driver::{ - parse_document, - }, - tendril::TendrilSink, - }; + use ::xml5ever::{driver::parse_document, tendril::TendrilSink}; let opts = Default::default(); Ok(parse_document(super::RcDom::default(), opts) .from_utf8() @@ -3176,6 +3217,13 @@ pub mod config { self } + #[cfg(feature = "xml")] + /// Configure the HTML vs XHTML parsing mode. + pub fn xml_mode(mut self, xml_mode: XmlMode) -> Self { + self.xml_mode = xml_mode; + self + } + #[cfg(feature = "css_ext")] /// Register a named syntax highlighter. 
/// @@ -3267,6 +3315,8 @@ pub mod config { include_link_footnotes: false, use_unicode_strikeout: true, image_mode: ImageRenderMode::IgnoreEmpty, + #[cfg(feature = "xml")] + xml_mode: XmlMode::Auto, #[cfg(feature = "css_ext")] syntax_highlighters: Default::default(), } diff --git a/src/tests.rs b/src/tests.rs index 6054ea7..71a57ee 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -3,7 +3,7 @@ use std::str; use crate::config::Config; use crate::render::TaggedLineElement; use crate::render::text_renderer::{PlainDecorator, TaggedString}; -use crate::{config, Error}; +use crate::{Error, config}; use super::render::text_renderer::{RichAnnotation, RichDecorator, TaggedLine, TrivialDecorator}; use super::{TextDecorator, from_read, from_read_with_decorator, parse}; @@ -197,13 +197,9 @@ where #[track_caller] fn test_xml(input: &[u8], expected: &str, width: usize) { let conf = config::plain(); - let dom = conf - .parse_xml(input) - .expect("Failed to parse XHTML"); - let rt = conf.dom_to_render_tree(&dom) - .expect("To Render Tree"); - let output = conf.render_to_string(rt, width) - .expect("Render to string"); + let dom = conf.parse_xml(input).expect("Failed to parse XHTML"); + let rt = conf.dom_to_render_tree(&dom).expect("To Render Tree"); + let output = conf.render_to_string(rt, width).expect("Render to string"); assert_eq_str!(output, expected); } @@ -3489,7 +3485,10 @@ fn test_issue_252() { } #[test] +#[cfg(feature = "xml")] fn test_xml1() { + use crate::config::XmlMode; + let doc = br#" @@ -3500,16 +3499,36 @@ fn test_xml1() {

Not Heading

"#; + // Parsing XHTML as HTML - expect wrong output. - test_html(doc, r"# Not Heading + test_html_conf( + doc, + r"# Not Heading ", 20, + |conf| conf.xml_mode(XmlMode::Html), ); - // Parsing XHTML as XHTML - expect wrong output. - test_xml(doc, + // Parsing with default settings - detects XML correctly + test_html( + doc, + r"Not Heading +", + 20, + ); + // Parsing XHTML as XHTML - expect correct output. + test_xml( + doc, + r"Not Heading +", + 20, + ); + // Parsing XHTML as XHTML - using config and explicit Xml mode. + test_html_conf( + doc, r"Not Heading ", 20, + |conf| conf.xml_mode(XmlMode::Xhtml), ); } From 196e25127a93d3ce367b96cca7da1f7deef5402d Mon Sep 17 00:00:00 2001 From: Chris Emerson Date: Sun, 19 Apr 2026 17:38:46 +0100 Subject: [PATCH 3/3] Fix a missing `#[cfg]` and add the `xml` feature to the CI runs. --- .circleci/config.yml | 4 ++-- src/tests.rs | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c45617a..6cb7588 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -30,8 +30,8 @@ jobs: steps: - checkout - run: cargo --version - - run: cargo build --features=css,css_ext --workspace - - run: cargo test --features=css,css_ext + - run: cargo build --features=css,css_ext,xml --workspace + - run: cargo test --features=css,css_ext,xml build-1-85: docker: - image: cimg/rust:1.85 diff --git a/src/tests.rs b/src/tests.rs index 71a57ee..9b31eed 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -194,6 +194,7 @@ where assert_eq_str!(rt.to_string(), expected.to_string()); } +#[cfg(feature = "xml")] #[track_caller] fn test_xml(input: &[u8], expected: &str, width: usize) { let conf = config::plain();