Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ jobs:
steps:
- checkout
- run: cargo --version
- run: cargo build --features=css,css_ext --workspace
- run: cargo test --features=css,css_ext
- run: cargo build --features=css,css_ext,xml --workspace
- run: cargo test --features=css,css_ext,xml
build-1-85:
docker:
- image: cimg/rust:1.85
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ Possible log types:
- `[fixed]` for any bug fixes.
- `[security]` to invite users to upgrade in case of vulnerabilities.

### 0.17.1

- [added] Add support for XHTML (for the cases where it doesn't quite behave
like HTML).

### 0.17.0

- [changed] Split `html2text` example into `html2text-cli` crate
Expand Down
15 changes: 13 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ members = [
]

[workspace.package]
version = "0.17.0"
version = "0.17.1"
repository = "https://github.com/jugglerchris/rust-html2text/"
license = "MIT"
authors = ["Chris Emerson <github@mail.nosreme.org>"]
Expand All @@ -33,13 +33,15 @@ backtrace = { version = "0.3", optional=true }
thiserror = "2.0.0"
log = { version = "0.4.20", optional = true }
nom = { version = "8.0.0", optional = true }
xml5ever = { version = "0.39.0", optional = true }

[features]
html_trace = ["dep:log"]
html_trace_bt = ["html_trace", "dep:backtrace"]
default = []
css = [ "dep:nom" ]
css_ext = ["css"]
xml = ["dep:xml5ever"]

[[example]]
name = "html2term"
Expand Down
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ read keys from stdin.
|-------|------------|
|css | Limited handling of CSS, adding Coloured nodes to the render tree. |
|css\_ext| Some CSS extensions (see below for details) |
|xml | Support XHTML (for cases where a document may be parsed differently than as HTML). |
|html\_trace| Add verbose internal logging (not recommended) |
|html\_trace\_bt| Add backtraces to the verbose internal logging |

Expand Down Expand Up @@ -144,3 +145,25 @@ The following CSS extensions are implemented:
* `display: x-raw-dom;`
- Show the HTML elements instead of rendering them. (Useful for debugging, along
with something like `:nth-child(...)` to select a particular node)

### XML/XHTML support

In some rare cases, parsing an XHTML document as HTML behaves differently. For example:

```xml
<?xml version="1.0"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<h1/>
<p>Not Heading</p>
</body>
</html>

```

HTML does not have the self-closing `<h1/>` form, so this would parse as an
unclosed `<h1>` element, containing the following `<p>`, but in XHTML this is parsed as an empty `<h1>` followed by a `<p>`.

With the `xml` Cargo feature enabled, documents are by default parsed as XHTML
if they start with an `<?xml ...?>` declaration, and as HTML otherwise. This
can be configured with `Config::xml_mode()`.
1 change: 1 addition & 0 deletions html2text-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ css = ["html2text/css"]
css_ext = ["html2text/css_ext"]
html_trace = ["html2text/html_trace"]
html_trace_btr = ["html2text/html_trace_bt"]
xml = ["html2text/xml"]
65 changes: 65 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1500,6 +1500,9 @@ struct HtmlContext {
use_unicode_strikeout: bool,
image_mode: config::ImageRenderMode,

#[cfg(feature = "xml")]
xml_mode: config::XmlMode,

#[cfg(feature = "css_ext")]
syntax_highlighters: HighlighterMap,
}
Expand Down Expand Up @@ -2884,6 +2887,21 @@ pub mod config {
Filename,
}

#[cfg(feature = "xml")]
/// Specify HTML vs XHTML handling
///
/// Selects which parser is used when a document is read: the HTML parser
/// or the XML parser. The mode is set with `Config::xml_mode()`.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate that matches
/// on it needs a wildcard arm.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
#[non_exhaustive]
pub enum XmlMode {
/// Treat as HTML unless the document starts with an XML declaration
/// (`<?xml ...?>`).
///
/// Detection looks only at the leading bytes of the input and compares
/// them with `<?xml`. This is the default mode.
#[default]
Auto,
/// Always treat as HTML
///
/// The input is passed to the HTML parser without inspecting it.
Html,
/// Always treat as XHTML
///
/// The input is passed to the XML parser without inspecting it.
Xhtml,
}

/// Configure the HTML processing.
pub struct Config<D: TextDecorator> {
decorator: D,
Expand All @@ -2905,6 +2923,9 @@ pub mod config {
use_unicode_strikeout: bool,
image_mode: ImageRenderMode,

#[cfg(feature = "xml")]
xml_mode: XmlMode,

#[cfg(feature = "css_ext")]
syntax_highlighters: HighlighterMap,
}
Expand All @@ -2928,6 +2949,9 @@ pub mod config {
use_unicode_strikeout: self.use_unicode_strikeout,
image_mode: self.image_mode,

#[cfg(feature = "xml")]
xml_mode: self.xml_mode,

#[cfg(feature = "css_ext")]
syntax_highlighters: self.syntax_highlighters.clone(),
}
Expand All @@ -2937,7 +2961,29 @@ pub mod config {
where
R: io::Read,
{
#[cfg(feature = "xml")]
let dom = {
    // Choose the parser according to the configured XML mode.
    match context.xml_mode {
        XmlMode::Html => self.parse_html(input)?,
        XmlMode::Xhtml => self.parse_xml(input)?,
        XmlMode::Auto => {
            // Sniff the start of the input: a document that begins with
            // `<?xml` is parsed as XHTML, anything else as HTML.
            const XML_CHECK: &[u8] = b"<?xml";
            let mut input = input;
            let mut firstbuf = [0u8; XML_CHECK.len()];

            // A single `read` call may legitimately return fewer bytes
            // than requested even when more data follows (pipes, sockets,
            // chunked readers), so keep reading until the sniff buffer is
            // full or the input is exhausted.  `Interrupted` is a
            // transient condition and is retried rather than reported.
            let mut bytes_read = 0;
            while bytes_read < firstbuf.len() {
                match input.read(&mut firstbuf[bytes_read..]) {
                    Ok(0) => break,
                    Ok(count) => bytes_read += count,
                    Err(err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(err.into()),
                }
            }

            // Re-attach the sniffed bytes in front of the remaining input
            // so the chosen parser sees the complete document.
            let first_slice = &firstbuf[..bytes_read];
            if first_slice == XML_CHECK {
                self.parse_xml(std::io::Read::chain(first_slice, input))?
            } else {
                self.parse_html(std::io::Read::chain(first_slice, input))?
            }
        }
    }
};

#[cfg(not(feature = "xml"))]
let dom = self.parse_html(input)?;

let render_tree = super::dom_to_render_tree_with_context(
dom.document.clone(),
&mut io::sink(),
Expand All @@ -2962,6 +3008,16 @@ pub mod config {
.read_from(&mut input)?)
}

#[cfg(feature = "xml")]
/// Parse a document as XML, producing a DOM structure.
///
/// The bytes read from `input` are decoded as UTF-8 and fed to the
/// `xml5ever` document parser using its default options.  An error from
/// reading `input` is propagated to the caller.
pub fn parse_xml<R: io::Read>(&self, mut input: R) -> Result<super::RcDom> {
    use ::xml5ever::{driver::parse_document, tendril::TendrilSink};

    // Build the parser first, then drive it from the reader.
    let parser = parse_document(super::RcDom::default(), Default::default()).from_utf8();
    let dom = parser.read_from(&mut input)?;
    Ok(dom)
}

/// Convert an HTML DOM into a RenderTree.
pub fn dom_to_render_tree(&self, dom: &super::RcDom) -> Result<RenderTree> {
Ok(RenderTree(
Expand Down Expand Up @@ -3161,6 +3217,13 @@ pub mod config {
self
}

#[cfg(feature = "xml")]
/// Configure the HTML vs XHTML parsing mode.
///
/// Stores `xml_mode` in this configuration and returns the configuration
/// so that further builder calls can be chained. See `XmlMode` for the
/// meaning of each mode.
pub fn xml_mode(mut self, xml_mode: XmlMode) -> Self {
self.xml_mode = xml_mode;
self
}

#[cfg(feature = "css_ext")]
/// Register a named syntax highlighter.
///
Expand Down Expand Up @@ -3252,6 +3315,8 @@ pub mod config {
include_link_footnotes: false,
use_unicode_strikeout: true,
image_mode: ImageRenderMode::IgnoreEmpty,
#[cfg(feature = "xml")]
xml_mode: XmlMode::Auto,
#[cfg(feature = "css_ext")]
syntax_highlighters: Default::default(),
}
Expand Down
59 changes: 59 additions & 0 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,17 @@ where
assert_eq_str!(rt.to_string(), expected.to_string());
}

#[cfg(feature = "xml")]
#[track_caller]
/// Parse `input` explicitly as XML with the plain configuration, render it
/// at `width` columns and check that the text equals `expected`.
fn test_xml(input: &[u8], expected: &str, width: usize) {
    let plain_config = config::plain();

    // Run each pipeline stage separately so a failure names the stage.
    let parsed = plain_config
        .parse_xml(input)
        .expect("Failed to parse XHTML");
    let tree = plain_config
        .dom_to_render_tree(&parsed)
        .expect("To Render Tree");
    let rendered = plain_config
        .render_to_string(tree, width)
        .expect("Render to string");

    assert_eq_str!(rendered, expected);
}

#[test]
fn test_table() {
test_html(
Expand Down Expand Up @@ -3474,6 +3485,54 @@ fn test_issue_252() {
);
}

#[test]
#[cfg(feature = "xml")]
fn test_xml1() {
    use crate::config::XmlMode;

    // An XHTML document whose self-closing `<h1/>` is read differently by
    // the HTML parser and the XML parser.
    let doc = br#"<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Testing, testing</title>
</head>
<body>
<h1/>
<p>Not Heading</p>
</body>
</html>"#;

    const WIDTH: usize = 20;
    // Output when the paragraph ends up inside the heading (HTML parse).
    let rendered_as_html = "# Not Heading\n";
    // Output when the heading is empty and the paragraph stands alone.
    let rendered_as_xhtml = "Not Heading\n";

    // Parsing XHTML as HTML - expect wrong output.
    test_html_conf(doc, rendered_as_html, WIDTH, |conf| {
        conf.xml_mode(XmlMode::Html)
    });
    // Parsing with default settings - detects XML correctly
    test_html(doc, rendered_as_xhtml, WIDTH);
    // Parsing XHTML as XHTML - expect correct output.
    test_xml(doc, rendered_as_xhtml, WIDTH);
    // Parsing XHTML as XHTML - using config and explicit Xml mode.
    test_html_conf(doc, rendered_as_xhtml, WIDTH, |conf| {
        conf.xml_mode(XmlMode::Xhtml)
    });
}

#[cfg(feature = "css_ext")]
mod css_ext_tests {
use super::test_html_conf_rendertree;
Expand Down
Loading