Skip to content

Commit 4bf2260

Browse files
authored
feature: support latin1 characters for identifiers in spectra and chromatograms for mzML (#47)
1 parent 9140846 commit 4bf2260

4 files changed

Lines changed: 22 additions & 6 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ default = ["zlib-ng-compat", "mgf", "mzml"]
6060
checksum = ["dep:md5", "dep:sha1", "dep:base16ct"]
6161

6262
mgf = []
63-
mzml = ["dep:quick-xml", "checksum", "dep:memchr"]
63+
mzml = ["dep:quick-xml", "checksum", "dep:memchr", "dep:encoding_rs"]
6464
imzml = ["mzml", "dep:uuid"]
6565

6666
# mzsignal's main functionality requires a linear algebra backend.
@@ -201,6 +201,7 @@ pin-project-lite = { version = "0.2.16", optional = true }
201201
memchr = { version = "2.7.4", optional = true }
202202
libloading = { version = "0.8.6", optional = true }
203203
zstd = { version = "0.13.3", optional = true }
204+
encoding_rs = { version = "0.8.35", optional = true }
204205

205206

206207
[dev-dependencies]

src/io/mzml/reader.rs

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -738,7 +738,12 @@ impl<C: CentroidLike + BuildFromArrayMap, D: DeconvolutedCentroidLike + BuildFro
738738
match attr_parsed {
739739
Ok(attr) => match attr.key.as_ref() {
740740
b"id" => {
741-
self.entry_id = match attr.unescape_value() {
741+
self.entry_id = match attr.unescape_value()
742+
.map(|v| v.to_string())
743+
.or_else(|_| -> Result<String, quick_xml::Error> {
744+
log::warn!("Detected non-UTF8 character in spectrum id");
745+
Ok(quick_xml::escape::escape(encoding_rs::mem::decode_latin1(&attr.value).as_ref()).into())
746+
}) {
742747
Ok(value) => value.to_string(),
743748
Err(e) => {
744749
return Err(xml_error!(
@@ -898,8 +903,11 @@ impl<C: CentroidLike + BuildFromArrayMap, D: DeconvolutedCentroidLike + BuildFro
898903
b"id" => {
899904
self.entry_id = attr
900905
.unescape_value()
901-
.expect("Error decoding id")
902-
.to_string();
906+
.map(|v| v.to_string())
907+
.or_else(|_| -> Result<String, quick_xml::Error> {
908+
log::warn!("Detected non-UTF8 character in chromatogram id");
909+
Ok(quick_xml::escape::escape(encoding_rs::mem::decode_latin1(&attr.value).as_ref()).into())
910+
}).unwrap();
903911
trace!("Stored chromatogram id = {}", self.entry_id);
904912
}
905913
b"index" => {

src/io/mzml/reading_shared.rs

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -461,8 +461,14 @@ impl IndexedMzMLIndexExtractor {
461461
if attr.key.as_ref() == b"idRef" {
462462
self.last_id = attr
463463
.unescape_value()
464-
.expect("Error decoding idRef")
465-
.to_string();
464+
.map(|v| v.to_string())
465+
.or_else(|_| -> Result<String, quick_xml::Error> {
466+
log::warn!("Detected non-UTF8 character in idRef");
467+
Ok(quick_xml::escape::escape(encoding_rs::mem::decode_latin1(&attr.value).as_ref()).into())
468+
})
469+
.unwrap_or_else(|e| {
470+
panic!("Error decoding idRef on offset {e} from bytes {:?}", attr.value)
471+
});
466472
}
467473
}
468474
Err(err) => {

0 commit comments

Comments
 (0)