Skip to content

Commit 86c7dcb

Browse files
committed
Refactor reading numeric quantities into parser
1 parent aaa0a8f commit 86c7dcb

File tree

3 files changed

+211
-9
lines changed

3 files changed

+211
-9
lines changed

src/language/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ mod quantity;
44
mod types;
55

66
// Re-export all public symbols
7+
pub use quantity::*;
78
pub use types::*;

src/language/quantity.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ pub fn parse_quantity(input: &str) -> Option<Quantity> {
9595
})
9696
}
9797

98-
fn parse_decimal(input: &str) -> Option<Decimal> {
98+
pub fn parse_decimal(input: &str) -> Option<Decimal> {
9999
if let Some(dot_pos) = input.find('.') {
100100
// Has decimal point
101101
let whole_part = &input[..dot_pos];
@@ -125,7 +125,8 @@ fn parse_decimal(input: &str) -> Option<Decimal> {
125125
}
126126
}
127127

128-
fn convert_superscript(input: &str) -> String {
128+
/// Convert Unicode superscript characters to ASCII digits
129+
pub fn convert_superscript(input: &str) -> String {
129130
input
130131
.chars()
131132
.map(|c| match c {

src/parsing/parser.rs

Lines changed: 207 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1639,13 +1639,204 @@ impl<'i> Parser<'i> {
16391639

16401640
let content = self.source;
16411641

1642-
// Parser is whitespace agnostic - consume entire remaining content
1643-
// The outer take_*() methods have already isolated the numeric content
1644-
let numeric = validate_numeric(content).ok_or(ParsingError::InvalidNumeric(self.offset))?;
1642+
if is_numeric_integral(content) {
1643+
self.read_numeric_integral()
1644+
} else if is_numeric_quantity(content) {
1645+
self.read_numeric_quantity()
1646+
} else {
1647+
Err(ParsingError::InvalidNumeric(self.offset))
1648+
}
1649+
}
16451650

1646-
self.advance(content.len());
1651+
/// Parse a simple integral number
1652+
fn read_numeric_integral(&mut self) -> Result<Numeric<'i>, ParsingError<'i>> {
1653+
let content = self.source;
16471654

1648-
Ok(numeric)
1655+
if let Ok(amount) = content
1656+
.trim_ascii()
1657+
.parse::<i64>()
1658+
{
1659+
self.advance(content.len());
1660+
Ok(Numeric::Integral(amount))
1661+
} else {
1662+
Err(ParsingError::InvalidNumeric(self.offset))
1663+
}
1664+
}
1665+
1666+
/// Parse a scientific quantity with units
1667+
fn read_numeric_quantity(&mut self) -> Result<Numeric<'i>, ParsingError<'i>> {
1668+
self.trim_whitespace();
1669+
1670+
// Parse mantissa (required)
1671+
let mantissa = self.read_decimal_part()?;
1672+
self.skip_whitespace();
1673+
1674+
// Parse optional uncertainty
1675+
let uncertainty = if self
1676+
.source
1677+
.starts_with('±')
1678+
|| self
1679+
.source
1680+
.starts_with("+/-")
1681+
{
1682+
if self
1683+
.source
1684+
.starts_with("+/-")
1685+
{
1686+
self.advance(3); // Skip +/- (3 bytes)
1687+
} else {
1688+
self.advance(2); // Skip ± (2 bytes in UTF-8)
1689+
}
1690+
self.skip_whitespace();
1691+
Some(self.read_decimal_part()?)
1692+
} else {
1693+
None
1694+
};
1695+
self.skip_whitespace();
1696+
1697+
// Parse optional magnitude
1698+
let magnitude = if self
1699+
.source
1700+
.starts_with('×')
1701+
|| self
1702+
.source
1703+
.starts_with('x')
1704+
|| self
1705+
.source
1706+
.starts_with('*')
1707+
{
1708+
if self
1709+
.source
1710+
.starts_with('×')
1711+
{
1712+
self.advance(2); // Skip × (2 bytes in UTF-8)
1713+
} else {
1714+
self.advance(1); // Skip x or * (1 byte each)
1715+
}
1716+
self.skip_whitespace();
1717+
if !self
1718+
.source
1719+
.starts_with("10")
1720+
{
1721+
return Err(ParsingError::InvalidNumeric(self.offset));
1722+
}
1723+
self.advance(2); // Skip "10"
1724+
1725+
if self
1726+
.source
1727+
.starts_with('^')
1728+
{
1729+
self.advance(1); // Skip ^
1730+
Some(self.read_exponent_ascii()?)
1731+
} else if let Some(exp) = self.read_exponent_superscript() {
1732+
Some(exp)
1733+
} else {
1734+
return Err(ParsingError::InvalidNumeric(self.offset));
1735+
}
1736+
} else {
1737+
None
1738+
};
1739+
self.skip_whitespace();
1740+
1741+
// Parse unit symbol (required)
1742+
let symbol = self.read_units_symbol()?;
1743+
1744+
// Verify we've consumed all the input - if there are remaining characters,
1745+
// it means there was invalid content after the unit symbol
1746+
if !self
1747+
.source
1748+
.trim_ascii()
1749+
.is_empty()
1750+
{
1751+
return Err(ParsingError::InvalidNumeric(self.offset));
1752+
}
1753+
1754+
let quantity = Quantity {
1755+
mantissa,
1756+
uncertainty,
1757+
magnitude,
1758+
symbol,
1759+
};
1760+
1761+
Ok(Numeric::Scientific(quantity))
1762+
}
1763+
1764+
fn skip_whitespace(&mut self) {
1765+
while self
1766+
.source
1767+
.starts_with(' ')
1768+
|| self
1769+
.source
1770+
.starts_with('\t')
1771+
{
1772+
self.advance(1);
1773+
}
1774+
}
1775+
1776+
fn read_decimal_part(&mut self) -> Result<crate::language::Decimal, ParsingError<'i>> {
1777+
use crate::regex::*;
1778+
let re = regex!(r"^-?[0-9]+(\.[0-9]+)?");
1779+
1780+
if let Some(mat) = re.find(self.source) {
1781+
let decimal_str = mat.as_str();
1782+
if let Some(decimal) = crate::language::parse_decimal(decimal_str) {
1783+
self.advance(decimal_str.len());
1784+
Ok(decimal)
1785+
} else {
1786+
Err(ParsingError::InvalidNumeric(self.offset))
1787+
}
1788+
} else {
1789+
Err(ParsingError::InvalidNumeric(self.offset))
1790+
}
1791+
}
1792+
1793+
fn read_exponent_ascii(&mut self) -> Result<i8, ParsingError<'i>> {
1794+
use crate::regex::*;
1795+
let re = regex!(r"^-?[0-9]+");
1796+
1797+
if let Some(mat) = re.find(self.source) {
1798+
let exp_str = mat.as_str();
1799+
if let Ok(exp) = exp_str.parse::<i8>() {
1800+
self.advance(exp_str.len());
1801+
Ok(exp)
1802+
} else {
1803+
Err(ParsingError::InvalidNumeric(self.offset))
1804+
}
1805+
} else {
1806+
Err(ParsingError::InvalidNumeric(self.offset))
1807+
}
1808+
}
1809+
1810+
fn read_exponent_superscript(&mut self) -> Option<i8> {
1811+
use crate::regex::*;
1812+
let re = regex!(r"^[⁰¹²³⁴⁵⁶⁷⁸⁹⁻]+");
1813+
1814+
if let Some(mat) = re.find(self.source) {
1815+
let super_str = mat.as_str();
1816+
let converted = crate::language::convert_superscript(super_str);
1817+
if let Ok(exp) = converted.parse::<i8>() {
1818+
self.advance(super_str.len());
1819+
Some(exp)
1820+
} else {
1821+
None
1822+
}
1823+
} else {
1824+
None
1825+
}
1826+
}
1827+
1828+
fn read_units_symbol(&mut self) -> Result<&'i str, ParsingError<'i>> {
1829+
use crate::regex::*;
1830+
let re = regex!(r"^[a-zA-Z°/μ]+");
1831+
1832+
if let Some(mat) = re.find(self.source) {
1833+
let symbol = mat.as_str();
1834+
self.advance(symbol.len());
1835+
Ok(symbol)
1836+
} else {
1837+
// Point to the invalid character
1838+
Err(ParsingError::InvalidNumeric(self.offset))
1839+
}
16491840
}
16501841

16511842
/// Parse a target like <procedure_name> or <https://example.com/proc>
@@ -2500,10 +2691,19 @@ fn malformed_response_pattern(content: &str) -> bool {
25002691
}
25012692

25022693
fn is_numeric(content: &str) -> bool {
2694+
is_numeric_integral(content) || is_numeric_quantity(content)
2695+
}
2696+
2697+
fn is_numeric_integral(content: &str) -> bool {
25032698
let integral = regex!(r"^\s*-?[0-9]+(\.[0-9]+)?\s*$");
2504-
let scientific = regex!(r"^\s*-?[0-9]+(\.[0-9]+)?(\s*[a-zA-Z°/μ]|\s*±|\s*\+/-|\s*×|\s*x\s*10|\s*\*\s*10|\*\s*10)");
2699+
integral.is_match(content)
2700+
}
25052701

2506-
integral.is_match(content) || scientific.is_match(content)
2702+
fn is_numeric_quantity(content: &str) -> bool {
2703+
let scientific = regex!(
2704+
r"^\s*-?[0-9]+(\.[0-9]+)?(\s*[a-zA-Z°/μ]|\s*±|\s*\+/-|\s*×|\s*x\s*10|\s*\*\s*10|\*\s*10)"
2705+
);
2706+
scientific.is_match(content)
25072707
}
25082708

25092709
fn is_string_literal(content: &str) -> bool {

0 commit comments

Comments
 (0)