diff --git a/src/core/crypto/crypto_uuid.cc b/src/core/crypto/crypto_uuid.cc index e69128a19f..f572438d2a 100644 --- a/src/core/crypto/crypto_uuid.cc +++ b/src/core/crypto/crypto_uuid.cc @@ -40,8 +40,8 @@ auto uuidv4() -> std::string { throw std::runtime_error("Could not generate random bytes with OpenSSL"); } #else - static std::random_device device; - static std::mt19937 generator{device()}; + thread_local std::random_device device; + thread_local std::mt19937 generator{device()}; std::uniform_int_distribution distribution(0, 15); std::uniform_int_distribution diff --git a/src/core/dns/hostname.cc b/src/core/dns/hostname.cc index e048fba735..4a980f7e24 100644 --- a/src/core/dns/hostname.cc +++ b/src/core/dns/hostname.cc @@ -20,7 +20,9 @@ auto is_hostname(const std::string_view value) -> bool { return false; } - // RFC 1123 §2.1: SHOULD handle host names of up to 255 characters + // RFC 1123 §2.1: SHOULD handle host names of up to 255 characters. This is + // intentionally looser than the stricter 253-octet cap applied to the + // internationalized form if (value.size() > 255) { return false; } diff --git a/src/core/dns/idn_hostname.cc b/src/core/dns/idn_hostname.cc index 16a32faaeb..b1d102d6d3 100644 --- a/src/core/dns/idn_hostname.cc +++ b/src/core/dns/idn_hostname.cc @@ -68,7 +68,7 @@ auto is_idn_hostname(const std::string_view value) -> bool { try { const auto body{utf32_to_punycode(decoded)}; a_label_octets = 4 + body.size(); - } catch (...) { + } catch (const PunycodeError &) { return false; } } else if (*kind == IDNALabelKind::Ascii) { diff --git a/src/core/dns/include/sourcemeta/core/dns.h b/src/core/dns/include/sourcemeta/core/dns.h index 7e84f56740..810f59b27a 100644 --- a/src/core/dns/include/sourcemeta/core/dns.h +++ b/src/core/dns/include/sourcemeta/core/dns.h @@ -34,9 +34,10 @@ namespace sourcemeta::core { /// assert(!sourcemeta::core::is_hostname("example.")); /// ``` /// -/// This function implements RFC 1123 §2.1 (ASCII only). 
It does not -/// perform A-label or Punycode decoding. For internationalized host -/// names see `is_idn_hostname`. +/// This function operates on ASCII input only and caps the total length at +/// 255 octets. Labels matching the case-insensitive "xn--" prefix are +/// additionally validated as RFC 5890 A-labels, so the Punycode body must +/// decode and round-trip. SOURCEMETA_CORE_DNS_EXPORT auto is_hostname(const std::string_view value) -> bool; @@ -45,7 +46,8 @@ auto is_hostname(const std::string_view value) -> bool; /// RFC 5891 Section 4. Each label is validated as an RFC 5890 A-label or /// U-label (with RFC 5892 ContextJ and ContextO contextual rules and the /// RFC 5891 §4.1.2.A NFC requirement), and the RFC 5893 Bidi rule is -/// enforced on every label of a Bidi domain name. For example: +/// enforced on every label of a Bidi domain name. The total length is capped +/// at 253 octets in A-label form. For example: /// /// ```cpp /// #include diff --git a/src/core/email/helpers.h b/src/core/email/helpers.h index 1b900eebf6..dd93f5bf4a 100644 --- a/src/core/email/helpers.h +++ b/src/core/email/helpers.h @@ -3,6 +3,7 @@ #include +#include // std::uint8_t, std::uint16_t #include // std::string_view namespace { @@ -75,6 +76,46 @@ inline constexpr auto is_ldh_str(const std::string_view value) -> bool { return true; } +// RFC 5321 §4.1.3: Snum = 1*3DIGIT ; representing a decimal integer +// value in the range 0 through 255. Leading zeros are permitted, unlike +// the RFC 3986 dec-octet that backs is_ipv4 +inline constexpr auto is_snum(const std::string_view value) -> bool { + if (value.empty() || value.size() > 3) { + return false; + } + std::uint16_t result{0}; + for (const auto character : value) { + if (character < '0' || character > '9') { + return false; + } + result = static_cast( + result * 10 + static_cast(character - '0')); + } + return result <= 255; +} + +// RFC 5321 §4.1.3: IPv4-address-literal = Snum 3("." 
Snum) +inline constexpr auto is_ipv4_address_literal(const std::string_view value) + -> bool { + std::string_view::size_type start{0}; + std::uint8_t octets{0}; + while (true) { + const auto dot{value.find('.', start)}; + const auto octet{dot == std::string_view::npos + ? value.substr(start) + : value.substr(start, dot - start)}; + if (!is_snum(octet)) { + return false; + } + octets = static_cast(octets + 1); + if (dot == std::string_view::npos) { + break; + } + start = dot + 1; + } + return octets == 4; +} + // RFC 5234 §2.3: ABNF literal strings are case-insensitive by default // RFC 5321 §4.1.3: IPv6-address-literal prefix is the literal "IPv6:" inline constexpr auto matches_ipv6_tag(const std::string_view value) -> bool { @@ -126,7 +167,7 @@ inline auto is_address_literal(const std::string_view domain) -> bool { // RFC 5321 §4.1.3: IPv4-address-literal has no ":"; // General-address-literal requires ":" if (inner.find(':') == std::string_view::npos) { - return sourcemeta::core::is_ipv4(inner); + return is_ipv4_address_literal(inner); } return is_general_address_literal(inner); } diff --git a/src/core/gzip/bit_reader.h b/src/core/gzip/bit_reader.h index b70956968d..bbffdae05e 100644 --- a/src/core/gzip/bit_reader.h +++ b/src/core/gzip/bit_reader.h @@ -34,6 +34,10 @@ class BitReader { } auto consume_bits(const unsigned int count) -> void { + // Consuming more bits than are buffered would underflow the unsigned + // counter. Every call site is preceded by a peek or refill that + // guarantees enough bits, so the assert documents the contract + assert(count <= this->bits_available_); this->accumulator_ >>= count; this->bits_available_ -= count; } @@ -45,6 +49,10 @@ class BitReader { } auto read_byte() -> std::uint8_t { + // Reading a byte while 1 to 7 bits are buffered would return a byte from + // ahead of them. 
Every call site is byte-aligned, so the assert documents + // the invariant without paying a release-build cost + assert(this->bits_available_ % 8 == 0); if (this->bits_available_ >= 8) { const auto value{static_cast(this->accumulator_ & 0xff)}; this->accumulator_ >>= 8; @@ -54,6 +62,22 @@ class BitReader { return this->pull_source_byte(); } + auto try_read_byte(std::uint8_t &byte) -> bool { + assert(this->bits_available_ % 8 == 0); + if (this->bits_available_ >= 8) { + byte = static_cast(this->accumulator_ & 0xff); + this->accumulator_ >>= 8; + this->bits_available_ -= 8; + return true; + } + if (this->buffer_position_ >= this->buffer_size_ && + !this->try_refill_buffer()) { + return false; + } + byte = this->buffer_[this->buffer_position_++]; + return true; + } + auto read_bytes(std::uint8_t *destination, const std::size_t count) -> void { for (std::size_t index = 0; index < count; ++index) { destination[index] = this->read_byte(); @@ -101,14 +125,21 @@ class BitReader { } auto refill_buffer() -> void { + if (!this->try_refill_buffer()) { + throw GZIPError{"Unexpected end of source stream"}; + } + } + + auto try_refill_buffer() -> bool { this->source_->read(reinterpret_cast(this->buffer_.data()), SOURCE_BUFFER_SIZE); const auto bytes_read{static_cast(this->source_->gcount())}; if (bytes_read == 0) { - throw GZIPError{"Unexpected end of source stream"}; + return false; } this->buffer_size_ = bytes_read; this->buffer_position_ = 0; + return true; } std::istream *source_; diff --git a/src/core/gzip/deflate.h b/src/core/gzip/deflate.h index db0a5a7b22..592341a450 100644 --- a/src/core/gzip/deflate.h +++ b/src/core/gzip/deflate.h @@ -163,8 +163,8 @@ class DeflateDecoder { for (auto &length : distance_lengths) { length = 5; } - this->distance_tree_.build(distance_lengths.data(), - distance_lengths.size()); + this->distance_tree_.build(distance_lengths.data(), distance_lengths.size(), + true); } auto read_dynamic_header() -> void { @@ -172,6 +172,11 @@ class 
DeflateDecoder { const auto hdist{this->reader_->read_bits(5) + 1}; const auto hclen{this->reader_->read_bits(4) + 4}; + // RFC 1951 section 3.2.7 caps the literal/length alphabet at 286 symbols + if (hlit > 286) { + throw GZIPError{"Too many literal/length codes"}; + } + std::array code_length_lengths{}; for (std::size_t index = 0; index < hclen; ++index) { code_length_lengths[DEFLATE_CODE_LENGTH_ORDER[index]] = diff --git a/src/core/gzip/gzip.cc b/src/core/gzip/gzip.cc index 83af9a56db..9ee3375a3b 100644 --- a/src/core/gzip/gzip.cc +++ b/src/core/gzip/gzip.cc @@ -8,19 +8,27 @@ extern "C" { namespace sourcemeta::core { -auto gzip(const std::uint8_t *input, const std::size_t size) -> std::string { +auto gzip(const std::uint8_t *input, const std::size_t size, const int level) + -> std::string { std::unique_ptr - compressor{libdeflate_alloc_compressor(1), libdeflate_free_compressor}; + compressor{libdeflate_alloc_compressor(level), + libdeflate_free_compressor}; if (!compressor) { throw GZIPError{"Could not allocate compressor"}; } const auto max_size{libdeflate_gzip_compress_bound(compressor.get(), size)}; std::string output; - output.resize(max_size); - - const auto actual_size{libdeflate_gzip_compress( - compressor.get(), input, size, output.data(), output.size())}; + std::size_t actual_size{0}; + // libdeflate overwrites the whole bound, so leaving the buffer uninitialised + // avoids zero-filling multi-megabyte allocations that are immediately + // discarded + output.resize_and_overwrite( + max_size, [&](char *const buffer, const std::size_t capacity) { + actual_size = libdeflate_gzip_compress(compressor.get(), input, size, + buffer, capacity); + return capacity; + }); if (actual_size == 0) { throw GZIPError{"Could not compress input"}; @@ -44,11 +52,17 @@ auto gunzip(const std::uint8_t *input, const std::size_t size, auto capacity{output_hint > 0 ? 
output_hint : size * 4}; for (;;) { - output.resize(capacity); std::size_t actual_size{0}; - const auto result{libdeflate_gzip_decompress(decompressor.get(), input, - size, output.data(), - output.size(), &actual_size)}; + auto result{LIBDEFLATE_BAD_DATA}; + // libdeflate writes only the decompressed bytes, so leaving the buffer + // uninitialised avoids zero-filling multi-megabyte allocations on every + // retry of the doubling loop + output.resize_and_overwrite(capacity, [&](char *const buffer, + const std::size_t buffer_size) { + result = libdeflate_gzip_decompress(decompressor.get(), input, size, + buffer, buffer_size, &actual_size); + return buffer_size; + }); if (result == LIBDEFLATE_SUCCESS) { output.resize(actual_size); diff --git a/src/core/gzip/huffman.h b/src/core/gzip/huffman.h index 03ff95a02f..01c72b330d 100644 --- a/src/core/gzip/huffman.h +++ b/src/core/gzip/huffman.h @@ -28,8 +28,11 @@ class HuffmanDecoder { HuffmanDecoder() = default; - auto build(const std::uint8_t *lengths, const std::size_t length_count) - -> void { + // The fixed distance tree of RFC 1951 section 3.2.6 is intentionally + // incomplete (30 codes of length five over a 32-slot space), so the + // completeness check is suppressed for it and enforced everywhere else + auto build(const std::uint8_t *lengths, const std::size_t length_count, + const bool allow_incomplete = false) -> void { std::ranges::fill(this->count_, std::uint16_t{0}); std::ranges::fill(this->lut_, std::uint16_t{0}); @@ -54,6 +57,15 @@ class HuffmanDecoder { } } + // Reject incomplete codes, matching zlib and puff. 
RFC 1951 sanctions + // incompleteness only for the single-code case (a tree built from one + // used code of length one), where every length is either zero or one + if (left > 0 && !allow_incomplete && + length_count != static_cast(this->count_[0]) + + static_cast(this->count_[1])) { + throw GZIPError{"Incomplete Huffman code"}; + } + std::array offsets{}; offsets[1] = 0; for (unsigned int bits = 1; bits < MAX_HUFFMAN_BITS; ++bits) { diff --git a/src/core/gzip/include/sourcemeta/core/gzip.h b/src/core/gzip/include/sourcemeta/core/gzip.h index 2b7fb59184..cd22de5c38 100644 --- a/src/core/gzip/include/sourcemeta/core/gzip.h +++ b/src/core/gzip/include/sourcemeta/core/gzip.h @@ -26,7 +26,8 @@ namespace sourcemeta::core { /// @ingroup gzip -/// Compress a byte buffer using the GZIP format (RFC 1952). For example: +/// Compress a byte buffer using the GZIP format (RFC 1952). An optional +/// compression level from 0 to 12 trades speed for ratio. For example: /// /// ```cpp /// #include @@ -36,7 +37,8 @@ namespace sourcemeta::core { /// reinterpret_cast(input.data()), input.size())}; /// ``` auto SOURCEMETA_CORE_GZIP_EXPORT gzip(const std::uint8_t *input, - std::size_t size) -> std::string; + std::size_t size, int level = 1) + -> std::string; /// @ingroup gzip /// Decompress a GZIP compressed byte buffer (RFC 1952). 
An optional output diff --git a/src/core/gzip/streambuf.cc b/src/core/gzip/streambuf.cc index 21e6c43479..4ba4801bb8 100644 --- a/src/core/gzip/streambuf.cc +++ b/src/core/gzip/streambuf.cc @@ -10,7 +10,6 @@ #include // std::uint8_t, std::uint16_t, std::uint32_t #include // std::istream #include // std::string_view -#include // std::vector namespace sourcemeta::core { @@ -34,72 +33,88 @@ struct GZIPStreamBuffer::Internal { namespace { -auto try_read_first_byte(BitReader &reader, std::uint8_t &first_byte) -> bool { - try { - first_byte = reader.read_byte(); - return true; - } catch (const GZIPError &) { - return false; +// Accumulates the running CRC-32 over the header bytes that the FHCRC check +// covers, computing it only when the FHCRC flag is present +class HeaderChecksum { +public: + HeaderChecksum(const bool track) : track_{track} {} + + auto feed(const std::uint8_t byte) -> void { + if (this->track_) { + const auto data{static_cast(byte)}; + this->checksum_ = + crc32_update(this->checksum_, std::string_view{&data, 1}); + } + } + + [[nodiscard]] auto low16() const -> std::uint16_t { + return static_cast(this->checksum_ & 0xffffu); } + +private: + bool track_; + std::uint32_t checksum_{0}; +}; + +auto read_header_byte(BitReader &reader, HeaderChecksum &checksum) + -> std::uint8_t { + const auto byte{reader.read_byte()}; + checksum.feed(byte); + return byte; } -auto parse_member_header(BitReader &reader, const std::uint8_t first_byte, - std::vector &header_bytes) -> void { +auto parse_member_header(BitReader &reader, const std::uint8_t first_byte) + -> void { + // RFC 1952 section 2.3.1.2: FHCRC covers every header byte up to but not + // including the CRC16 itself, so feeding each byte as it is read produces + // exactly the right value. 
The bytes are not retained, removing an + // unbounded-memory path through FNAME and FCOMMENT + // Caller already consumed the ID1 byte and verified it is 0x1f - header_bytes.push_back(first_byte); const auto id2{reader.read_byte()}; - header_bytes.push_back(id2); if (id2 != 0x8b) { throw GZIPError{"Invalid gzip magic bytes"}; } const auto compression_method{reader.read_byte()}; - header_bytes.push_back(compression_method); if (compression_method != 8) { throw GZIPError{"Unsupported gzip compression method"}; } const auto flag_byte{reader.read_byte()}; - header_bytes.push_back(flag_byte); if ((flag_byte & 0xe0) != 0) { throw GZIPError{"Reserved gzip FLG bits must be zero"}; } + HeaderChecksum checksum{(flag_byte & 0x02) != 0}; + checksum.feed(first_byte); + checksum.feed(id2); + checksum.feed(compression_method); + checksum.feed(flag_byte); + // MTIME (4 bytes) + XFL (1 byte) + OS (1 byte) are informational for (std::size_t index = 0; index < 6; ++index) { - header_bytes.push_back(reader.read_byte()); + read_header_byte(reader, checksum); } if ((flag_byte & 0x04) != 0) { // FEXTRA - const auto xlen_lo{reader.read_byte()}; - const auto xlen_hi{reader.read_byte()}; - header_bytes.push_back(xlen_lo); - header_bytes.push_back(xlen_hi); + const auto xlen_lo{read_header_byte(reader, checksum)}; + const auto xlen_hi{read_header_byte(reader, checksum)}; const auto xlen{static_cast(xlen_lo) | (static_cast(xlen_hi) << 8)}; for (std::size_t index = 0; index < xlen; ++index) { - header_bytes.push_back(reader.read_byte()); + read_header_byte(reader, checksum); } } if ((flag_byte & 0x08) != 0) { // FNAME (null-terminated) - while (true) { - const auto byte{reader.read_byte()}; - header_bytes.push_back(byte); - if (byte == 0) { - break; - } + while (read_header_byte(reader, checksum) != 0) { } } if ((flag_byte & 0x10) != 0) { // FCOMMENT (null-terminated) - while (true) { - const auto byte{reader.read_byte()}; - header_bytes.push_back(byte); - if (byte == 0) { - break; - } + 
while (read_header_byte(reader, checksum) != 0) { } } @@ -111,17 +126,25 @@ auto parse_member_header(BitReader &reader, const std::uint8_t first_byte, static_cast(stored_lo) | static_cast(static_cast(stored_hi) << 8))}; - const auto checksum{crc32( - std::string_view{reinterpret_cast(header_bytes.data()), - header_bytes.size()})}; - const std::uint16_t computed{ - static_cast(checksum & 0xffffu)}; - if (stored != computed) { + if (stored != checksum.low16()) { throw GZIPError{"FHCRC mismatch"}; } } } +// Used for members past the first, where gzip(1) tolerates trailing +// non-member data, so a header that fails to validate is reported as +// trailing garbage rather than propagated as an error +auto try_parse_member_header(BitReader &reader, const std::uint8_t first_byte) + -> bool { + try { + parse_member_header(reader, first_byte); + return true; + } catch (const GZIPError &) { + return false; + } +} + } // namespace GZIPStreamBuffer::GZIPStreamBuffer(std::istream &compressed_stream) @@ -140,23 +163,29 @@ auto GZIPStreamBuffer::underflow() -> int_type { while (true) { if (!this->internal->member_started) { std::uint8_t first_byte{0}; - if (!try_read_first_byte(this->internal->reader, first_byte)) { + if (!this->internal->reader.try_read_byte(first_byte)) { if (!this->internal->any_member_completed) { throw GZIPError{"Empty source stream"}; } this->internal->stream_ended = true; return traits_type::eof(); } - if (first_byte != 0x1f) { - if (!this->internal->any_member_completed) { + if (this->internal->any_member_completed) { + // gzip(1) silently ignores any trailing data after a complete member + // rather than treating it as the start of a new member. 
Bytes that do + // not form a valid member header end the stream without error, + // independent of the first byte value + if (first_byte != 0x1f || + !try_parse_member_header(this->internal->reader, first_byte)) { + this->internal->stream_ended = true; + return traits_type::eof(); + } + } else { + if (first_byte != 0x1f) { throw GZIPError{"Invalid gzip magic bytes"}; } - // Trailing garbage after a complete member is silently ignored - this->internal->stream_ended = true; - return traits_type::eof(); + parse_member_header(this->internal->reader, first_byte); } - std::vector header_bytes; - parse_member_header(this->internal->reader, first_byte, header_bytes); this->internal->deflate.reset(); this->internal->member_started = true; diff --git a/src/core/html/include/sourcemeta/core/html_writer.h b/src/core/html/include/sourcemeta/core/html_writer.h index 384f1307d8..7774081121 100644 --- a/src/core/html/include/sourcemeta/core/html_writer.h +++ b/src/core/html/include/sourcemeta/core/html_writer.h @@ -37,10 +37,14 @@ class SOURCEMETA_CORE_HTML_EXPORT HTMLWriter { this->buffer_.reserve(bytes); } - /// Close the most recently opened element + /// Close the most recently opened element. Closing when no element is open + /// has no effect. 
SOURCEMETA_FORCEINLINE inline auto close() -> HTMLWriter & { this->flush_open_tag(); assert(!this->tag_stack_.empty()); + if (this->tag_stack_.empty()) [[unlikely]] { + return *this; + } this->buffer_.append("buffer_.append(this->tag_stack_.back()); this->buffer_.append(">"); diff --git a/src/core/http/helpers.h b/src/core/http/helpers.h index abfabd40a2..40d78b2b5c 100644 --- a/src/core/http/helpers.h +++ b/src/core/http/helpers.h @@ -144,8 +144,20 @@ inline auto http_for_each_parameter(const std::string_view parameters, ++position; } std::size_t end_position{position}; - while (end_position < parameters.size() && - parameters[end_position] != ';') { + bool in_quotes{false}; + while (end_position < parameters.size()) { + const char current{parameters[end_position]}; + if (in_quotes) { + if (current == '\\' && end_position + 1 < parameters.size()) { + ++end_position; + } else if (current == '"') { + in_quotes = false; + } + } else if (current == '"') { + in_quotes = true; + } else if (current == ';') { + break; + } ++end_position; } const auto raw{http_subview(parameters, position, end_position - position)}; @@ -167,27 +179,29 @@ inline auto http_for_each_parameter(const std::string_view parameters, } } -// RFC 9110 §5.6.5 q-value. Defaults to 1.0 on malformed input. +// RFC 9110 §12.4.2 q-value. A malformed weight is a fail-safe refusal, so it +// is treated as 0 rather than maximal preference. An absent weight is not +// routed here and keeps its 1.0 default at the call site. inline auto http_parse_qvalue(const std::string_view value) noexcept -> float { if (value.empty()) { - return 1.0f; + return 0.0f; } if (value[0] != '0' && value[0] != '1') { - return 1.0f; + return 0.0f; } const float integer_part{static_cast(value[0] - '0')}; if (value.size() == 1) { return integer_part; } if (value[1] != '.' 
|| value.size() > 5) { - return 1.0f; + return 0.0f; } std::uint16_t numerator{0}; std::uint16_t denominator{1}; for (std::size_t index{2}; index < value.size(); ++index) { const char character{value[index]}; if (character < '0' || character > '9') { - return 1.0f; + return 0.0f; } numerator = static_cast(numerator * 10 + (character - '0')); denominator = static_cast(denominator * 10); @@ -196,7 +210,7 @@ inline auto http_parse_qvalue(const std::string_view value) noexcept -> float { static_cast(denominator)}; const float result{integer_part + fraction}; if (result > 1.0f) { - return 1.0f; + return 0.0f; } return result; } diff --git a/src/core/idna/idna.cc b/src/core/idna/idna.cc index da161023d1..87018b7ed7 100644 --- a/src/core/idna/idna.cc +++ b/src/core/idna/idna.cc @@ -12,6 +12,54 @@ namespace sourcemeta::core { +namespace { + +// RFC 5890 §2.3.2.1: the maximum length of a label in A-label form +constexpr std::size_t MAXIMUM_LABEL_OCTETS{63}; + +// Decode and fully validate a Punycode A-label body (the substring after the +// "xn--" prefix), writing the decoded U-label out on success. Returns false +// when the body is not a canonical A-label. +auto validate_a_label_body(const std::string_view encoded, + std::u32string &decoded) -> bool { + if (encoded.empty()) { + return false; + } + + try { + decoded = punycode_to_utf32(encoded); + } catch (const PunycodeError &) { + return false; + } + + // RFC 5890 §2.3.2.1: a U-label contains at least one non-ASCII codepoint. + // A Punycode body that decodes to pure ASCII is not a real A-label. + bool has_non_ascii{false}; + for (const auto codepoint : decoded) { + if (codepoint > 0x7F) { + has_non_ascii = true; + break; + } + } + if (!has_non_ascii) { + return false; + } + + if (!idna_is_valid_u_label(decoded)) { + return false; + } + + // RFC 5891 §4.2: A-labels must be in canonical Punycode form, so + // re-encoding the decoded U-label must yield the original bytes. 
+ try { + return utf32_to_punycode(decoded) == encoded; + } catch (const PunycodeError &) { + return false; + } +} + +} // namespace + auto idna_classify_label(const std::u32string_view label, std::u32string &decoded) -> std::optional { @@ -34,23 +82,23 @@ auto idna_classify_label(const std::u32string_view label, ((label[1] | 0x20) == U'n') && label[2] == U'-' && label[3] == U'-'}; if (has_a_label_prefix) { + // RFC 5890 §2.3.2.1: a label in A-label form is at most 63 octets + if (label.size() > MAXIMUM_LABEL_OCTETS) { + return std::nullopt; + } + std::string ascii; ascii.reserve(label.size()); for (const auto codepoint : label) { ascii.push_back(static_cast(codepoint)); } // Normalise the prefix to canonical lowercase before validating, so - // the round-trip equality inside `idna_is_valid_a_label` does not - // reject input that only differs in the case of the prefix + // the round-trip equality does not reject input that only differs in + // the case of the prefix ascii[0] = 'x'; ascii[1] = 'n'; - if (!idna_is_valid_a_label(ascii)) { - return std::nullopt; - } - try { - decoded = punycode_to_utf32( - std::string_view{ascii.data() + 4, ascii.size() - 4}); - } catch (...) { + if (!validate_a_label_body( + std::string_view{ascii.data() + 4, ascii.size() - 4}, decoded)) { return std::nullopt; } return IDNALabelKind::ALabel; @@ -128,24 +176,23 @@ auto idna_passes_contexto(const std::u32string_view label, break; } - // RFC 5892 Appendix A.8 ARABIC-INDIC DIGITS (U+0660..U+0669) - if (codepoint >= 0x0660 && codepoint <= 0x0669) { - for (const auto other : label) { - if (other >= 0x06F0 && other <= 0x06F9) { - return false; - } - } - return true; - } - - // RFC 5892 Appendix A.9 EXTENDED ARABIC-INDIC DIGITS (U+06F0..U+06F9) - if (codepoint >= 0x06F0 && codepoint <= 0x06F9) { + // RFC 5892 Appendix A.8 ARABIC-INDIC DIGITS (U+0660..U+0669) and + // Appendix A.9 EXTENDED ARABIC-INDIC DIGITS (U+06F0..U+06F9): a label must + // not mix the two blocks. 
A single scan over the label settles both rules. + const bool is_arabic_indic{codepoint >= 0x0660 && codepoint <= 0x0669}; + const bool is_extended_arabic_indic{codepoint >= 0x06F0 && + codepoint <= 0x06F9}; + if (is_arabic_indic || is_extended_arabic_indic) { + bool has_arabic_indic{false}; + bool has_extended_arabic_indic{false}; for (const auto other : label) { if (other >= 0x0660 && other <= 0x0669) { - return false; + has_arabic_indic = true; + } else if (other >= 0x06F0 && other <= 0x06F9) { + has_extended_arabic_indic = true; } } - return true; + return !(has_arabic_indic && has_extended_arabic_indic); } // No RFC 5892 Appendix A.3-A.9 rule applies to this codepoint, so there @@ -252,6 +299,16 @@ auto idna_is_valid_u_label(const std::u32string_view label) -> bool { } } + // RFC 5890 §2.3.2.1: the corresponding A-label (the "xn--" prefix plus the + // Punycode-encoded body) must not exceed 63 octets + try { + if (4 + utf32_to_punycode(label).size() > MAXIMUM_LABEL_OCTETS) { + return false; + } + } catch (const PunycodeError &) { + return false; + } + return true; } @@ -350,6 +407,11 @@ auto idna_is_valid_a_label(const std::string_view label) -> bool { return false; } + // RFC 5890 §2.3.2.1: a label in A-label form is at most 63 octets + if (label.size() > MAXIMUM_LABEL_OCTETS) { + return false; + } + // RFC 5890 §2.3.2.1: A-labels are pure ASCII for (const auto byte : label) { if (static_cast(byte) > 0x7F) { @@ -361,41 +423,9 @@ auto idna_is_valid_a_label(const std::string_view label) -> bool { // avoids `std::string_view::substr`, which is not noexcept. const std::string_view encoded{label.data() + prefix.size(), label.size() - prefix.size()}; - if (encoded.empty()) { - return false; - } std::u32string decoded; - try { - decoded = punycode_to_utf32(encoded); - } catch (...) { - return false; - } - - // RFC 5890 §2.3.2.1: a U-label contains at least one non-ASCII codepoint. - // A Punycode body that decodes to pure ASCII is not a real A-label. 
- bool has_non_ascii{false}; - for (const auto codepoint : decoded) { - if (codepoint > 0x7F) { - has_non_ascii = true; - break; - } - } - if (!has_non_ascii) { - return false; - } - - if (!idna_is_valid_u_label(decoded)) { - return false; - } - - // RFC 5891 §4.2: A-labels must be in canonical Punycode form, so - // re-encoding the decoded U-label must yield the original bytes. - try { - return utf32_to_punycode(decoded) == encoded; - } catch (...) { - return false; - } + return validate_a_label_body(encoded, decoded); } } // namespace sourcemeta::core diff --git a/src/core/idna/include/sourcemeta/core/idna.h b/src/core/idna/include/sourcemeta/core/idna.h index 166cb5ea61..9c78746825 100644 --- a/src/core/idna/include/sourcemeta/core/idna.h +++ b/src/core/idna/include/sourcemeta/core/idna.h @@ -142,7 +142,10 @@ auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool; /// Return whether the given label is a valid U-label per RFC 5891 §4. See /// https://www.rfc-editor.org/rfc/rfc5891#section-4 for the criteria. /// The Bidi rule is not checked here because Bidi domain detection is a -/// property of the whole domain, not of a single label. For example: +/// property of the whole domain, not of a single label. A pure-ASCII label +/// that satisfies the structural rules is accepted even though it carries no +/// non-ASCII codepoint, so this check is not on its own a guarantee that the +/// label requires IDNA processing. For example: /// /// ```cpp /// #include @@ -158,10 +161,13 @@ auto idna_is_valid_u_label(const std::u32string_view label) -> bool; /// @ingroup idna /// Return whether the given label is a valid A-label per RFC 5891 §4. See /// https://www.rfc-editor.org/rfc/rfc5891#section-4 for the criteria. 
-/// A valid A-label starts with the ACE prefix "xn--", is pure ASCII, has a -/// non-empty Punycode body that decodes to a U-label containing at least -/// one non-ASCII codepoint, and round-trips through Punycode in its -/// canonical form. For example: +/// A valid A-label starts with the lowercase ACE prefix "xn--", is pure +/// ASCII, is at most 63 octets, has a non-empty Punycode body that decodes to +/// a U-label containing at least one non-ASCII codepoint, and round-trips +/// through Punycode in its canonical form. Both the prefix and the Punycode +/// body are matched case-sensitively, so an uppercase prefix or a mixed-case +/// body is rejected. This is intended for registration-side validation rather +/// than case-folding lookup. For example: /// /// ```cpp /// #include diff --git a/src/core/json/include/sourcemeta/core/json_object.h b/src/core/json/include/sourcemeta/core/json_object.h index 51ca77628d..29fd2be4b9 100644 --- a/src/core/json/include/sourcemeta/core/json_object.h +++ b/src/core/json/include/sourcemeta/core/json_object.h @@ -56,37 +56,50 @@ template class JSONObject { auto operator<(const JSONObject &other) const noexcept -> bool { - // The `std::unordered_map` container, by definition, does not provide - // ordering. However, we still want some level of ordering to allow - // arrays of objects to be sorted. - - // First try a size comparison + // Objects have no inherent order, but a deterministic strict weak ordering + // independent of insertion order is needed so that collections of objects + // can be sorted. Smaller objects come first, and objects of equal size are + // ordered as their entries would compare in key order. 
That outcome is + // decided entirely by the smallest key at which the two objects differ, + // which is found by scanning the entries in place to avoid allocating if (this->data.size() != other.data.size()) { return this->data.size() < other.data.size(); } - // Otherwise do value comparison for common properties - for (const auto &entry : *this) { - const auto other_entry{other.find(entry.first)}; - if (other_entry != other.cend() && entry.second < other_entry->second) { - return true; + const Key *decisive_key{nullptr}; + bool decision{false}; + for (const auto &entry : this->data) { + const auto match{other.find(entry.first)}; + const bool differs{match == other.cend() || + !(entry.second == match->second)}; + if (differs && (decisive_key == nullptr || entry.first < *decisive_key)) { + decisive_key = &entry.first; + decision = match == other.cend() || entry.second < match->second; } } - return false; + for (const auto &entry : other.data) { + if (this->find(entry.first) == this->cend() && + (decisive_key == nullptr || entry.first < *decisive_key)) { + decisive_key = &entry.first; + decision = false; + } + } + + return decision; } auto operator<=(const JSONObject &other) const noexcept -> bool { - return this->data <= other.data; + return !(other < *this); } auto operator>(const JSONObject &other) const noexcept -> bool { - return this->data > other.data; + return other < *this; } auto operator>=(const JSONObject &other) const noexcept -> bool { - return this->data >= other.data; + return !(*this < other); } auto operator==(const JSONObject &other) const noexcept diff --git a/src/core/json/include/sourcemeta/core/json_value.h b/src/core/json/include/sourcemeta/core/json_value.h index 37f0576865..9c2b304aab 100644 --- a/src/core/json/include/sourcemeta/core/json_value.h +++ b/src/core/json/include/sourcemeta/core/json_value.h @@ -998,6 +998,11 @@ class SOURCEMETA_CORE_JSON_EXPORT JSON { [[nodiscard]] auto at_or(const String &key, const JSON &otherwise) const -> 
const JSON &; + /// This overload avoids misuses of returning a const reference parameter as a + /// constant reference. + [[nodiscard]] auto at_or(const String &key, JSON &&otherwise) const + -> const JSON & = delete; + /// This method retrieves an object property given a pre-calculated property /// hash, or a user provided value if such property is not defined. /// @@ -1018,13 +1023,8 @@ class SOURCEMETA_CORE_JSON_EXPORT JSON { const typename Object::hash_type hash, const JSON &otherwise) const -> const JSON &; - // Constant reference parameters can accept xvalues which will be destructed - // after the call. When the function returns such a parameter also as constant - // reference, then the returned reference can be used after the object it - // refers to has been destroyed. - // https://clang.llvm.org/extra/clang-tidy/checks/bugprone/return-const-ref-from-parameter.html - // This overload avoids mis-uses of retuning const reference parameter as - // constant reference. + /// This overload avoids misuses of returning a const reference parameter as a + /// constant reference. 
[[nodiscard]] auto at_or(const String &key, const typename Object::hash_type hash, JSON &&otherwise) const -> const JSON & = delete; diff --git a/src/core/json/json.cc b/src/core/json/json.cc index 3fbf22e1b0..d4229a7f3f 100644 --- a/src/core/json/json.cc +++ b/src/core/json/json.cc @@ -12,7 +12,9 @@ #include // std::uint64_t #include // std::filesystem #include // std::basic_istream +#include // std::numeric_limits #include // std::basic_ostream +#include // std::cmp_greater #include // std::vector namespace sourcemeta::core { @@ -23,6 +25,13 @@ static auto internal_parse_json(const char *&cursor, const char *end, const bool track_positions, JSON &output) -> void { const char *buffer_start{cursor}; + // Tape entries address the input with 32-bit offsets and lengths, so a larger + // input cannot be represented without truncation + if (std::cmp_greater(end - cursor, + std::numeric_limits::max())) { + throw JSONParseError(line, column); + } + std::vector tape; tape.reserve(static_cast(end - cursor) / 8); if (callback || track_positions) { diff --git a/src/core/json/json_value.cc b/src/core/json/json_value.cc index efff899888..2ab370b18b 100644 --- a/src/core/json/json_value.cc +++ b/src/core/json/json_value.cc @@ -264,39 +264,51 @@ auto JSON::operator=(const JSON &other) -> JSON & { return *this; } - // Fast path for scalar sources: destroy this iteratively (safe for any - // depth) then assign the scalar directly. Avoids the copy-then-move dance - // that the container path needs for strong exception safety + // Fast path for scalar sources: tear this value down iteratively, which is + // safe for any depth, then assign the scalar directly. 
Each scalar is + // buffered into a local first, because the source may be nested inside this + // value, and tearing this value down would otherwise free the storage still + // being read from switch (other.current_type) { case Type::Null: this->~JSON(); this->current_type = Type::Null; return *this; - case Type::Boolean: + case Type::Boolean: { + const auto value{other.data_boolean}; this->~JSON(); - this->data_boolean = other.data_boolean; + this->data_boolean = value; this->current_type = Type::Boolean; return *this; - case Type::Integer: + } + case Type::Integer: { + const auto value{other.data_integer}; this->~JSON(); - this->data_integer = other.data_integer; + this->data_integer = value; this->current_type = Type::Integer; return *this; - case Type::Real: + } + case Type::Real: { + const auto value{other.data_real}; this->~JSON(); - this->data_real = other.data_real; + this->data_real = value; this->current_type = Type::Real; return *this; - case Type::String: + } + case Type::String: { + String value{other.data_string}; this->~JSON(); - std::construct_at(&this->data_string, other.data_string); + std::construct_at(&this->data_string, std::move(value)); this->current_type = Type::String; return *this; - case Type::Decimal: + } + case Type::Decimal: { + auto *value{new Decimal{*other.data_decimal}}; this->~JSON(); - this->data_decimal = new Decimal{*other.data_decimal}; + this->data_decimal = value; this->current_type = Type::Decimal; return *this; + } case Type::Array: case Type::Object: break; @@ -312,13 +324,17 @@ auto JSON::operator=(const JSON &other) -> JSON & { } auto JSON::operator=(JSON &&other) noexcept -> JSON & { - // Destroy-then-rebuild so the existing value in this is torn down by the - // iterative destructor if (this == &other) { return *this; } + + // Steal the source into a local before this value is torn down, because the + // source may be nested inside this value, and tearing it down first would + // free the storage still being moved 
from. Parentheses select the move + // constructor rather than the list constructor + JSON moved(std::move(other)); this->~JSON(); - std::construct_at(this, std::move(other)); + std::construct_at(this, std::move(moved)); return *this; } @@ -907,6 +923,15 @@ auto JSON::clear_except(std::initializer_list keys) -> void { auto JSON::merge(const JSON::Object &other) -> void { assert(this->is_object()); + // When the source is this object's own container, the insertions below would + // reallocate the very storage being iterated, so it is snapshotted first + if (&other == &this->data_object) { + // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) + const JSON::Object snapshot{other}; + this->merge(snapshot); + return; + } + for (const auto &pair : other) { const auto maybe_key{this->try_at(pair.first, pair.hash)}; if (maybe_key && maybe_key->is_object() && pair.second.is_object()) { diff --git a/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_pointer.h b/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_pointer.h index a07242a023..d28ccd4ce7 100644 --- a/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_pointer.h +++ b/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_pointer.h @@ -868,7 +868,7 @@ template class GenericPointer { for (const auto &element : value.as_array()) { if (element.is_string()) { result.emplace_back(element.to_string()); - } else if (element.is_integer()) { + } else if (element.is_integer() && element.to_integer() >= 0) { result.emplace_back( static_cast(element.to_integer())); } else { diff --git a/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_token.h b/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_token.h index 69d6399400..01f4d020d8 100644 --- a/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_token.h +++ b/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_token.h @@ -79,7 +79,9 @@ template class GenericToken { /// ``` GenericToken(const int value) : as_property{false}, 
property{DEFAULT_PROPERTY}, hash{0}, - index{static_cast(value)} {} + index{static_cast(value)} { + assert(value >= 0); + } #if defined(_MSC_VER) /// This constructor creates an JSON Pointer token from an item index. For diff --git a/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_walker.h b/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_walker.h index 9441f564b5..610099c2ab 100644 --- a/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_walker.h +++ b/src/core/jsonpointer/include/sourcemeta/core/jsonpointer_walker.h @@ -16,7 +16,10 @@ template class GenericPointerWalker { using internal = typename std::vector; public: - GenericPointerWalker(const JSON &document) { this->walk(document, {}); } + GenericPointerWalker(const JSON &document) { + PointerT accumulator; + this->walk(document, accumulator); + } using const_iterator = typename internal::const_iterator; [[nodiscard]] auto begin() const -> const_iterator { @@ -33,19 +36,19 @@ template class GenericPointerWalker { }; private: - auto walk(const JSON &document, const PointerT &pointer) -> void { + auto walk(const JSON &document, PointerT &pointer) -> void { this->pointers.push_back(pointer); if (document.is_array()) { for (std::size_t index = 0; index < document.size(); index++) { - PointerT subpointer{pointer}; - subpointer.emplace_back(index); - this->walk(document.at(index), subpointer); + pointer.emplace_back(index); + this->walk(document.at(index), pointer); + pointer.pop_back(); } } else if (document.is_object()) { for (const auto &pair : document.as_object()) { - PointerT subpointer{pointer}; - subpointer.emplace_back(pair.first); - this->walk(pair.second, subpointer); + pointer.emplace_back(pair.first); + this->walk(pair.second, pointer); + pointer.pop_back(); } } } diff --git a/src/core/jsonpointer/jsonpointer.cc b/src/core/jsonpointer/jsonpointer.cc index 576cb53b86..23f71a8dc7 100644 --- a/src/core/jsonpointer/jsonpointer.cc +++ b/src/core/jsonpointer/jsonpointer.cc @@ -127,15 
+127,19 @@ auto try_traverse(const sourcemeta::core::JSON &document, return nullptr; } else { const auto index{token.to_index()}; - if (index < current->size()) { - if (is_object) { - std::array buffer{}; - const auto [end_pointer, error_code] = std::to_chars( - buffer.data(), buffer.data() + buffer.size(), index); - current = ¤t->at(std::string_view{buffer.data(), end_pointer}); + if (is_object) { + std::array buffer{}; + const auto [end_pointer, error_code] = + std::to_chars(buffer.data(), buffer.data() + buffer.size(), index); + const auto *json_value{ + current->try_at(std::string_view{buffer.data(), end_pointer})}; + if (json_value) { + current = json_value; } else { - current = ¤t->at(index); + return nullptr; } + } else if (index < current->size()) { + current = ¤t->at(index); } else { return nullptr; } @@ -260,11 +264,19 @@ auto set(JSON &document, const Pointer &pointer, JSON &&value) -> void { // the (nonexistent) member after the last array element. // See https://www.rfc-editor.org/rfc/rfc6901#section-4 if (current.is_array() && last.is_hyphen()) { - current.push_back(value); + current.push_back(std::move(value)); } else if (last.is_property()) { current.at(last.to_property()).into(std::move(value)); } else { - current.at(last.to_index()).into(std::move(value)); + if (current.is_object()) { + std::array buffer{}; + const auto [end_pointer, error_code] = std::to_chars( + buffer.data(), buffer.data() + buffer.size(), last.to_index()); + current.at(std::string_view{buffer.data(), end_pointer}) + .into(std::move(value)); + } else { + current.at(last.to_index()).into(std::move(value)); + } } } diff --git a/src/core/jsonpointer/parser.h b/src/core/jsonpointer/parser.h index 3977321fd9..fa1bc44709 100644 --- a/src/core/jsonpointer/parser.h +++ b/src/core/jsonpointer/parser.h @@ -8,6 +8,7 @@ #include #include // std::from_chars +#include // std::size_t #include // std::uint64_t #include // std::basic_istream #include // std::basic_stringstream @@ -29,9 +30,9 @@ 
template typename Allocator> inline auto parse_index(std::basic_stringstream> &stream, - const std::uint64_t column) -> unsigned long { + const std::uint64_t column) -> std::size_t { const auto input = stream.str(); - unsigned long index_value{}; + std::size_t index_value{}; const auto result = std::from_chars(input.data(), input.data() + input.size(), index_value); if (result.ec != std::errc{}) [[unlikely]] { @@ -52,12 +53,17 @@ auto parse_pointer(std::basic_istream &stream) -> std::conditional_t { [[maybe_unused]] Pointer result; JSON::Char character = 0; + JSON::CharTraits::int_type code = 0; [[maybe_unused]] std::basic_stringstream string; std::uint64_t column{0}; parse_token_begin: - character = static_cast(stream.get()); + code = stream.get(); column += 1; + if (JSON::CharTraits::eq_int_type(code, JSON::CharTraits::eof())) { + goto done; + } + character = JSON::CharTraits::to_char_type(code); // A JSON Pointer is a Unicode string // containing a sequence of zero or more reference tokens, each prefixed // by a '/' (%x2F) character. 
@@ -65,14 +71,21 @@ auto parse_pointer(std::basic_istream &stream) switch (character) { case internal::token_pointer_slash: goto parse_token_content; - case static_cast(JSON::CharTraits::eof()): - goto done; default: throw PointerParseError(column); } parse_token_content: - character = static_cast(stream.peek()); + code = stream.peek(); + if (JSON::CharTraits::eq_int_type(code, JSON::CharTraits::eof())) { + column += 1; + stream.ignore(); + if constexpr (!CheckOnly) { + result.emplace_back(""); + } + goto done; + } + character = JSON::CharTraits::to_char_type(code); switch (character) { // Note that leading zeros are not allowed // See https://www.rfc-editor.org/rfc/rfc6901#section-4 @@ -95,13 +108,6 @@ auto parse_pointer(std::basic_istream &stream) string.put(character); } goto parse_token_index_rest_any; - case static_cast(JSON::CharTraits::eof()): - column += 1; - stream.ignore(); - if constexpr (!CheckOnly) { - result.emplace_back(""); - } - goto done; case internal::token_pointer_slash: if constexpr (!CheckOnly) { result.emplace_back(""); @@ -128,7 +134,17 @@ auto parse_pointer(std::basic_istream &stream) if constexpr (!CheckOnly) { string.put(character); } - character = static_cast(stream.peek()); + code = stream.peek(); + if (JSON::CharTraits::eq_int_type(code, JSON::CharTraits::eof())) { + column += 1; + stream.ignore(); + if constexpr (!CheckOnly) { + result.emplace_back(internal::parse_index(string, column)); + internal::reset(string); + } + goto done; + } + character = JSON::CharTraits::to_char_type(code); switch (character) { case internal::token_pointer_slash: column += 1; @@ -138,20 +154,22 @@ auto parse_pointer(std::basic_istream &stream) internal::reset(string); } goto parse_token_content; - case static_cast(JSON::CharTraits::eof()): - column += 1; - stream.ignore(); - if constexpr (!CheckOnly) { - result.emplace_back(internal::parse_index(string, column)); - internal::reset(string); - } - goto done; default: goto parse_token_property_rest_any; } 
parse_token_index_rest_any: - character = static_cast(stream.peek()); + code = stream.peek(); + if (JSON::CharTraits::eq_int_type(code, JSON::CharTraits::eof())) { + column += 1; + stream.ignore(); + if constexpr (!CheckOnly) { + result.emplace_back(internal::parse_index(string, column)); + internal::reset(string); + } + goto done; + } + character = JSON::CharTraits::to_char_type(code); switch (character) { case internal::token_pointer_slash: column += 1; @@ -161,14 +179,6 @@ auto parse_pointer(std::basic_istream &stream) internal::reset(string); } goto parse_token_content; - case static_cast(JSON::CharTraits::eof()): - column += 1; - stream.ignore(); - if constexpr (!CheckOnly) { - result.emplace_back(internal::parse_index(string, column)); - internal::reset(string); - } - goto done; case internal::token_pointer_number_zero: case internal::token_pointer_number_one: case internal::token_pointer_number_two: @@ -195,8 +205,16 @@ auto parse_pointer(std::basic_istream &stream) */ parse_token_property_rest_any: - character = static_cast(stream.get()); + code = stream.get(); column += 1; + if (JSON::CharTraits::eq_int_type(code, JSON::CharTraits::eof())) { + if constexpr (!CheckOnly) { + result.emplace_back(string.str()); + internal::reset(string); + } + goto done; + } + character = JSON::CharTraits::to_char_type(code); switch (character) { case internal::token_pointer_slash: if constexpr (!CheckOnly) { @@ -206,12 +224,6 @@ auto parse_pointer(std::basic_istream &stream) goto parse_token_content; case internal::token_pointer_tilde: goto parse_token_escape_tilde; - case static_cast(JSON::CharTraits::eof()): - if constexpr (!CheckOnly) { - result.emplace_back(string.str()); - internal::reset(string); - } - goto done; default: if constexpr (!CheckOnly) { string.put(character); @@ -220,8 +232,12 @@ auto parse_pointer(std::basic_istream &stream) } parse_token_escape_tilde: - character = static_cast(stream.get()); + code = stream.get(); column += 1; + if 
(JSON::CharTraits::eq_int_type(code, JSON::CharTraits::eof())) { + throw PointerParseError(column); + } + character = JSON::CharTraits::to_char_type(code); // Because the characters '~' (%x7E) and '/' (%x2F) have special // meanings in JSON Pointer, '~' needs to be encoded as '~0' and '/' // needs to be encoded as '~1' when these characters appear in a diff --git a/src/core/jsonpointer/stringify.h b/src/core/jsonpointer/stringify.h index d14cb93a50..1094ea8382 100644 --- a/src/core/jsonpointer/stringify.h +++ b/src/core/jsonpointer/stringify.h @@ -36,12 +36,12 @@ auto stringify_token(const TokenT &token, stream.put(internal::token_pointer_slash); if (token.is_property()) { for (const auto &character : token.to_property()) { + // Because the characters '~' (%x7E) and '/' (%x2F) have special + // meanings in JSON Pointer, '~' needs to be encoded as '~0' and '/' + // needs to be encoded as '~1' when these characters appear in a + // reference token. Every other character is written verbatim. + // See https://www.rfc-editor.org/rfc/rfc6901#section-3 switch (character) { - // Because the characters '~' (%x7E) and '/' (%x2F) have special - // meanings in JSON Pointer, '~' needs to be encoded as '~0' and '/' - // needs to be encoded as '~1' when these characters appear in a - // reference token. - // See https://www.rfc-editor.org/rfc/rfc6901#section-3 case internal::token_pointer_slash: stream.put(internal::token_pointer_tilde); stream.put(internal::token_pointer_one); @@ -50,322 +50,6 @@ auto stringify_token(const TokenT &token, stream.put(internal::token_pointer_tilde); stream.put(internal::token_pointer_zero); break; - - // All instances of quotation mark '"' (%x22), reverse solidus '\' - // (%x5C), and control (%x00-1F) characters MUST be escaped. 
See - // https://www.rfc-editor.org/rfc/rfc6901#section-5 - case internal::token_pointer_quote: - internal::write_character(stream, - internal::token_pointer_quote); - break; - case internal::token_pointer_reverse_solidus: - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - break; - - // See https://www.asciitable.com - // See https://www.rfc-editor.org/rfc/rfc4627#section-2.5 - - // Null - case '\u0000': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('0'); - break; - // Start of heading - case '\u0001': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('1'); - break; - // Start of text - case '\u0002': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('2'); - break; - // End of text - case '\u0003': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('3'); - break; - // End of transmission - case '\u0004': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('4'); - break; - // Enquiry - case '\u0005': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('5'); - break; - // Acknowledge - case '\u0006': - 
internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('6'); - break; - // Bell - case '\u0007': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('7'); - break; - // Backspace - case '\u0008': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_backspace); - break; - // Horizontal tab - case '\u0009': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_tab); - break; - // Line feed - case '\u000A': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_line_feed); - break; - // Vertical tab - case '\u000B': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('B'); - break; - // Form feed - case '\u000C': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_form_feed); - break; - // Carriage return - case '\u000D': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_carriage_return); - break; - // Shift out - case '\u000E': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('E'); - break; - // Shift in - case '\u000F': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - 
stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('0'); - stream.put('F'); - break; - // Data link escape - case '\u0010': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('0'); - break; - // Device control 1 - case '\u0011': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('1'); - break; - // Device control 2 - case '\u0012': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('2'); - break; - // Device control 3 - case '\u0013': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('3'); - break; - // Device control 4 - case '\u0014': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('4'); - break; - // Negative acknowledge - case '\u0015': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('5'); - break; - // Synchronous idle - case '\u0016': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('6'); - break; - // End of transmission block - case 
'\u0017': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('7'); - break; - // Cancel - case '\u0018': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('8'); - break; - // End of medium - case '\u0019': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('9'); - break; - // Substitute - case '\u001A': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('A'); - break; - // Escape - case '\u001B': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('B'); - break; - // File separator - case '\u001C': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('C'); - break; - // Group separator - case '\u001D': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('D'); - break; - // Record separator - case '\u001E': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - 
stream.put('E'); - break; - // Unit separator - case '\u001F': - internal::write_character( - stream, internal::token_pointer_reverse_solidus); - stream.put(internal::token_pointer_escape_unicode); - stream.put('0'); - stream.put('0'); - stream.put('1'); - stream.put('F'); - break; default: internal::write_character(stream, character); } diff --git a/src/core/jsonrpc/include/sourcemeta/core/jsonrpc.h b/src/core/jsonrpc/include/sourcemeta/core/jsonrpc.h index ffabca5597..9962aac2c8 100644 --- a/src/core/jsonrpc/include/sourcemeta/core/jsonrpc.h +++ b/src/core/jsonrpc/include/sourcemeta/core/jsonrpc.h @@ -67,8 +67,10 @@ SOURCEMETA_CORE_JSONRPC_EXPORT auto jsonrpc_is_server_error(const std::int64_t code) -> bool; /// @ingroup jsonrpc -/// Check whether the given JSON value is a JSON-RPC 2.0 batch envelope. For -/// example: +/// Check whether the given JSON value is a JSON-RPC 2.0 batch envelope. This +/// only verifies that the value is an array, so the caller remains responsible +/// for the empty-batch single-error response and for omitting a response when a +/// batch consists entirely of notifications. For example: /// /// ```cpp /// #include diff --git a/src/core/markdown/include/sourcemeta/core/markdown.h b/src/core/markdown/include/sourcemeta/core/markdown.h index 0a1a10043a..557f17f6f9 100644 --- a/src/core/markdown/include/sourcemeta/core/markdown.h +++ b/src/core/markdown/include/sourcemeta/core/markdown.h @@ -23,7 +23,10 @@ namespace sourcemeta::core { /// @ingroup markdown /// Convert a Markdown string to an HTML fragment using GitHub Flavored /// Markdown (GFM) with all standard extensions enabled (tables, autolinks, -/// strikethrough, tag filtering, and task lists). For example: +/// strikethrough, tag filtering, and task lists). Raw HTML and dangerous +/// links in the input pass through unchanged by default, so the result is not +/// safe to render from untrusted input. Enabling safe mode suppresses raw HTML +/// and strips unsafe links. 
For example: /// /// ```cpp /// #include @@ -34,7 +37,8 @@ namespace sourcemeta::core { /// assert(result == "

Hello world

\n"); /// ``` SOURCEMETA_CORE_MARKDOWN_EXPORT -auto markdown_to_html(const std::string_view input) -> std::string; +auto markdown_to_html(const std::string_view input, const bool safe = false) + -> std::string; } // namespace sourcemeta::core diff --git a/src/core/markdown/markdown.cc b/src/core/markdown/markdown.cc index b15ea799fb..25aa4da70e 100644 --- a/src/core/markdown/markdown.cc +++ b/src/core/markdown/markdown.cc @@ -15,10 +15,12 @@ const bool cmark_initialized = namespace sourcemeta::core { -auto markdown_to_html(const std::string_view input) -> std::string { - static constexpr auto options{CMARK_OPT_VALIDATE_UTF8 | CMARK_OPT_FOOTNOTES | - CMARK_OPT_STRIKETHROUGH_DOUBLE_TILDE | - CMARK_OPT_GITHUB_PRE_LANG}; +auto markdown_to_html(const std::string_view input, const bool safe) + -> std::string { + static constexpr auto base_options{ + CMARK_OPT_VALIDATE_UTF8 | CMARK_OPT_FOOTNOTES | + CMARK_OPT_STRIKETHROUGH_DOUBLE_TILDE | CMARK_OPT_GITHUB_PRE_LANG}; + const int options{safe ? (base_options | CMARK_OPT_SAFE) : base_options}; auto *parser{cmark_parser_new(options)}; diff --git a/src/core/mcp/include/sourcemeta/core/mcp.h b/src/core/mcp/include/sourcemeta/core/mcp.h index de244891ee..44508d24a6 100644 --- a/src/core/mcp/include/sourcemeta/core/mcp.h +++ b/src/core/mcp/include/sourcemeta/core/mcp.h @@ -123,7 +123,9 @@ constexpr auto mcp_is_request_method(const JSON::StringView method) noexcept /// @ingroup mcp /// Resolve an `MCP-Protocol-Version` header value into a known protocol -/// version, or `std::nullopt` when the value is unrecognised. For example: +/// version, or `std::nullopt` when the value is unrecognised. An absent header +/// resolves to the oldest supported version per the Streamable HTTP transport. 
+/// For example: /// /// ```cpp /// #include diff --git a/src/core/mcp/mcp.cc b/src/core/mcp/mcp.cc index 969a9587a3..247e268f5e 100644 --- a/src/core/mcp/mcp.cc +++ b/src/core/mcp/mcp.cc @@ -321,6 +321,11 @@ auto mcp_make_initialize_result(const sourcemeta::core::JSON &request, requested_version = protocol_version_field->to_string(); } const auto resolved{mcp_resolve_protocol_version(requested_version)}; + // MCP lifecycle, version negotiation: "If the server supports the requested + // protocol version, it MUST respond with the same version. Otherwise, the + // server MUST respond with another protocol version it supports. This SHOULD + // be the latest version supported by the server." + // https://modelcontextprotocol.io/specification/2025-06-18/basic/lifecycle#version-negotiation const auto version{resolved.value_or(MCPProtocolVersion::V_2025_11_25)}; auto capabilities_object{sourcemeta::core::JSON::make_object()}; diff --git a/src/core/punycode/punycode.cc b/src/core/punycode/punycode.cc index 54e705799c..7d883c9c54 100644 --- a/src/core/punycode/punycode.cc +++ b/src/core/punycode/punycode.cc @@ -6,7 +6,6 @@ #include // assert #include // std::uint32_t, std::uint64_t #include // std::numeric_limits -#include // std::ostringstream #include // std::vector namespace sourcemeta::core { @@ -85,6 +84,11 @@ static auto punycode_encode(const std::u32string_view codepoints, non_basic_sorted.reserve(codepoints.size()); for (const auto code_point : codepoints) { + if (code_point > 0x10FFFF || + (code_point >= 0xD800 && code_point <= 0xDFFF)) { + throw PunycodeError("Invalid code point"); + } + if (is_basic(code_point)) { output.push_back(static_cast(code_point)); } else { @@ -172,11 +176,15 @@ static auto punycode_decode(const std::string_view encoded, std::uint32_t insertion_index{0}; std::uint32_t bias{INITIAL_BIAS}; + decoded.reserve(encoded.size()); + const auto delimiter_position = encoded.rfind(DELIMITER); std::size_t position{0}; - if (delimiter_position != 
std::string_view::npos) { - decoded.reserve(encoded.size()); + // RFC 3492 Section 6.2: the delimiter is consumed only when at least one + // basic code point precedes it. A leading delimiter is not consumed and + // becomes the first extended digit, which fails to decode. + if (delimiter_position != std::string_view::npos && delimiter_position > 0) { for (std::size_t index = 0; index < delimiter_position; index += 1) { const auto code_point = static_cast(encoded[index]); if (!is_basic(code_point)) { @@ -303,11 +311,12 @@ auto utf8_to_punycode(const std::string_view input) -> std::string { auto punycode_to_utf8(const std::string_view input) -> std::string { std::u32string decoded; punycode_decode(input, decoded); - std::ostringstream output_stream; + std::string output; + output.reserve(decoded.size()); for (const auto code_point : decoded) { - codepoint_to_utf8(code_point, output_stream); + codepoint_to_utf8(code_point, output); } - return output_stream.str(); + return output; } } // namespace sourcemeta::core diff --git a/src/core/regex/regex.cc b/src/core/regex/regex.cc index e8ac1fdd0b..a5fd7ae6a5 100644 --- a/src/core/regex/regex.cc +++ b/src/core/regex/regex.cc @@ -13,6 +13,23 @@ #include // std::errc #include // std::unreachable +namespace { + +auto make_bounded_match_context() -> pcre2_match_context * { + pcre2_match_context *context{pcre2_match_context_create(nullptr)}; + // A null context is a valid argument to the matcher and selects the default + // limits, so an allocation failure degrades safely rather than crashing + if (context == nullptr) { + return nullptr; + } + + pcre2_set_match_limit(context, 1000000); + pcre2_set_depth_limit(context, 1000); + return context; +} + +} // namespace + namespace sourcemeta::core { auto to_regex(const std::string_view pattern) -> std::optional { @@ -95,12 +112,17 @@ auto matches(const Regex ®ex, const std::string_view value) -> bool { case RegexIndex::PCRE2: { const RegexTypePCRE2 *pcre2_regex{std::get_if(®ex)}; 
auto *pcre2_code_ptr{static_cast(pcre2_regex->code.get())}; - // Re-use this to avoid creating and destroying the `struct`on every call + // These are intentionally never freed as they live for the lifetime of + // the thread and reusing them avoids allocating on every call thread_local pcre2_match_data *match_data{ pcre2_match_data_create(1, nullptr)}; + // Cap the work the matcher may perform so that pathological patterns + // applied to adversarial inputs terminate instead of running unbounded + thread_local pcre2_match_context *match_context{ + make_bounded_match_context()}; const int match_result{pcre2_match( pcre2_code_ptr, reinterpret_cast(value.data()), - value.size(), 0, PCRE2_NO_UTF_CHECK, match_data, nullptr)}; + value.size(), 0, PCRE2_NO_UTF_CHECK, match_data, match_context)}; return match_result >= 0; } case RegexIndex::Noop: diff --git a/src/core/time/CMakeLists.txt b/src/core/time/CMakeLists.txt index 62481adb0b..bfd35d6138 100644 --- a/src/core/time/CMakeLists.txt +++ b/src/core/time/CMakeLists.txt @@ -1,5 +1,5 @@ sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME time - SOURCES imf_fixdate.cc rfc850_date.cc asctime.cc + SOURCES helpers.h imf_fixdate.cc rfc850_date.cc asctime.cc rfc3339_datetime.cc rfc3339_fulldate.cc rfc3339_fulltime.cc rfc3339_partialtime_no_secfrac.cc rfc3339_duration.cc) diff --git a/src/core/time/asctime.cc b/src/core/time/asctime.cc index 777aed71e9..3622ad761e 100644 --- a/src/core/time/asctime.cc +++ b/src/core/time/asctime.cc @@ -1,5 +1,7 @@ #include +#include "helpers.h" + #include // assert #include // std::isdigit #include // std::chrono::system_clock @@ -75,6 +77,9 @@ auto from_asctime(const std::string_view value) noexcept if (stream.fail()) { return std::nullopt; } + if (!is_valid_broken_down_time(parts)) { + return std::nullopt; + } #if defined(_MSC_VER) return std::chrono::system_clock::from_time_t(_mkgmtime(&parts)); #else diff --git a/src/core/time/helpers.h b/src/core/time/helpers.h new file mode 100644 
index 0000000000..4eb86e14e8 --- /dev/null +++ b/src/core/time/helpers.h @@ -0,0 +1,39 @@ +#ifndef SOURCEMETA_CORE_TIME_HELPERS_H_ +#define SOURCEMETA_CORE_TIME_HELPERS_H_ + +#include + +#include // std::uint8_t, std::uint16_t +#include // std::tm +#include // std::cmp_greater + +namespace sourcemeta::core { + +// Validate the calendar fields of a broken-down time, since the conversion to +// an epoch time point would otherwise silently normalise out-of-range values +inline auto is_valid_broken_down_time(const std::tm &parts) -> bool { + if (parts.tm_mon < 0 || parts.tm_mon > 11) { + return false; + } + const auto month{static_cast(parts.tm_mon + 1)}; + const auto year{static_cast(parts.tm_year + 1900)}; + if (parts.tm_mday < 1 || + std::cmp_greater(parts.tm_mday, max_day_in_month(month, year))) { + return false; + } + if (parts.tm_hour < 0 || parts.tm_hour > 23) { + return false; + } + if (parts.tm_min < 0 || parts.tm_min > 59) { + return false; + } + // RFC 3339 §5.6: a leap second is represented as a "60" second value + if (parts.tm_sec < 0 || parts.tm_sec > 60) { + return false; + } + return true; +} + +} // namespace sourcemeta::core + +#endif diff --git a/src/core/time/imf_fixdate.cc b/src/core/time/imf_fixdate.cc index b4af8dde6e..fe712a70a0 100644 --- a/src/core/time/imf_fixdate.cc +++ b/src/core/time/imf_fixdate.cc @@ -1,5 +1,7 @@ #include +#include "helpers.h" + #include // assert #include // std::isdigit #include // std::chrono::system_clock @@ -70,6 +72,9 @@ auto from_imf_fixdate(const std::string_view value) noexcept if (stream.fail()) { return std::nullopt; } + if (!is_valid_broken_down_time(parts)) { + return std::nullopt; + } #if defined(_MSC_VER) return std::chrono::system_clock::from_time_t(_mkgmtime(&parts)); #else diff --git a/src/core/time/rfc850_date.cc b/src/core/time/rfc850_date.cc index a41e2576e3..86cc0b8953 100644 --- a/src/core/time/rfc850_date.cc +++ b/src/core/time/rfc850_date.cc @@ -1,5 +1,7 @@ #include +#include "helpers.h" + 
#include // std::ranges::find #include // std::array #include // assert @@ -113,6 +115,9 @@ auto from_rfc850_date(const std::string_view value) noexcept if (stream.fail()) { return std::nullopt; } + if (!is_valid_broken_down_time(parts)) { + return std::nullopt; + } #if defined(_MSC_VER) return std::chrono::system_clock::from_time_t(_mkgmtime(&parts)); #else diff --git a/src/core/unicode/include/sourcemeta/core/unicode.h b/src/core/unicode/include/sourcemeta/core/unicode.h index 7aef8328c0..e84387f641 100644 --- a/src/core/unicode/include/sourcemeta/core/unicode.h +++ b/src/core/unicode/include/sourcemeta/core/unicode.h @@ -27,7 +27,9 @@ namespace sourcemeta::core { /// @ingroup unicode -/// Encode a single Unicode codepoint as a UTF-8 string. For example: +/// Encode a single Unicode codepoint as a UTF-8 string. The codepoint must be +/// a valid Unicode scalar value, otherwise the output is unspecified. +/// For example: /// /// ```cpp /// #include @@ -39,8 +41,9 @@ SOURCEMETA_CORE_UNICODE_EXPORT auto codepoint_to_utf8(const char32_t codepoint) -> std::string; /// @ingroup unicode -/// Encode a single Unicode codepoint as UTF-8 into an output stream. -/// For example: +/// Encode a single Unicode codepoint as UTF-8 into an output stream. The +/// codepoint must be a valid Unicode scalar value, otherwise the output is +/// unspecified. For example: /// /// ```cpp /// #include @@ -56,7 +59,8 @@ auto codepoint_to_utf8(const char32_t codepoint, std::ostream &output) -> void; /// @ingroup unicode /// Encode a single Unicode codepoint as UTF-8, appending to an existing string. -/// For example: +/// The codepoint must be a valid Unicode scalar value, otherwise the output is +/// unspecified. For example: /// /// ```cpp /// #include @@ -138,7 +142,7 @@ inline constexpr auto utf8_lead_byte_size(const unsigned char byte) /// @ingroup unicode /// Check whether the given byte is a UTF-8 continuation byte (%x80-BF per -/// RFC 6532 Section 3.1). 
For example: +/// RFC 3629 Section 4). For example: /// /// ```cpp /// #include @@ -469,7 +473,7 @@ auto is_nfc(const std::u32string_view input) -> bool; /// @ingroup unicode /// Determine the byte length of the valid UTF-8 codepoint starting at the /// given position within the input. Returns 1 for an ASCII byte, 2/3/4 for a -/// valid multi-byte UTF-8 sequence (RFC 6532 Section 3.1, excluding overlong +/// valid multi-byte UTF-8 sequence (RFC 3629 Section 4, excluding overlong /// encodings, surrogates, and code points above U+10FFFF), or 0 if the bytes /// at that position do not start a valid UTF-8 codepoint. For example: /// @@ -500,7 +504,7 @@ utf8_codepoint_length(const std::string_view input, } // The second byte after the lead has tighter sub-ranges for specific leads - // (RFC 6532 §3.1) that exclude overlong encodings, surrogates, and code + // (RFC 3629 §4) that exclude overlong encodings, surrogates, and code // points above U+10FFFF const auto byte_1{static_cast(input[position + 1])}; bool byte_1_ok{false}; diff --git a/src/core/unicode/unicode.cc b/src/core/unicode/unicode.cc index da7884fb2d..f3d9463acc 100644 --- a/src/core/unicode/unicode.cc +++ b/src/core/unicode/unicode.cc @@ -1,13 +1,54 @@ #include -#include // assert -#include // std::uint8_t -#include // std::istringstream, std::ostringstream +#include // std::array +#include // assert +#include // std::size_t +#include // std::uint8_t +#include // std::optional, std::nullopt #include "unicode_data.h" namespace sourcemeta::core { +namespace { + +// Decode the code point of a multi-byte sequence from its lead byte and the +// continuation bytes that follow it, rejecting invalid continuation bytes, +// overlong encodings, and code points that are not valid scalar values +auto utf8_decode_sequence(const std::uint8_t lead, const std::uint8_t size, + const std::uint8_t *continuations) + -> std::optional { + char32_t code_point{0}; + char32_t minimum{0}; + if (size == 2) { + code_point = lead & 
0x1F; + minimum = 0x80; + } else if (size == 3) { + code_point = lead & 0x0F; + minimum = 0x800; + } else { + code_point = lead & 0x07; + minimum = 0x10000; + } + + for (std::uint8_t index{0}; index < size - 1; ++index) { + const auto continuation{continuations[index]}; + if (!is_utf8_continuation(continuation)) { + return std::nullopt; + } + + code_point = (code_point << 6) | (continuation & 0x3F); + } + + if (code_point < minimum || !is_valid_codepoint(code_point)) { + return std::nullopt; + } + + return code_point; +} + +} // namespace + auto codepoint_to_utf8(const char32_t codepoint, std::ostream &output) -> void { assert(is_valid_codepoint(codepoint)); if (codepoint < 0x80) { @@ -66,34 +107,20 @@ auto utf8_to_utf32(std::istream &input) -> std::optional { continue; } - char32_t code_point{0}; - char32_t minimum{0}; - if (size == 2) { - code_point = byte & 0x1F; - minimum = 0x80; - } else if (size == 3) { - code_point = byte & 0x0F; - minimum = 0x800; - } else { - code_point = byte & 0x07; - minimum = 0x10000; - } - - for (std::uint8_t index{1}; index < size; ++index) { - std::uint8_t continuation{0}; - if (!input.read(reinterpret_cast(&continuation), 1) || - !is_utf8_continuation(continuation)) { + std::array continuations{}; + for (std::uint8_t index{0}; index < size - 1; ++index) { + if (!input.read(reinterpret_cast(&continuations[index]), 1)) { return std::nullopt; } - - code_point = (code_point << 6) | (continuation & 0x3F); } - if (code_point < minimum || !is_valid_codepoint(code_point)) { + const auto code_point{ + utf8_decode_sequence(byte, size, continuations.data())}; + if (!code_point.has_value()) { return std::nullopt; } - result.push_back(code_point); + result.push_back(code_point.value()); } if (!input.eof()) { @@ -105,10 +132,38 @@ auto utf8_to_utf32(std::istream &input) -> std::optional { auto utf8_to_utf32(const std::string_view input) -> std::optional { - // TODO: Replace std::istringstream with std::ispanstream once libc++ - // supports it 
(__cpp_lib_spanstream), to avoid copying the input string - std::istringstream stream{std::string{input}}; - return utf8_to_utf32(stream); + std::u32string result; + result.reserve(input.size()); + + std::size_t position{0}; + while (position < input.size()) { + const auto byte{static_cast(input[position])}; + const auto size{utf8_lead_byte_size(byte)}; + if (size == 0) { + return std::nullopt; + } + if (size == 1) { + result.push_back(byte); + position += 1; + continue; + } + + if (input.size() - position < size) { + return std::nullopt; + } + + const auto code_point{utf8_decode_sequence( + byte, size, + reinterpret_cast(input.data() + position + 1))}; + if (!code_point.has_value()) { + return std::nullopt; + } + + result.push_back(code_point.value()); + position += size; + } + + return result; } auto combining_class(const char32_t codepoint) noexcept -> std::uint8_t { diff --git a/src/core/uri/accessors.cc b/src/core/uri/accessors.cc index 1f87839b11..6ba7d0ca46 100644 --- a/src/core/uri/accessors.cc +++ b/src/core/uri/accessors.cc @@ -1,9 +1,9 @@ #include #include -#include // std::uint32_t -#include // std::optional -#include // std::string +#include // std::uint32_t +#include // std::optional +#include // std::string_view namespace sourcemeta::core { @@ -66,7 +66,13 @@ auto URI::host() const -> std::optional { auto URI::port() const -> std::optional { return this->port_; } -auto URI::path() const -> std::optional { return this->path_; } +auto URI::path() const -> std::optional { + if (this->path_.has_value()) { + return this->path_.value(); + } + + return std::nullopt; +} auto URI::fragment() const -> std::optional { return this->fragment_; diff --git a/src/core/uri/canonicalize.cc b/src/core/uri/canonicalize.cc index dc23b33694..f623d45175 100644 --- a/src/core/uri/canonicalize.cc +++ b/src/core/uri/canonicalize.cc @@ -15,11 +15,6 @@ auto URI::canonicalize() -> URI & { sourcemeta::core::to_lowercase(this->scheme_.value()); } - // Lowercase host (hostnames are 
case-insensitive per RFC 3986) - if (this->host_.has_value()) { - sourcemeta::core::to_lowercase(this->host_.value()); - } - // Canonicalize path by removing "." and ".." segments if (this->path_.has_value() && !this->path_.value().empty()) { auto ¤t_path{this->path_.value()}; @@ -34,46 +29,34 @@ auto URI::canonicalize() -> URI & { this->fragment_ = std::nullopt; } - // pchar = unreserved / pct-encoded / sub-delims / ":" / "@" - // See https://www.rfc-editor.org/rfc/rfc3986#appendix-A - const auto is_pchar = [](char character) -> bool { - return uri_is_unreserved(character) || uri_is_sub_delim(character) || - character == URI_COLON || character == URI_AT; - }; - + // Only unreserved characters may be percent-decoded during normalization + // See https://www.rfc-editor.org/rfc/rfc3986#section-6.2.2.2 if (this->path_.has_value()) { uri_normalize_percent_encoding_inplace(this->path_.value()); - uri_unescape_if_inplace(this->path_.value(), is_pchar); + uri_unescape_unreserved_inplace(this->path_.value()); } if (this->query_.has_value()) { uri_normalize_percent_encoding_inplace(this->query_.value()); - uri_unescape_if_inplace(this->query_.value(), [&](char character) { - return is_pchar(character) || character == URI_SLASH || - character == URI_QUESTION; - }); + uri_unescape_unreserved_inplace(this->query_.value()); } if (this->fragment_.has_value()) { uri_normalize_percent_encoding_inplace(this->fragment_.value()); - uri_unescape_if_inplace(this->fragment_.value(), [&](char character) { - return is_pchar(character) || character == URI_SLASH || - character == URI_QUESTION; - }); + uri_unescape_unreserved_inplace(this->fragment_.value()); } if (this->userinfo_.has_value()) { uri_normalize_percent_encoding_inplace(this->userinfo_.value()); - uri_unescape_if_inplace(this->userinfo_.value(), [&](char character) { - return uri_is_sub_delim(character) || character == URI_COLON; - }); + uri_unescape_unreserved_inplace(this->userinfo_.value()); } + // Hostnames are 
case-insensitive per RFC 3986, and the lowercasing must come + // after decoding so that a percent-encoded uppercase letter is also folded if (this->host_.has_value()) { uri_normalize_percent_encoding_inplace(this->host_.value()); - uri_unescape_if_inplace(this->host_.value(), [](char character) { - return uri_is_sub_delim(character); - }); + uri_unescape_unreserved_inplace(this->host_.value()); + sourcemeta::core::to_lowercase(this->host_.value()); } // Remove default ports (80 for http, 443 for https) diff --git a/src/core/uri/escaping.h b/src/core/uri/escaping.h index 0f60013f99..41a71a6423 100644 --- a/src/core/uri/escaping.h +++ b/src/core/uri/escaping.h @@ -3,14 +3,9 @@ #include "grammar.h" -#include // std::array -#include // std::isalnum -#include // std::from_chars -#include // std::uint8_t -#include // std::istream -#include // std::istream_iterator -#include // std::ostream -#include // std::string +#include // std::isxdigit, std::toupper +#include // std::uint8_t +#include // std::string namespace sourcemeta::core { @@ -38,107 +33,6 @@ enum class URIEscapeMode : std::uint8_t { UserInfo }; -inline auto uri_escape(std::istream &input, std::ostream &output, - const URIEscapeMode mode, - const bool preserve_percent_sequences = true) -> void { - char character = 0; - while (input.get(character)) { - // Check if this is an already percent-encoded sequence (%HEXHEX) - // If so, preserve it as-is to avoid double-encoding - // (only when preserve_percent_sequences is true) - if (preserve_percent_sequences && character == URI_PERCENT) { - const auto position = input.tellg(); - char next_1 = 0; - char next_2 = 0; - - if (input.get(next_1) && input.get(next_2) && - std::isxdigit(static_cast(next_1)) && - std::isxdigit(static_cast(next_2))) { - // Valid percent-encoded sequence - preserve it - output << character << next_1 << next_2; - continue; - } - - // Not a valid percent-encoded sequence - restore position and escape % - input.seekg(position); - } - - // 
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" - // See https://www.rfc-editor.org/rfc/rfc3986#appendix-A - if (uri_is_unreserved(character)) { - output << character; - continue; - } - - if (mode == URIEscapeMode::SkipSubDelims || mode == URIEscapeMode::Path || - mode == URIEscapeMode::Fragment || mode == URIEscapeMode::Filesystem || - mode == URIEscapeMode::UserInfo) { - if (uri_is_sub_delim(character)) { - output << character; - continue; - } - } - - if (mode == URIEscapeMode::Path) { - if (character == URI_COLON || character == URI_AT || - character == URI_SLASH) { - output << character; - continue; - } - } - - if (mode == URIEscapeMode::Fragment) { - if (character == URI_COLON || character == URI_AT || - character == URI_SLASH || character == URI_QUESTION) { - output << character; - continue; - } - } - - if (mode == URIEscapeMode::Filesystem || mode == URIEscapeMode::UserInfo) { - if (character == URI_COLON) { - output << character; - continue; - } - } - - const auto byte{static_cast(character)}; - const auto high{(byte >> 4) & 0x0F}; - const auto low{byte & 0x0F}; - output << URI_PERCENT; - output << static_cast(high < 10 ? '0' + high : 'A' + high - 10); - output << static_cast(low < 10 ? 
'0' + low : 'A' + low - 10); - } -} - -inline auto uri_unescape(std::istream &input, std::ostream &output) -> void { - std::istream_iterator iterator(input); - std::istream_iterator end; - auto plus_1 = std::ranges::next(iterator, 1, end); - auto plus_2 = std::ranges::next(plus_1, 1, end); - const int hex_base = 16; - - while (iterator != end) { - if (*iterator == URI_PERCENT && plus_1 != end && plus_2 != end && - std::isxdigit(*(plus_1)) && std::isxdigit(*(plus_2))) { - const std::array hex{{*plus_1, *plus_2}}; - int decoded_value{}; - std::from_chars(hex.data(), hex.data() + hex.size(), decoded_value, - hex_base); - output << static_cast(decoded_value); - - iterator = std::ranges::next(plus_2, 1, end); - plus_1 = std::ranges::next(iterator, 1, end); - plus_2 = std::ranges::next(plus_1, 1, end); - } else { - output << *iterator; - iterator = plus_1; - plus_1 = plus_2; - plus_2 = std::ranges::next(plus_1, 1, end); - } - } -} - inline auto uri_hex_to_int(char character) -> unsigned char { if (character >= '0' && character <= '9') { return static_cast(character - '0'); diff --git a/src/core/uri/filesystem.cc b/src/core/uri/filesystem.cc index ac9ca084ce..c789fbb5b9 100644 --- a/src/core/uri/filesystem.cc +++ b/src/core/uri/filesystem.cc @@ -47,7 +47,7 @@ auto append_raw_segment(std::optional &path, namespace sourcemeta::core { auto URI::to_path() const -> std::filesystem::path { - auto path = this->path().value_or(""); + std::string path{this->path().value_or("")}; // For non-file URIs, just return the path as-is if (!this->is_file()) { diff --git a/src/core/uri/include/sourcemeta/core/uri.h b/src/core/uri/include/sourcemeta/core/uri.h index 9dbbde73de..084a8dbcf6 100644 --- a/src/core/uri/include/sourcemeta/core/uri.h +++ b/src/core/uri/include/sourcemeta/core/uri.h @@ -209,11 +209,13 @@ class SOURCEMETA_CORE_URI_EXPORT URI { /// /// const sourcemeta::core::URI uri{"https://www.sourcemeta.com"}; /// assert(uri.host().has_value()); - /// assert(uri.host().value() == 
"sourcemeta.com"); + /// assert(uri.host().value() == "www.sourcemeta.com"); /// ``` [[nodiscard]] auto host() const -> std::optional; - /// Get the port part of the URI, if any. For example: + /// Get the port part of the URI, if any. Parsing rejects a port that does not + /// fit in 32 bits even though RFC 3986 leaves the production unbounded. For + /// example: /// /// ```cpp /// #include @@ -236,7 +238,7 @@ class SOURCEMETA_CORE_URI_EXPORT URI { /// assert(uri.path().has_value()); /// assert(uri.path().value() == "/foo/bar"); /// ``` - [[nodiscard]] auto path() const -> std::optional; + [[nodiscard]] auto path() const -> std::optional; /// Set the path part of the URI. For example: /// @@ -495,7 +497,7 @@ class SOURCEMETA_CORE_URI_EXPORT URI { /// /// const sourcemeta::core::URI /// uri{"https://www.sourcemeta.com/foo/../bar"}; - /// assert(uri.recompose() == "https://sourcemeta.com/bar"); + /// assert(uri.recompose() == "https://www.sourcemeta.com/foo/../bar"); /// ``` [[nodiscard]] auto recompose() const -> std::string; @@ -526,7 +528,7 @@ class SOURCEMETA_CORE_URI_EXPORT URI { /// uri{"https://www.sourcemeta.com/foo#bar"}; /// assert(uri.recompose_without_fragment().has_value()); /// assert(uri.recompose_without_fragment().value() == - /// "https://sourcemeta.com/foo"); + /// "https://www.sourcemeta.com/foo"); /// ``` [[nodiscard]] auto recompose_without_fragment() const -> std::optional; @@ -565,7 +567,7 @@ class SOURCEMETA_CORE_URI_EXPORT URI { /// const sourcemeta::core::URI base{"https://www.sourcemeta.com"}; /// sourcemeta::core::URI result{"foo"}; /// result.resolve_from(base); - /// assert(result.recompose() == "https://sourcemeta.com/foo"); + /// assert(result.recompose() == "https://www.sourcemeta.com/foo"); /// ``` auto resolve_from(const URI &base) -> URI &; diff --git a/src/core/uri/parse.cc b/src/core/uri/parse.cc index 385bbb1b22..7786bda3b4 100644 --- a/src/core/uri/parse.cc +++ b/src/core/uri/parse.cc @@ -606,10 +606,6 @@ auto do_parse(const 
std::string_view input, uri_unescape_unreserved_inplace(parsed_path.value()); path = std::move(parsed_path.value()); - } else if (has_authority || has_scheme) { - if (input.ends_with(URI_SLASH) || input == "/") { - path = "/"; - } } } diff --git a/src/core/uri/recompose.cc b/src/core/uri/recompose.cc index 711635d012..702e1cf822 100644 --- a/src/core/uri/recompose.cc +++ b/src/core/uri/recompose.cc @@ -117,8 +117,7 @@ auto URI::recompose_relative() const -> std::string { const auto first_slash = path_value.find('/'); const auto first_segment_length = first_slash == std::string::npos ? path_value.size() : first_slash; - const std::string_view first_segment{path_value.data(), - first_segment_length}; + const auto first_segment{path_value.substr(0, first_segment_length)}; if (first_segment.contains(':')) { std::string encoded; encoded.reserve(first_segment_length + 4); diff --git a/src/core/uri/resolution.cc b/src/core/uri/resolution.cc index 75f4ea65b2..d102adfd7e 100644 --- a/src/core/uri/resolution.cc +++ b/src/core/uri/resolution.cc @@ -2,7 +2,6 @@ #include "normalize.h" -#include // assert #include // std::optional #include // std::string @@ -38,11 +37,6 @@ namespace sourcemeta::core { auto URI::resolve_from(const URI &base) -> URI & { // RFC 3986 Section 5.2.2: Transform References - // Check if this is a dot reference ("." or "./") before we modify the path - const bool was_dot_reference = - this->path_.has_value() && - (this->path_.value() == "." || this->path_.value() == "./"); - // Reference has a scheme - use as-is (already absolute) if (this->scheme_.has_value()) { if (this->path_.has_value()) { @@ -97,18 +91,6 @@ auto URI::resolve_from(const URI &base) -> URI & { // Reference has empty path if (!this->path_.has_value() || this->path_.value().empty()) { - // Special case: "." 
or "./" resolves to the containing directory - if (was_dot_reference) { - const auto base_path = base.path_.value_or(""); - const auto last_slash = base_path.rfind('/'); - if (last_slash != std::string::npos) { - this->path_ = base_path.substr(0, last_slash + 1); - } else { - this->path_ = std::nullopt; - } - return *this; - } - // Empty path with query or fragment means use base path this->path_ = base.path_; if (!this->query_.has_value()) { diff --git a/src/core/uri/setters.cc b/src/core/uri/setters.cc index dac8e9beb3..d8b7b29dc1 100644 --- a/src/core/uri/setters.cc +++ b/src/core/uri/setters.cc @@ -39,6 +39,45 @@ auto normalize_fragment(const std::string_view input) -> std::string { return std::string{input.starts_with('#') ? input.substr(1) : input}; } +// Raw string validation against the RFC 3986 Section 3.3 path productions. The +// input is not parsed as a URI because that would misclassify ':' in the first +// segment as a scheme delimiter and silently drop a '?' or '#' suffix +auto validate_raw_path(const std::string_view path) -> void { + if (path.starts_with("//")) { + throw sourcemeta::core::URIError{ + "You cannot set a path that contains an authority"}; + } + + for (std::size_t index = 0; index < path.size(); ++index) { + const char character = path[index]; + if (character == '%') { + if (index + 2 >= path.size() || + !std::isxdigit(static_cast(path[index + 1])) || + !std::isxdigit(static_cast(path[index + 2]))) { + throw sourcemeta::core::URIError{ + "You cannot set a path with an invalid percent-encoded sequence"}; + } + index += 2; + continue; + } + if (sourcemeta::core::uri_is_unreserved(character) || + sourcemeta::core::uri_is_sub_delim(character) || character == ':' || + character == '@' || character == '/') { + continue; + } + if (character == '?') { + throw sourcemeta::core::URIError{ + "You cannot set a path that contains a query"}; + } + if (character == '#') { + throw sourcemeta::core::URIError{ + "You cannot set a path that contains a 
fragment"}; + } + throw sourcemeta::core::URIError{ + "You cannot set a path that contains an invalid character"}; + } +} + } // namespace namespace sourcemeta::core { @@ -53,8 +92,7 @@ auto URI::path(const std::string &path) -> URI & { throw URIError{"You cannot set a relative path to an absolute URI"}; } - // Parse the path string to extract its normalized value - const auto parsed_path = URI{path}.path_; + validate_raw_path(path); // Determine if this URI needs a leading slash // (URIs with scheme/authority need leading slash, except URNs/tags/mailto) @@ -63,7 +101,8 @@ auto URI::path(const std::string &path) -> URI & { this->scheme_.has_value()) || this->port_.has_value() || this->host_.has_value(); - this->path_ = apply_leading_slash_transform(parsed_path, needs_leading_slash); + this->path_ = apply_leading_slash_transform(std::optional{path}, + needs_leading_slash); return *this; } @@ -77,8 +116,7 @@ auto URI::path(std::string &&path) -> URI & { throw URIError{"You cannot set a relative path to an absolute URI"}; } - // Parse the path string to extract its normalized value - const auto parsed_path = URI{path}.path_; + validate_raw_path(path); // Determine if this URI needs a leading slash // (URIs with scheme/authority need leading slash, except URNs/tags/mailto) @@ -87,7 +125,8 @@ auto URI::path(std::string &&path) -> URI & { this->scheme_.has_value()) || this->port_.has_value() || this->host_.has_value(); - this->path_ = apply_leading_slash_transform(parsed_path, needs_leading_slash); + this->path_ = apply_leading_slash_transform( + std::optional{std::move(path)}, needs_leading_slash); return *this; } @@ -284,7 +323,9 @@ auto URI::extension(std::string &&extension) -> URI & { } auto URI::fragment(const std::string_view fragment) -> URI & { - this->fragment_ = normalize_fragment(std::string{fragment}); + auto value{normalize_fragment(fragment)}; + uri_unescape_unreserved_inplace(value); + this->fragment_ = std::move(value); return *this; } diff --git 
a/src/core/uritemplate/helpers.h b/src/core/uritemplate/helpers.h index a071ed5a0d..d016515838 100644 --- a/src/core/uritemplate/helpers.h +++ b/src/core/uritemplate/helpers.h @@ -5,6 +5,7 @@ #include // std::array #include // std::size_t +#include // std::uint16_t #include // std::string #include // std::string_view #include // std::void_t @@ -52,6 +53,38 @@ static constexpr std::array HEX_DIGITS = { {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}}; +// The prefix modifier counts characters rather than bytes, where a percent +// encoded triplet and a multi-byte UTF-8 sequence each count as one character +// See https://www.rfc-editor.org/rfc/rfc6570#section-2.4.1 +inline auto prefix_by_characters(const std::string_view input, + const std::uint16_t characters) + -> std::string_view { + std::string_view::size_type position = 0; + std::uint16_t taken = 0; + while (position < input.size() && taken < characters) { + if (input[position] == '%' && position + 2 < input.size() && + is_hex(input[position + 1]) && is_hex(input[position + 2])) { + position += 3; + } else { + const auto lead = static_cast(input[position]); + const std::string_view::size_type size = lead < 0x80 ? 1 + : lead < 0xE0 ? 2 + : lead < 0xF0 ? 3 + : lead < 0xF8 ? 4 + : 1; + position += size; + } + + taken++; + } + + if (position > input.size()) { + position = input.size(); + } + + return input.substr(0, position); +} + inline auto append_percent_encoded(std::string &output, const char character) -> void { const auto byte = static_cast(character); @@ -111,6 +144,32 @@ inline auto append_name(std::string &result, const std::string_view name, } } +// RFC 6570 Section 2.1: a literal character outside the pct-encoded form. 
The +// apostrophe is also accepted because the specification's own normative +// examples rely on it as a literal +inline auto is_literal_char(const char character) noexcept -> bool { + const auto byte = static_cast(character); + if (byte >= 0x80) { + return true; + } + + switch (byte) { + case 0x21: + case 0x23: + case 0x24: + case 0x26: + case 0x27: + case 0x3D: + case 0x5D: + case 0x5F: + case 0x7E: + return true; + default: + return (byte >= 0x28 && byte <= 0x3B) || (byte >= 0x3F && byte <= 0x5B) || + (byte >= 0x61 && byte <= 0x7A); + } +} + // RFC 6570 Section 2.3: varchar = ALPHA / DIGIT / "_" inline auto is_varchar(const char character) noexcept -> bool { return (character >= 'A' && character <= 'Z') || @@ -245,13 +304,17 @@ parse_variable_list(const std::string_view input, std::size_t position, throw URITemplateParseError(1); } + // A varspec must be followed by another varspec or the expression close + // See https://www.rfc-editor.org/rfc/rfc6570#section-2.3 if (input[position] == '}') { break; } - if (input[position] == ',') { - position++; + if (input[position] != ',') { + throw URITemplateParseError(position + 1); } + + position++; } return position; @@ -266,18 +329,28 @@ auto parse_expression(const std::string_view input) return std::nullopt; } - if (input[0] == '}') { - throw URITemplateParseError(1); - } - - std::size_t position = 1; + std::size_t position = 0; while (position < input.size()) { - if (input[position] == '{') { + const char character = input[position]; + if (character == '{') { break; } - if (input[position] == '}') { + + // See https://www.rfc-editor.org/rfc/rfc6570#section-2.1 + if (character == '%') { + if (position + 2 >= input.size() || !is_hex(input[position + 1]) || + !is_hex(input[position + 2])) { + throw URITemplateParseError(position + 1); + } + + position += 3; + continue; + } + + if (!is_literal_char(character)) { throw URITemplateParseError(position + 1); } + position++; } @@ -350,7 +423,7 @@ auto expand_expression( 
auto actual_value = value; if (variable.length > 0) { - actual_value = actual_value.substr(0, variable.length); + actual_value = prefix_by_characters(actual_value, variable.length); } if (variable.explode) { diff --git a/src/core/uritemplate/include/sourcemeta/core/uritemplate.h b/src/core/uritemplate/include/sourcemeta/core/uritemplate.h index b4e7fc4b90..29f5a49ffd 100644 --- a/src/core/uritemplate/include/sourcemeta/core/uritemplate.h +++ b/src/core/uritemplate/include/sourcemeta/core/uritemplate.h @@ -103,19 +103,31 @@ class SOURCEMETA_CORE_URITEMPLATE_EXPORT URITemplate { template > [[nodiscard]] auto expand(const Container &variables) const -> std::string { - return this->expand([&variables]( - const std::string_view name) -> URITemplateValue { - const auto iterator{variables.find(typename Container::key_type{name})}; - if (iterator == variables.end()) { - return std::nullopt; - } else { - return std::make_tuple(std::string_view{iterator->second}, std::nullopt, - false); - } - }); + return this->expand( + [&variables](const std::string_view name) -> URITemplateValue { + const auto iterator{find_variable(variables, name)}; + if (iterator == variables.end()) { + return std::nullopt; + } else { + return std::make_tuple(std::string_view{iterator->second}, + std::nullopt, false); + } + }); } private: + // Prefer a heterogeneous lookup so the key is not materialized when the + // container supports it + template + static auto find_variable(const Container &variables, + const std::string_view name) { + if constexpr (requires { variables.find(name); }) { + return variables.find(name); + } else { + return variables.find(typename Container::key_type{name}); + } + } + // Exporting symbols that depends on the standard C++ library is considered // safe. 
// https://learn.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4275?view=msvc-170&redirectedfrom=MSDN diff --git a/src/core/uritemplate/uritemplate.cc b/src/core/uritemplate/uritemplate.cc index 569b8ebf72..0f4373c6c5 100644 --- a/src/core/uritemplate/uritemplate.cc +++ b/src/core/uritemplate/uritemplate.cc @@ -47,7 +47,7 @@ URITemplate::URITemplate(const std::string_view source) { URITemplateTokenQueryContinuationExpansion, URITemplateTokenVariable, URITemplateTokenLiteral>( remaining, offset, &this->tokens_)) { - break; + throw URITemplateParseError(offset + 1); } } catch (URITemplateParseError &error) { throw URITemplateParseError(offset + error.column()); diff --git a/src/core/uritemplate/uritemplate_router_view.cc b/src/core/uritemplate/uritemplate_router_view.cc index ad17300bc1..4020dec4f8 100644 --- a/src/core/uritemplate/uritemplate_router_view.cc +++ b/src/core/uritemplate/uritemplate_router_view.cc @@ -4,6 +4,7 @@ #include // std::ranges::sort #include // std::array #include // assert +#include // std::uint32_t, std::uint8_t, std::uintptr_t #include // std::memcmp, std::memcpy #include // std::ofstream #include // std::numeric_limits @@ -585,7 +586,9 @@ auto URITemplateRouterView::match( if (variable_node.type == URITemplateRouter::NodeType::OptionalExpansion) { if (variable_node.string_offset > string_table_size || variable_node.string_length > - string_table_size - variable_node.string_offset) { + string_table_size - variable_node.string_offset || + variable_index > + std::numeric_limits::max()) { return finalize_match(otherwise_context, 0, 0); } callback(static_cast(variable_index), diff --git a/src/core/yaml/parser.h b/src/core/yaml/parser.h index 992e16caa6..bb28ef7812 100644 --- a/src/core/yaml/parser.h +++ b/src/core/yaml/parser.h @@ -223,6 +223,22 @@ class Parser { } private: + static constexpr std::size_t maximum_expanded_nodes{10000000}; + + auto count_expanded_nodes(const JSON &value) -> std::size_t { + 
std::size_t total{1}; + if (value.is_array()) { + for (const auto &element : value.as_array()) { + total += this->count_expanded_nodes(element); + } + } else if (value.is_object()) { + for (const auto &entry : value.as_object()) { + total += this->count_expanded_nodes(entry.second); + } + } + return total; + } + auto process_directives(Token &token) -> void { bool seen_yaml_directive{false}; while (token.type == TokenType::DirectiveYAML || @@ -1412,6 +1428,12 @@ class Parser { callback_index++; } + this->expanded_nodes_ += this->count_expanded_nodes(anchored.value); + if (this->expanded_nodes_ > maximum_expanded_nodes) [[unlikely]] { + throw YAMLParseError{token.line, token.column, + "Maximum YAML alias expansion exceeded"}; + } + return anchored.value; } @@ -1953,6 +1975,7 @@ class Parser { std::unordered_map anchors_; bool recording_anchor_{false}; bool indent_width_detected_{false}; + std::size_t expanded_nodes_{0}; std::vector current_anchor_callbacks_; std::deque pending_tokens_; std::optional pending_token_position_; diff --git a/src/lang/error/include/sourcemeta/core/error_file.h b/src/lang/error/include/sourcemeta/core/error_file.h index b597fe677d..e423d1fba0 100644 --- a/src/lang/error/include/sourcemeta/core/error_file.h +++ b/src/lang/error/include/sourcemeta/core/error_file.h @@ -1,8 +1,7 @@ #ifndef SOURCEMETA_CORE_ERROR_FILE_H_ #define SOURCEMETA_CORE_ERROR_FILE_H_ -#include // assert -#include // std::filesystem::path, std::filesystem::exists +#include // std::filesystem::path #include // std::move, std::forward namespace sourcemeta::core { @@ -21,9 +20,7 @@ template class FileError : public T { public: template FileError(std::filesystem::path path, Args &&...args) - : T{std::forward(args)...}, path_{std::move(path)} { - assert(std::filesystem::exists(this->path_)); - } + : T{std::forward(args)...}, path_{std::move(path)} {} [[nodiscard]] auto path() const noexcept -> const std::filesystem::path & { return this->path_; diff --git 
a/src/lang/io/include/sourcemeta/core/io.h b/src/lang/io/include/sourcemeta/core/io.h index 82a2cc6b5f..6823a2e3ea 100644 --- a/src/lang/io/include/sourcemeta/core/io.h +++ b/src/lang/io/include/sourcemeta/core/io.h @@ -20,6 +20,7 @@ #include // std::function #include // std::cin #include // std::basic_istream +#include // std::numeric_limits #include // std::ostream #include // std::span #include // std::basic_ostringstream @@ -120,12 +121,12 @@ auto read_file(const std::filesystem::path &path) } const auto canonical_path{sourcemeta::core::canonical(path)}; - std::ifstream stream{canonical_path}; + std::basic_ifstream stream{canonical_path}; if (!stream.is_open()) { throw IOFilePermissionError{canonical_path}; } - stream.exceptions(std::ifstream::badbit); + stream.exceptions(std::basic_ifstream::badbit); return stream; } @@ -201,6 +202,15 @@ auto read_file_to_string(const std::filesystem::path &path) return read_to_string(stream); } + // On 32-bit targets the file size can exceed what an in-memory string can + // hold, in which case fall back to a streaming read that grows as needed + if constexpr (std::numeric_limits::max() > + std::numeric_limits::max()) { + if (size > std::numeric_limits::max()) { + return read_to_string(stream); + } + } + std::basic_string result; result.resize(static_cast(size)); stream.read(result.data(), static_cast(result.size())); diff --git a/src/lang/io/include/sourcemeta/core/io_fileview.h b/src/lang/io/include/sourcemeta/core/io_fileview.h index 4cddf041cc..d028af3896 100644 --- a/src/lang/io/include/sourcemeta/core/io_fileview.h +++ b/src/lang/io/include/sourcemeta/core/io_fileview.h @@ -43,6 +43,8 @@ class SOURCEMETA_CORE_IO_EXPORT FileView { [[nodiscard]] auto size() const noexcept -> std::size_t; /// Interpret the memory-mapped data as a pointer to T at the given offset. + /// The caller must ensure that the offset yields a pointer suitably aligned + /// for T, as dereferencing a misaligned pointer is undefined behavior. 
template [[nodiscard]] auto as(const std::size_t offset = 0) const noexcept -> const T * { diff --git a/src/lang/io/io_atomic.cc b/src/lang/io/io_atomic.cc index aafa10cfaa..a48f501e2e 100644 --- a/src/lang/io/io_atomic.cc +++ b/src/lang/io/io_atomic.cc @@ -1,12 +1,17 @@ #include #include -#include // assert -#include // EACCES, errno -#include // std::filesystem -#include // std::ofstream -#include // std::ios::binary, std::ios::trunc -#include // std::ostream +#include // assert +#include // EACCES, errno +#include // std::size_t +#include // std::uint64_t +#include // std::filesystem +#include // std::ofstream +#include // std::ios::binary, std::ios::trunc +#include // std::ostream +#include // std::random_device, std::mt19937_64, std::uniform_int_distribution +#include // std::string +#include // std::string_view #include // std::error_code, std::generic_category #if defined(__linux__) @@ -21,15 +26,35 @@ namespace { +auto unique_staging_path(const std::filesystem::path &destination) + -> std::filesystem::path { + thread_local std::mt19937_64 generator{std::random_device{}()}; + std::uniform_int_distribution distribution; + std::string suffix{".tmp."}; + suffix.reserve(suffix.size() + 16); + static constexpr std::string_view digits{"0123456789abcdef"}; + auto value{distribution(generator)}; + for (std::size_t index{0}; index < 16; index++) { + suffix.push_back(digits[value & 0xF]); + value >>= 4; + } + + std::filesystem::path staging{destination}; + staging += suffix; + return staging; +} + class AtomicFileWriter { public: AtomicFileWriter(const std::filesystem::path &destination) - : destination_{destination}, staging_{destination} { + : destination_{destination}, staging_{unique_staging_path(destination)} { // The staging file lives next to the destination so that // `std::filesystem::rename` stays on a single filesystem and remains // atomic. 
Using the system-wide temporary directory would risk `EXDEV` // errors on cross-filesystem builds (CI containers, NFS mounts, etc.). - this->staging_ += ".tmp"; + // The staging name carries a per-writer random suffix so that concurrent + // writers targeting the same destination do not interleave into one + // staging file and break atomicity if (this->destination_.has_parent_path()) { std::filesystem::create_directories(this->destination_.parent_path()); diff --git a/src/lang/io/io_fileview.cc b/src/lang/io/io_fileview.cc index 2090572144..d7caad00be 100644 --- a/src/lang/io/io_fileview.cc +++ b/src/lang/io/io_fileview.cc @@ -30,6 +30,12 @@ FileView::FileView(const std::filesystem::path &path) { } this->size_ = static_cast(file_size.QuadPart); + // Mapping a zero-length file is not possible, so leave the view empty + if (this->size_ == 0) { + this->data_ = nullptr; + return; + } + this->mapping_handle_ = CreateFileMappingW(this->file_handle_, nullptr, PAGE_READONLY, 0, 0, nullptr); if (this->mapping_handle_ == nullptr) { @@ -77,6 +83,12 @@ FileView::FileView(const std::filesystem::path &path) { } this->size_ = static_cast(file_stat.st_size); + // Mapping a zero-length region fails with EINVAL, so leave the view empty + if (this->size_ == 0) { + this->data_ = nullptr; + return; + } + void *mapped = mmap(nullptr, this->size_, PROT_READ, MAP_PRIVATE, this->file_descriptor_, 0); if (mapped == MAP_FAILED) { diff --git a/src/lang/numeric/big_coefficient.h b/src/lang/numeric/big_coefficient.h index 383f04d0d8..705ee35e3e 100644 --- a/src/lang/numeric/big_coefficient.h +++ b/src/lang/numeric/big_coefficient.h @@ -480,6 +480,15 @@ class BigCoefficient { exponent--; } + // A negative exponent on an integral value means the coefficient carries + // trailing zeros that the scale removes, so dividing recovers the true + // magnitude rather than returning a result that is too large by a power of + // ten + while (exponent < 0) { + value = value / 10; + exponent++; + } + return 
value; } diff --git a/src/lang/numeric/decimal.cc b/src/lang/numeric/decimal.cc index 3c179c290f..4e0bfbefed 100644 --- a/src/lang/numeric/decimal.cc +++ b/src/lang/numeric/decimal.cc @@ -11,9 +11,11 @@ #include // std::strlen #include // std::setprecision #include // std::numeric_limits +#include // std::optional, std::nullopt #include // std::ostringstream #include // std::out_of_range #include // std::string, std::stof, std::stod +#include // std::vector namespace { @@ -69,16 +71,28 @@ struct ParsedDecimal { auto parse_digit_payload(const char *cursor, std::size_t count) -> std::int64_t { - std::int64_t payload = 0; + // The diagnostic payload of a NaN carries no arithmetic meaning here, so a + // payload longer than the storage saturates rather than overflowing, which + // would otherwise be signed integer overflow (undefined behaviour) + constexpr auto maximum{ + static_cast(std::numeric_limits::max())}; + std::uint64_t payload = 0; for (std::size_t index = 0; index < count; index++) { - payload = payload * 10 + (cursor[index] - '0'); + const auto digit = static_cast(cursor[index] - '0'); + if (payload > (maximum - digit) / 10) { + return std::numeric_limits::max(); + } + + payload = payload * 10 + digit; } - return payload; + return static_cast(payload); } -auto parse_special(const char *input, std::size_t length) -> ParsedDecimal * { - static ParsedDecimal result; +auto parse_special(const char *input, std::size_t length) + -> std::optional { + ParsedDecimal result{ + .coefficient = 0, .coefficient_high = 0, .exponent = 0, .flags = 0}; const char *cursor = input; std::uint8_t sign_flag = 0; @@ -99,7 +113,7 @@ auto parse_special(const char *input, std::size_t length) -> ParsedDecimal * { result.exponent = 0; result.coefficient_high = 0; result.coefficient = parse_digit_payload(cursor + 3, remaining - 3); - return &result; + return result; } if (remaining >= 4 && (cursor[0] == 's' || cursor[0] == 'S') && @@ -110,7 +124,7 @@ auto parse_special(const char 
*input, std::size_t length) -> ParsedDecimal * { result.exponent = 0; result.coefficient_high = 0; result.coefficient = parse_digit_payload(cursor + 4, remaining - 4); - return &result; + return result; } if (remaining >= 3 && (cursor[0] == 'I' || cursor[0] == 'i') && @@ -126,17 +140,17 @@ auto parse_special(const char *input, std::size_t length) -> ParsedDecimal * { result.coefficient = 0; result.exponent = 0; result.coefficient_high = 0; - return &result; + return result; } - return nullptr; + return std::nullopt; } auto parse_decimal_string(const char *input, std::size_t length) -> ParsedDecimal { - auto *special = parse_special(input, length); - if (special) { - return *special; + const auto special = parse_special(input, length); + if (special.has_value()) { + return special.value(); } ParsedDecimal result{ @@ -155,17 +169,27 @@ auto parse_decimal_string(const char *input, std::size_t length) throw sourcemeta::core::DecimalParseError{}; } - std::array digit_buffer{}; + // RFC 8259 section 6 permits numbers of arbitrary magnitude and precision, so + // every significant digit must be retained to build the coefficient exactly. 
+ // The inline buffer keeps the common case allocation-free, and inputs longer + // than it spill into a heap buffer sized to the remaining input so that the + // digit reads below can never run past the storage + std::array inline_digit_buffer{}; + std::vector heap_digit_buffer; + char *digit_buffer = inline_digit_buffer.data(); + const auto maximum_digits = static_cast(end - cursor); + if (maximum_digits > inline_digit_buffer.size()) { + heap_digit_buffer.resize(maximum_digits); + digit_buffer = heap_digit_buffer.data(); + } + std::uint32_t digit_count_total = 0; std::int32_t decimal_offset = -1; bool has_digit = false; while (cursor < end) { if (*cursor >= '0' && *cursor <= '9') { - if (digit_count_total < digit_buffer.size()) { - digit_buffer[digit_count_total] = *cursor; - } - + digit_buffer[digit_count_total] = *cursor; digit_count_total++; has_digit = true; } else if (*cursor == '.') { @@ -233,20 +257,20 @@ auto parse_decimal_string(const char *input, std::size_t length) throw sourcemeta::core::DecimalParseError{}; } - auto exponent_suffix = static_cast(std::min( - std::max( - exponent_suffix_64, - static_cast(std::numeric_limits::min())), - static_cast(std::numeric_limits::max()))); - + // Combine the explicit exponent with the fractional-digit adjustment in a + // wider type before clamping, so an extreme exponent suffix cannot underflow + // or overflow a narrow integer (undefined behaviour) + std::int64_t exponent_64 = exponent_suffix_64; if (decimal_offset >= 0) { - result.exponent = - exponent_suffix - - (static_cast(digit_count_total) - decimal_offset); - } else { - result.exponent = exponent_suffix; + exponent_64 -= static_cast(digit_count_total) - + static_cast(decimal_offset); } + result.exponent = static_cast(std::min( + std::max(exponent_64, static_cast( + std::numeric_limits::min())), + static_cast(std::numeric_limits::max()))); + std::uint32_t leading_zeros = 0; while (leading_zeros < digit_count_total - 1 && digit_buffer[leading_zeros] == 
'0') { @@ -283,7 +307,7 @@ auto parse_decimal_string(const char *input, std::size_t length) result.coefficient_high = high_word; result.flags |= FLAG_BIG; } else { - auto big = BigCoefficient::from_digits(digit_buffer.data() + leading_zeros, + auto big = BigCoefficient::from_digits(digit_buffer + leading_zeros, significant_digits); store_big_pointer(result.coefficient, std::move(big)); result.coefficient_high = 0; @@ -401,14 +425,22 @@ Decimal::Decimal(Decimal &&other) noexcept auto Decimal::operator=(const Decimal &other) -> Decimal & { if (this != &other) { - free_big_coefficient(this->coefficient_, this->flags_); - this->coefficient_ = other.coefficient_; - this->coefficient_high_ = other.coefficient_high_; - this->exponent_ = other.exponent_; - this->flags_ = other.flags_; if (other.flags_ & FLAG_HEAP) { - store_big_pointer(this->coefficient_, - load_big_pointer(other.coefficient_)->clone()); + // Copy the heap coefficient before releasing the current one, so that a + // failed allocation leaves this value intact rather than owning storage + // shared with the source, which would then be released twice + auto cloned{load_big_pointer(other.coefficient_)->clone()}; + free_big_coefficient(this->coefficient_, this->flags_); + this->coefficient_high_ = other.coefficient_high_; + this->exponent_ = other.exponent_; + this->flags_ = other.flags_; + store_big_pointer(this->coefficient_, std::move(cloned)); + } else { + free_big_coefficient(this->coefficient_, this->flags_); + this->coefficient_ = other.coefficient_; + this->coefficient_high_ = other.coefficient_high_; + this->exponent_ = other.exponent_; + this->flags_ = other.flags_; } } @@ -636,7 +668,11 @@ auto Decimal::to_int64() const -> std::int64_t { this->flags_); auto value = big.to_uint128(this->exponent_); if (this->flags_ & FLAG_SIGN) { - return -static_cast(value); + // Negate in unsigned arithmetic so that the most negative value does not + // overflow, since applying unary minus to its positive magnitude 
in a + // signed type would be undefined behaviour + return static_cast(std::uint64_t{0} - + static_cast(value)); } return static_cast(value); @@ -1810,7 +1846,8 @@ auto Decimal::operator%=(const Decimal &other) -> Decimal & { 0, static_cast(number_of_digits - digits_to_remove)); auto old_sign = static_cast(quotient.flags_ & FLAG_SIGN); - free_big_coefficient(quotient.coefficient_, quotient.flags_); + // The assignment below releases the current coefficient, so freeing + // it explicitly here as well would free the same allocation twice quotient = Decimal{integer_string}; quotient.flags_ = static_cast(quotient.flags_ | old_sign); diff --git a/src/lang/numeric/include/sourcemeta/core/numeric_util.h b/src/lang/numeric/include/sourcemeta/core/numeric_util.h index 8630e96165..8e5ca6a9de 100644 --- a/src/lang/numeric/include/sourcemeta/core/numeric_util.h +++ b/src/lang/numeric/include/sourcemeta/core/numeric_util.h @@ -8,6 +8,7 @@ #include // std::floating_point, std::integral, std::same_as #include // std::uint8_t, std::int64_t, std::uint64_t #include // std::numeric_limits +#include // std::cmp_greater_equal, std::cmp_less_equal namespace sourcemeta::core { @@ -225,11 +226,22 @@ auto count_multiples(const Minimum &minimum, const Maximum &maximum, const auto signed_multiplier{static_cast(multiplier)}; assert(signed_minimum <= signed_maximum); assert(signed_multiplier > 0); - return static_cast( - divide_floor(signed_maximum, - static_cast(signed_multiplier)) - - divide_floor(signed_minimum - 1, - static_cast(signed_multiplier))); + const auto unsigned_multiplier{ + static_cast(signed_multiplier)}; + const auto multiples_to_maximum{ + divide_floor(signed_maximum, unsigned_multiplier)}; + const auto multiples_below_minimum{ + divide_floor(signed_minimum, unsigned_multiplier)}; + // Count the multiples up to the maximum and subtract those strictly below + // the minimum. 
The lower bound is derived without forming one less than the + // smallest value, which would overflow for the most negative input, and the + // subtraction is performed in unsigned arithmetic so the difference cannot + // overflow a signed integer + const std::uint64_t minimum_is_multiple{ + signed_minimum % signed_multiplier == 0 ? 1U : 0U}; + return static_cast(multiples_to_maximum) - + static_cast(multiples_below_minimum) + + minimum_is_multiple; } } @@ -247,7 +259,10 @@ constexpr auto uint_max = [] { template constexpr auto is_within(const T &value, const std::int64_t lower, const std::int64_t higher) noexcept -> bool { - return value >= lower && value <= higher; + // Compare across signedness without converting an unsigned value against a + // negative bound, which would otherwise wrap the bound to a large positive + return std::cmp_greater_equal(value, lower) && + std::cmp_less_equal(value, higher); } /// @ingroup numeric @@ -299,12 +314,18 @@ constexpr auto closest_smallest_exponent(const std::uint64_t value, assert(exponent_start <= exponent_end); std::uint64_t result{base}; for (std::uint8_t exponent{1}; exponent < exponent_end; exponent++) { - const std::uint64_t next{result * base}; - if (next > value && exponent >= exponent_start) { - return exponent; - } else { - result = next; + // Test whether the next power exceeds the value without forming it, since + // result multiplied by base could wrap the accumulator + const bool next_power_exceeds_value{result > value / base}; + if (next_power_exceeds_value) { + if (exponent >= exponent_start) { + return exponent; + } + + continue; } + + result *= base; } assert(result <= value); diff --git a/src/lang/options/options.cc b/src/lang/options/options.cc index b754701c98..51f641f768 100644 --- a/src/lang/options/options.cc +++ b/src/lang/options/options.cc @@ -1,6 +1,7 @@ #include #include // assert +#include // std::size_t #include // std::forward namespace { @@ -79,7 +80,8 @@ auto Options::parse(const int 
argc, -> void { bool end_of_options{false}; // We assume that the first argument is the program name - for (auto index = static_cast(options.skip + 1); index < argc; index++) { + const auto argument_count{static_cast(argc)}; + for (std::size_t index{options.skip + 1}; index < argument_count; index++) { const std::string_view token{argv[index]}; if (end_of_options) { @@ -90,7 +92,8 @@ auto Options::parse(const int argc, continue; } - const auto *const next{(index + 1) < argc ? argv[index + 1] : nullptr}; + const auto *const next{(index + 1) < argument_count ? argv[index + 1] + : nullptr}; // Parse long options if (token.size() >= 3 && token[0] == '-' && token[1] == '-') { diff --git a/src/lang/parallel/include/sourcemeta/core/parallel_for_each.h b/src/lang/parallel/include/sourcemeta/core/parallel_for_each.h index e9dfba3c6d..a7730c173f 100644 --- a/src/lang/parallel/include/sourcemeta/core/parallel_for_each.h +++ b/src/lang/parallel/include/sourcemeta/core/parallel_for_each.h @@ -2,6 +2,7 @@ #define SOURCEMETA_CORE_PARALLEL_FOR_EACH_H_ #include // std::max +#include // UINT_MAX #include // std::copyable, std::invocable #include // std::exception_ptr, std::current_exception, std::rethrow_exception #include // std::function @@ -35,6 +36,27 @@ inline unsigned __stdcall parallel_for_each_windows_thread_start( return 0; } #endif + +// If thread creation fails after some workers have already started, those +// workers keep referencing the stack locals of the spawning frame, so unwinding +// past them must be avoided. 
Drain the remaining tasks so the running workers +// stop pulling new work and exit, then join every already-created worker before +// propagating the failure +template +inline auto parallel_for_each_drain_and_join(std::queue &tasks, + std::mutex &queue_mutex, + std::vector &workers) + -> void { + { + std::lock_guard lock{queue_mutex}; + std::queue empty; + tasks.swap(empty); + } + + for (auto &worker_thread : workers) { + worker_thread.join(); + } +} #endif /// @ingroup parallel @@ -76,7 +98,7 @@ auto parallel_for_each( Iterator first, Iterator last, Callback &&callback, const std::size_t parallelism = std::thread::hardware_concurrency(), const std::size_t stack_size_bytes = 0) -> void { - const auto effective_parallelism{std::max(parallelism, 1uz)}; + const auto effective_parallelism{(std::max)(parallelism, 1uz)}; // Empty list if (first == last) { @@ -150,6 +172,7 @@ auto parallel_for_each( auto *heap_function = new std::function(worker_callable); if (stack_size_bytes > static_cast(UINT_MAX)) { delete heap_function; + parallel_for_each_drain_and_join(tasks, queue_mutex, workers); throw std::runtime_error( "The requested stack size is too large for this platform"); } @@ -159,6 +182,7 @@ auto parallel_for_each( ¶llel_for_each_windows_thread_start, heap_function, 0, nullptr); if (raw_handle == 0) { delete heap_function; + parallel_for_each_drain_and_join(tasks, queue_mutex, workers); throw std::runtime_error("Could not create thread"); } @@ -192,6 +216,7 @@ auto parallel_for_each( if (raw_handle != 0) { pthread_attr_destroy(&attr); delete heap_function; + parallel_for_each_drain_and_join(tasks, queue_mutex, workers); throw std::runtime_error("Could not create thread"); } workers.emplace_back( diff --git a/src/lang/process/spawn.cc b/src/lang/process/spawn.cc index f180ad5c48..849128337a 100644 --- a/src/lang/process/spawn.cc +++ b/src/lang/process/spawn.cc @@ -1,17 +1,18 @@ #include #include // assert -#include // ENOENT +#include // ENOENT, EINTR, errno #include 
// std::filesystem #include // std::initializer_list #include // std::span +#include // std::string #include // std::vector #if defined(_WIN32) && !defined(__MSYS__) && !defined(__CYGWIN__) && \ !defined(__MINGW32__) && !defined(__MINGW64__) #define WIN32_LEAN_AND_MEAN -#include // std::ostringstream -#include // CreateProcess, PROCESS_INFORMATION, STARTUPINFO, WaitForSingleObject, GetExitCodeProcess +#include // std::size_t +#include // CreateProcess, PROCESS_INFORMATION, STARTUPINFO, WaitForSingleObject, GetExitCodeProcess, WAIT_FAILED #else #include // posix_spawnp, posix_spawnattr_t, posix_spawnattr_init, posix_spawnattr_destroy, posix_spawn_file_actions_t, posix_spawn_file_actions_init, posix_spawn_file_actions_destroy, pid_t #include // waitpid, WIFEXITED, WEXITSTATUS @@ -24,6 +25,50 @@ extern char **environ; #endif +#if defined(_WIN32) && !defined(__MSYS__) && !defined(__CYGWIN__) && \ + !defined(__MINGW32__) && !defined(__MINGW64__) +namespace { + +// Quote a single argument for the inverse of CommandLineToArgvW, so that the +// child reconstructs the exact same argument vector +auto append_quoted_argument(std::string &command_line, + const std::string_view argument) -> void { + const bool needs_quoting{argument.empty() || + argument.find_first_of(" \t\"") != + std::string_view::npos}; + + if (!needs_quoting) { + command_line.append(argument); + return; + } + + command_line.push_back('"'); + + for (auto cursor = argument.cbegin();; ++cursor) { + std::size_t backslash_count{0}; + while (cursor != argument.cend() && *cursor == '\\') { + ++cursor; + ++backslash_count; + } + + if (cursor == argument.cend()) { + command_line.append(backslash_count * 2, '\\'); + break; + } else if (*cursor == '"') { + command_line.append(backslash_count * 2 + 1, '\\'); + command_line.push_back('"'); + } else { + command_line.append(backslash_count, '\\'); + command_line.push_back(*cursor); + } + } + + command_line.push_back('"'); +} + +} // namespace +#endif + namespace 
sourcemeta::core { auto spawn(const std::string &program, @@ -35,22 +80,15 @@ auto spawn(const std::string &program, #if defined(_WIN32) && !defined(__MSYS__) && !defined(__CYGWIN__) && \ !defined(__MINGW32__) && !defined(__MINGW64__) - std::ostringstream command_line; - command_line << program; + std::string command_line; + append_quoted_argument(command_line, program); for (const auto &argument : arguments) { - command_line << " "; - // Quote arguments that contain spaces - const std::string arg_str{argument}; - if (arg_str.contains(' ')) { - command_line << "\"" << arg_str << "\""; - } else { - command_line << arg_str; - } + command_line.push_back(' '); + append_quoted_argument(command_line, argument); } - std::string cmd_line_str = command_line.str(); - std::vector cmd_line(cmd_line_str.begin(), cmd_line_str.end()); + std::vector cmd_line(command_line.begin(), command_line.end()); cmd_line.push_back('\0'); STARTUPINFOA startup_info{}; @@ -80,7 +118,11 @@ auto spawn(const std::string &program, throw ProcessSpawnError{program, arguments}; } - WaitForSingleObject(process_info.hProcess, INFINITE); + if (WaitForSingleObject(process_info.hProcess, INFINITE) == WAIT_FAILED) { + CloseHandle(process_info.hProcess); + CloseHandle(process_info.hThread); + throw ProcessSpawnError{program, arguments}; + } DWORD exit_code; if (!GetExitCodeProcess(process_info.hProcess, &exit_code)) { @@ -94,12 +136,18 @@ auto spawn(const std::string &program, return static_cast(exit_code); #else + std::vector owned_arguments; + owned_arguments.reserve(arguments.size()); + for (const auto &argument : arguments) { + owned_arguments.emplace_back(argument); + } + std::vector argv; - argv.reserve(arguments.size() + 2); + argv.reserve(owned_arguments.size() + 2); argv.push_back(program.c_str()); - for (const auto &argument : arguments) { - argv.push_back(argument.data()); + for (const auto &argument : owned_arguments) { + argv.push_back(argument.c_str()); } argv.push_back(nullptr); @@ -112,11 
+160,21 @@ auto spawn(const std::string &program, #if defined(__MSYS__) || defined(__CYGWIN__) || defined(__MINGW32__) || \ defined(__MINGW64__) + // These platforms lack a child-directory file action, so we change the + // process-wide working directory around the spawn and restore it afterwards + // This races with any concurrent thread that observes or mutates the current + // directory while the spawn is in flight const std::filesystem::path original_directory{ std::filesystem::current_path()}; std::filesystem::current_path(directory); #else - posix_spawn_file_actions_addchdir_np(&file_actions, directory.c_str()); + const int addchdir_result{ + posix_spawn_file_actions_addchdir_np(&file_actions, directory.c_str())}; + if (addchdir_result != 0) { + posix_spawn_file_actions_destroy(&file_actions); + posix_spawnattr_destroy(&attributes); + throw ProcessSpawnError{program, arguments}; + } #endif pid_t process_id; @@ -140,8 +198,14 @@ auto spawn(const std::string &program, throw ProcessSpawnError{program, arguments}; } - int status; - waitpid(process_id, &status, 0); + int status{0}; + while (waitpid(process_id, &status, 0) == -1) { + if (errno == EINTR) { + continue; + } + + throw ProcessSpawnError{program, arguments}; + } if (WIFEXITED(status)) { return WEXITSTATUS(status); diff --git a/src/lang/stacktrace/stacktrace_posix.h b/src/lang/stacktrace/stacktrace_posix.h index f949f51153..cfeb47bc9b 100644 --- a/src/lang/stacktrace/stacktrace_posix.h +++ b/src/lang/stacktrace/stacktrace_posix.h @@ -3,9 +3,9 @@ #include -#include // std::array -#include // std::atomic -#include // sigaction, struct sigaction, SIG*, raise +#include // std::array +#include // std::atomic +#include // sigaction, struct sigaction, SIG*, raise, sigaltstack, stack_t, SA_ONSTACK #include // std::size_t #include // std::uintptr_t #include // std::strlen @@ -161,6 +161,12 @@ constexpr const char *separator{"========================================" std::atomic crash_handler_installed{false}; +// 
A stack-overflow fault arrives on an already-exhausted stack, so the handler +// must run on a separate region to be able to produce a trace. This is sized +// generously to leave room for the backtrace machinery +constexpr std::size_t alternate_stack_size{1 << 16}; +alignas(16) std::array alternate_stack{}; + } // namespace // NOTE: `backtrace`, `dladdr`, and `strlen` are not on POSIX's strict @@ -228,9 +234,19 @@ __attribute__((visibility("default"))) auto stacktrace_on_crash() -> void { return; } + stack_t signal_stack{}; + signal_stack.ss_sp = alternate_stack.data(); + signal_stack.ss_size = alternate_stack.size(); + signal_stack.ss_flags = 0; + const bool alternate_stack_ready{::sigaltstack(&signal_stack, nullptr) == 0}; + struct sigaction action{}; action.sa_sigaction = &sourcemeta_core_stacktrace_crash_handler; - action.sa_flags = static_cast(SA_SIGINFO | SA_RESETHAND | SA_NODEFER); + int flags{static_cast(SA_SIGINFO | SA_RESETHAND | SA_NODEFER)}; + if (alternate_stack_ready) { + flags |= static_cast(SA_ONSTACK); + } + action.sa_flags = flags; sigemptyset(&action.sa_mask); for (const int signal_number : {SIGSEGV, SIGABRT, SIGFPE, SIGBUS, SIGILL}) { diff --git a/src/lang/text/text.cc b/src/lang/text/text.cc index 6916eec962..e7e95addfa 100644 --- a/src/lang/text/text.cc +++ b/src/lang/text/text.cc @@ -1,6 +1,5 @@ #include -#include // std::isalpha, std::toupper #include // std::size_t #include // std::filesystem::path #include // std::optional, std::nullopt @@ -15,6 +14,17 @@ auto is_ascii_whitespace(const char character) noexcept -> bool { character == '\v' || character == '\f' || character == '\r'; } +auto is_ascii_letter(const char character) noexcept -> bool { + return (character >= 'a' && character <= 'z') || + (character >= 'A' && character <= 'Z'); +} + +auto to_ascii_uppercase(const char character) noexcept -> char { + return (character >= 'a' && character <= 'z') + ? 
static_cast(character - ('a' - 'A')) + : character; +} + } // namespace namespace sourcemeta::core { @@ -45,9 +55,8 @@ auto to_title_case(std::string &value) -> void { capitalize_next = true; } if (capitalize_next) { - value[write++] = static_cast( - std::toupper(static_cast(character))); - if (std::isalpha(static_cast(character))) { + value[write++] = to_ascii_uppercase(character); + if (is_ascii_letter(character)) { capitalize_next = false; } } else { diff --git a/test/email/email_test.cc b/test/email/email_test.cc index 7091f66d8c..1f40cf8efe 100644 --- a/test/email/email_test.cc +++ b/test/email/email_test.cc @@ -768,11 +768,23 @@ TEST(Email, invalid_ipv4_five_octets) { EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[1.2.3.4.5]")); } -// RFC 5321 §4.1.3: Snum = 1*3DIGIT, leading zero in multi-digit Snum is -// rejected by is_ipv4 -TEST(Email, invalid_ipv4_leading_zero) { - EXPECT_FALSE(sourcemeta::core::is_email("a@[01.2.3.4]")); - EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[01.2.3.4]")); +// RFC 5321 §4.1.3: Snum = 1*3DIGIT, leading zeros are permitted +TEST(Email, valid_ipv4_leading_zero) { + EXPECT_TRUE(sourcemeta::core::is_email("a@[01.2.3.4]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[01.2.3.4]")); +} + +// RFC 5321 §4.1.3: Snum = 1*3DIGIT, a fully zero-padded octet is permitted +TEST(Email, valid_ipv4_padded_octets) { + EXPECT_TRUE(sourcemeta::core::is_email("a@[001.002.003.004]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[001.002.003.004]")); +} + +// RFC 5321 §4.1.3: Snum = 1*3DIGIT, an octet wider than three digits is +// rejected +TEST(Email, invalid_ipv4_four_digit_octet) { + EXPECT_FALSE(sourcemeta::core::is_email("a@[0001.2.3.4]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[0001.2.3.4]")); } // RFC 5321 §4.1.3: IPv4-address-literal cannot end with a "." 
diff --git a/test/gzip/gzip_streambuf_test.cc b/test/gzip/gzip_streambuf_test.cc index e747f77ec0..2451a159e0 100644 --- a/test/gzip/gzip_streambuf_test.cc +++ b/test/gzip/gzip_streambuf_test.cc @@ -864,3 +864,25 @@ TEST(GZIP_stream_buffer, fextra_spans_internal_buffer) { compressed.append("\x85\x11\x4a\x0d\x0b\x00\x00\x00", 8); EXPECT_EQ(decompress_via_stream(compressed), "hello world"); } + +TEST(GZIP_stream_buffer, dynamic_block_hlit_above_286_throws) { + // RFC 1951 section 3.2.7 caps the literal/length alphabet at 286 symbols. + // The deflate payload starts a dynamic block with HLIT encoding 288 codes + // (BFINAL=1, BTYPE=10, HLIT field = 31 so 31 + 257 = 288) + sourcemeta::core::InputByteStream stream{0x1f, 0x8b, 0x08, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, + 0xfd, 0x00, 0x00}; + EXPECT_THROW(decompress_via_stream(stream), sourcemeta::core::GZIPError); +} + +TEST(GZIP_stream_buffer, dynamic_block_incomplete_code_throws) { + // RFC 1951 forbids incomplete Huffman codes outside the single-code case. + // The deflate payload starts a dynamic block whose code-length code tree + // assigns length two to two symbols, leaving the alphabet incomplete + // (BFINAL=1, BTYPE=10, HLIT=0, HDIST=0, HCLEN=0, code-length lengths for + // symbols 16 and 17 set to two) + sourcemeta::core::InputByteStream stream{0x1f, 0x8b, 0x08, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, + 0x05, 0x00, 0x24, 0x00}; + EXPECT_THROW(decompress_via_stream(stream), sourcemeta::core::GZIPError); +} diff --git a/test/gzip/gzip_test.cc b/test/gzip/gzip_test.cc index 08c164d607..facc1732c2 100644 --- a/test/gzip/gzip_test.cc +++ b/test/gzip/gzip_test.cc @@ -78,6 +78,37 @@ TEST(GZIP, decompress_with_output_hint) { EXPECT_EQ(decompressed, input); } +TEST(GZIP, compress_with_explicit_level_round_trips) { + const std::string pattern{"The quick brown fox jumps over the lazy dog. 
"}; + std::string input; + for (int index = 0; index < 1000; ++index) { + input += pattern; + } + + const auto compressed{sourcemeta::core::gzip( + reinterpret_cast(input.data()), input.size(), 9)}; + EXPECT_FALSE(compressed.empty()); + + const auto decompressed{sourcemeta::core::gunzip( + reinterpret_cast(compressed.data()), + compressed.size())}; + EXPECT_EQ(decompressed, input); +} + +TEST(GZIP, compress_higher_level_is_not_larger) { + const std::string pattern{"The quick brown fox jumps over the lazy dog. "}; + std::string input; + for (int index = 0; index < 1000; ++index) { + input += pattern; + } + + const auto fastest{sourcemeta::core::gzip( + reinterpret_cast(input.data()), input.size(), 1)}; + const auto smallest{sourcemeta::core::gzip( + reinterpret_cast(input.data()), input.size(), 12)}; + EXPECT_LE(smallest.size(), fastest.size()); +} + TEST(GZIP, decompress_invalid_input_throws) { const std::string garbage{"this is not gzip data"}; EXPECT_THROW(sourcemeta::core::gunzip( diff --git a/test/http/http_match_accept_test.cc b/test/http/http_match_accept_test.cc index 515de88a67..71025b5319 100644 --- a/test/http/http_match_accept_test.cc +++ b/test/http/http_match_accept_test.cc @@ -143,10 +143,20 @@ TEST(HTTP_match_accept, three_candidates_picks_best_q) { "application/json"); } -TEST(HTTP_match_accept, malformed_q_treated_as_one) { +// RFC 9110 §12.4.2: a malformed weight is a fail-safe refusal, treated as 0 +TEST(HTTP_match_accept, malformed_q_treated_as_zero) { EXPECT_EQ( sourcemeta::core::http_match_accept("text/html;q=abc", {"text/html"}), - "text/html"); + ""); +} + +// RFC 9110 §5.6.6: a semicolon inside a quoted string does not separate +// parameters, so the quoted content does not synthesise a phantom q parameter +TEST(HTTP_match_accept, quoted_parameter_semicolon_not_split) { + EXPECT_EQ(sourcemeta::core::http_match_accept( + "text/html;foo=\"a;q=0.1\", application/json;q=0.5", + {"text/html", "application/json"}), + "text/html"); } 
TEST(HTTP_match_accept, html_or_json_returns_html_for_html_accept) { @@ -297,9 +307,11 @@ TEST(HTTP_match_accept, q_value_one_dot_no_digits_is_one) { "text/html"); } -TEST(HTTP_match_accept, q_value_four_decimal_digits_treated_as_one) { +// RFC 9110 §12.4.2: qvalue allows at most three fractional digits, so a +// four-digit fraction is malformed and is a fail-safe refusal treated as 0 +TEST(HTTP_match_accept, q_value_four_decimal_digits_treated_as_zero) { EXPECT_EQ(sourcemeta::core::http_match_accept( "text/html;q=0.1234, application/json;q=0.5", {"text/html", "application/json"}), - "text/html"); + "application/json"); } diff --git a/test/http/http_negotiate_encoding_test.cc b/test/http/http_negotiate_encoding_test.cc index 298cabd6b0..f966a83004 100644 --- a/test/http/http_negotiate_encoding_test.cc +++ b/test/http/http_negotiate_encoding_test.cc @@ -108,11 +108,13 @@ TEST(HTTP_negotiate_encoding, identity_excluded_via_wildcard_no_explicit_gzip) { EXPECT_FALSE(result.has_value()); } -TEST(HTTP_negotiate_encoding, garbage_q_treated_as_one) { +// RFC 9110 §12.4.2: a malformed weight is a fail-safe refusal, treated as 0, +// so gzip is refused and the implicit identity encoding is selected +TEST(HTTP_negotiate_encoding, garbage_q_treated_as_zero) { const auto result{sourcemeta::core::http_negotiate_encoding( "gzip;q=abc", sourcemeta::core::HTTPContentEncoding::GZIP)}; ASSERT_TRUE(result.has_value()); - EXPECT_EQ(result.value(), sourcemeta::core::HTTPContentEncoding::GZIP); + EXPECT_EQ(result.value(), sourcemeta::core::HTTPContentEncoding::Identity); } TEST(HTTP_negotiate_encoding, multiple_gzip_entries_take_max_q) { diff --git a/test/idna/idna_is_valid_a_label_test.cc b/test/idna/idna_is_valid_a_label_test.cc index b6b1299a24..5340095f04 100644 --- a/test/idna/idna_is_valid_a_label_test.cc +++ b/test/idna/idna_is_valid_a_label_test.cc @@ -2,6 +2,8 @@ #include +#include // std::string + TEST(IDNA_is_valid_a_label, munich_german) { // xn--mnchen-3ya decodes to 
"M\u00FCnchen" EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--mnchen-3ya")); @@ -54,3 +56,11 @@ TEST(IDNA_is_valid_a_label, uppercase_in_body) { // representation, so the round-trip check rejects it. EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--MNCHEN-3ya")); } + +// RFC 5890 §2.3.2.1: a label in A-label form is at most 63 octets. A 64-octet +// input is rejected on length alone, before any Punycode decoding +TEST(IDNA_is_valid_a_label, exceeds_63_octets) { + const std::string label{"xn--" + std::string(60, 'a')}; + EXPECT_EQ(label.size(), 64); + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label(label)); +} diff --git a/test/idna/idna_is_valid_u_label_test.cc b/test/idna/idna_is_valid_u_label_test.cc index 58ff0333b2..cdf8c04bd9 100644 --- a/test/idna/idna_is_valid_u_label_test.cc +++ b/test/idna/idna_is_valid_u_label_test.cc @@ -2,6 +2,8 @@ #include +#include // std::u32string + TEST(IDNA_is_valid_u_label, ascii_letters) { EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"abc")); } @@ -167,3 +169,17 @@ TEST(IDNA_is_valid_u_label, out_of_order_combining_marks_rejected) { TEST(IDNA_is_valid_u_label, hangul_precomposed_syllable_accepted) { EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(U"\uAC00")); } + +// RFC 5890 §2.3.2.1: the corresponding A-label must be at most 63 octets. 
+// A pure-ASCII label of 58 "a" characters encodes to a 59-octet Punycode +// body, which with the 4-octet "xn--" prefix is exactly 63 octets +TEST(IDNA_is_valid_u_label, a_label_form_exactly_63_octets_accepted) { + const std::u32string label(58, U'a'); + EXPECT_TRUE(sourcemeta::core::idna_is_valid_u_label(label)); +} + +// One additional character pushes the A-label form to 64 octets +TEST(IDNA_is_valid_u_label, a_label_form_64_octets_rejected) { + const std::u32string label(59, U'a'); + EXPECT_FALSE(sourcemeta::core::idna_is_valid_u_label(label)); +} diff --git a/test/io/io_fileview_test.cc b/test/io/io_fileview_test.cc index 097250ee68..50c79fbd56 100644 --- a/test/io/io_fileview_test.cc +++ b/test/io/io_fileview_test.cc @@ -2,7 +2,9 @@ #include -#include // std::uint32_t +#include // std::uint32_t +#include // std::filesystem +#include // std::string_view TEST(IO_FileView, size) { const sourcemeta::core::FileView view{std::filesystem::path{STUBS_DIRECTORY} / @@ -45,3 +47,13 @@ TEST(IO_FileView, file_not_found) { std::filesystem::path{STUBS_DIRECTORY} / "nonexistent.bin"), sourcemeta::core::FileViewError); } + +TEST(IO_FileView, empty_file_does_not_throw) { + const sourcemeta::core::TemporaryDirectory directory{ + std::filesystem::temp_directory_path(), ".fileview-"}; + const auto path{directory.path() / "empty.bin"}; + sourcemeta::core::write_file(path, std::string_view{""}); + + const sourcemeta::core::FileView view{path}; + EXPECT_EQ(view.size(), 0); +} diff --git a/test/json/json_array_test.cc b/test/json/json_array_test.cc index 0b1d0b3b64..f947d913dc 100644 --- a/test/json/json_array_test.cc +++ b/test/json/json_array_test.cc @@ -281,6 +281,31 @@ TEST(JSON_array, int_standard_sort) { EXPECT_EQ(document.at(2).to_integer(), 3); } +TEST(JSON_array, object_standard_sort) { + sourcemeta::core::JSON document = sourcemeta::core::parse_json( + R"JSON([{"a":9,"b":1},{"a":1,"b":9},{"a":5,"b":5},{"a":1,"b":1}])JSON"); + std::sort(document.as_array().begin(), 
document.as_array().end()); + EXPECT_EQ(document.size(), 4); + EXPECT_EQ(document.at(0).at("a").to_integer(), 1); + EXPECT_EQ(document.at(0).at("b").to_integer(), 1); + EXPECT_EQ(document.at(1).at("a").to_integer(), 1); + EXPECT_EQ(document.at(1).at("b").to_integer(), 9); + EXPECT_EQ(document.at(2).at("a").to_integer(), 5); + EXPECT_EQ(document.at(2).at("b").to_integer(), 5); + EXPECT_EQ(document.at(3).at("a").to_integer(), 9); + EXPECT_EQ(document.at(3).at("b").to_integer(), 1); +} + +TEST(JSON_array, move_assignment_from_own_element) { + sourcemeta::core::JSON document = sourcemeta::core::parse_json("[[1,2,3]]"); + document = std::move(document.at(0)); + EXPECT_TRUE(document.is_array()); + EXPECT_EQ(document.size(), 3); + EXPECT_EQ(document.at(0).to_integer(), 1); + EXPECT_EQ(document.at(1).to_integer(), 2); + EXPECT_EQ(document.at(2).to_integer(), 3); +} + TEST(JSON_array, erase_many_full) { sourcemeta::core::JSON document = sourcemeta::core::parse_json("[1,2,3]"); EXPECT_TRUE(document.is_array()); diff --git a/test/json/json_object_test.cc b/test/json/json_object_test.cc index 998bfd8e2f..b3cd393d07 100644 --- a/test/json/json_object_test.cc +++ b/test/json/json_object_test.cc @@ -872,6 +872,56 @@ TEST(JSON_object, merge_deep_object) { EXPECT_EQ(document, expected); } +TEST(JSON_object, merge_with_own_object) { + auto document{ + sourcemeta::core::parse_json(R"JSON({ "foo": 1, "bar": 2 })JSON")}; + document.merge(document.as_object()); + EXPECT_EQ(document.size(), 2); + EXPECT_EQ(document.at("foo").to_integer(), 1); + EXPECT_EQ(document.at("bar").to_integer(), 2); +} + +TEST(JSON_object, copy_assignment_from_own_member) { + auto document{ + sourcemeta::core::parse_json(R"JSON({ "foo": { "bar": 42 } })JSON")}; + document = document.at("foo"); + EXPECT_TRUE(document.is_object()); + EXPECT_EQ(document.size(), 1); + EXPECT_TRUE(document.defines("bar")); + EXPECT_EQ(document.at("bar").to_integer(), 42); +} + +TEST(JSON_object, ordering_is_asymmetric) { + const auto 
left{ + sourcemeta::core::parse_json(R"JSON({ "a": 1, "b": 9 })JSON")}; + const auto right{ + sourcemeta::core::parse_json(R"JSON({ "a": 9, "b": 1 })JSON")}; + EXPECT_NE(left, right); + EXPECT_TRUE(left < right); + EXPECT_FALSE(right < left); + EXPECT_NE(left < right, right < left); +} + +TEST(JSON_object, ordering_with_distinct_keys) { + const auto left{ + sourcemeta::core::parse_json(R"JSON({ "a": 1, "b": 1 })JSON")}; + const auto right{ + sourcemeta::core::parse_json(R"JSON({ "a": 1, "c": 1 })JSON")}; + EXPECT_NE(left, right); + EXPECT_TRUE(left < right); + EXPECT_FALSE(right < left); +} + +TEST(JSON_object, ordering_key_difference_outranks_value_difference) { + const auto left{ + sourcemeta::core::parse_json(R"JSON({ "a": 9, "b": 1 })JSON")}; + const auto right{ + sourcemeta::core::parse_json(R"JSON({ "a": 1, "c": 1 })JSON")}; + EXPECT_NE(left, right); + EXPECT_FALSE(left < right); + EXPECT_TRUE(right < left); +} + TEST(JSON_object, at_or_defined) { const sourcemeta::core::JSON document{{"foo", sourcemeta::core::JSON{true}}, {"bar", sourcemeta::core::JSON{1}}}; diff --git a/test/jsonpointer/jsonpointer_stringify_test.cc b/test/jsonpointer/jsonpointer_stringify_test.cc index 3c53a9f345..6cc4d89ea0 100644 --- a/test/jsonpointer/jsonpointer_stringify_test.cc +++ b/test/jsonpointer/jsonpointer_stringify_test.cc @@ -149,224 +149,224 @@ TEST(JSONPointer_stringify, escape_00) { const sourcemeta::core::Pointer pointer{"foo\0bar"s}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0000bar"); + EXPECT_EQ(stream.str(), "/foo\0bar"s); } TEST(JSONPointer_stringify, escape_01) { const sourcemeta::core::Pointer pointer{"foo\u0001bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0001bar"); + EXPECT_EQ(stream.str(), "/foo\u0001bar"); } TEST(JSONPointer_stringify, escape_02) { const sourcemeta::core::Pointer pointer{"foo\u0002bar"}; std::ostringstream stream; 
sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0002bar"); + EXPECT_EQ(stream.str(), "/foo\u0002bar"); } TEST(JSONPointer_stringify, escape_03) { const sourcemeta::core::Pointer pointer{"foo\u0003bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0003bar"); + EXPECT_EQ(stream.str(), "/foo\u0003bar"); } TEST(JSONPointer_stringify, escape_04) { const sourcemeta::core::Pointer pointer{"foo\u0004bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0004bar"); + EXPECT_EQ(stream.str(), "/foo\u0004bar"); } TEST(JSONPointer_stringify, escape_05) { const sourcemeta::core::Pointer pointer{"foo\u0005bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0005bar"); + EXPECT_EQ(stream.str(), "/foo\u0005bar"); } TEST(JSONPointer_stringify, escape_06) { const sourcemeta::core::Pointer pointer{"foo\u0006bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0006bar"); + EXPECT_EQ(stream.str(), "/foo\u0006bar"); } TEST(JSONPointer_stringify, escape_07) { const sourcemeta::core::Pointer pointer{"foo\u0007bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0007bar"); + EXPECT_EQ(stream.str(), "/foo\u0007bar"); } TEST(JSONPointer_stringify, escape_08) { const sourcemeta::core::Pointer pointer{"foo\u0008bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\bbar"); + EXPECT_EQ(stream.str(), "/foo\u0008bar"); } TEST(JSONPointer_stringify, escape_09) { const sourcemeta::core::Pointer pointer{"foo\u0009bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\tbar"); + EXPECT_EQ(stream.str(), "/foo\u0009bar"); } TEST(JSONPointer_stringify, 
escape_0A) { const sourcemeta::core::Pointer pointer{"foo\u000abar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\nbar"); + EXPECT_EQ(stream.str(), "/foo\u000Abar"); } TEST(JSONPointer_stringify, escape_0B) { const sourcemeta::core::Pointer pointer{"foo\u000bbar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u000Bbar"); + EXPECT_EQ(stream.str(), "/foo\u000Bbar"); } TEST(JSONPointer_stringify, escape_0C) { const sourcemeta::core::Pointer pointer{"foo\u000cbar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\fbar"); + EXPECT_EQ(stream.str(), "/foo\u000Cbar"); } TEST(JSONPointer_stringify, escape_0D) { const sourcemeta::core::Pointer pointer{"foo\u000dbar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\rbar"); + EXPECT_EQ(stream.str(), "/foo\u000Dbar"); } TEST(JSONPointer_stringify, escape_0E) { const sourcemeta::core::Pointer pointer{"foo\u000ebar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u000Ebar"); + EXPECT_EQ(stream.str(), "/foo\u000Ebar"); } TEST(JSONPointer_stringify, escape_0F) { const sourcemeta::core::Pointer pointer{"foo\u000fbar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u000Fbar"); + EXPECT_EQ(stream.str(), "/foo\u000Fbar"); } TEST(JSONPointer_stringify, escape_10) { const sourcemeta::core::Pointer pointer{"foo\u0010bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0010bar"); + EXPECT_EQ(stream.str(), "/foo\u0010bar"); } TEST(JSONPointer_stringify, escape_11) { const sourcemeta::core::Pointer pointer{"foo\u0011bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), 
"/foo\\u0011bar"); + EXPECT_EQ(stream.str(), "/foo\u0011bar"); } TEST(JSONPointer_stringify, escape_12) { const sourcemeta::core::Pointer pointer{"foo\u0012bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0012bar"); + EXPECT_EQ(stream.str(), "/foo\u0012bar"); } TEST(JSONPointer_stringify, escape_13) { const sourcemeta::core::Pointer pointer{"foo\u0013bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0013bar"); + EXPECT_EQ(stream.str(), "/foo\u0013bar"); } TEST(JSONPointer_stringify, escape_14) { const sourcemeta::core::Pointer pointer{"foo\u0014bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0014bar"); + EXPECT_EQ(stream.str(), "/foo\u0014bar"); } TEST(JSONPointer_stringify, escape_15) { const sourcemeta::core::Pointer pointer{"foo\u0015bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0015bar"); + EXPECT_EQ(stream.str(), "/foo\u0015bar"); } TEST(JSONPointer_stringify, escape_16) { const sourcemeta::core::Pointer pointer{"foo\u0016bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0016bar"); + EXPECT_EQ(stream.str(), "/foo\u0016bar"); } TEST(JSONPointer_stringify, escape_17) { const sourcemeta::core::Pointer pointer{"foo\u0017bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0017bar"); + EXPECT_EQ(stream.str(), "/foo\u0017bar"); } TEST(JSONPointer_stringify, escape_18) { const sourcemeta::core::Pointer pointer{"foo\u0018bar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0018bar"); + EXPECT_EQ(stream.str(), "/foo\u0018bar"); } TEST(JSONPointer_stringify, escape_19) { const sourcemeta::core::Pointer pointer{"foo\u0019bar"}; 
std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u0019bar"); + EXPECT_EQ(stream.str(), "/foo\u0019bar"); } TEST(JSONPointer_stringify, escape_1A) { const sourcemeta::core::Pointer pointer{"foo\u001abar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u001Abar"); + EXPECT_EQ(stream.str(), "/foo\u001Abar"); } TEST(JSONPointer_stringify, escape_1B) { const sourcemeta::core::Pointer pointer{"foo\u001bbar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u001Bbar"); + EXPECT_EQ(stream.str(), "/foo\u001Bbar"); } TEST(JSONPointer_stringify, escape_1C) { const sourcemeta::core::Pointer pointer{"foo\u001cbar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u001Cbar"); + EXPECT_EQ(stream.str(), "/foo\u001Cbar"); } TEST(JSONPointer_stringify, escape_1D) { const sourcemeta::core::Pointer pointer{"foo\u001dbar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u001Dbar"); + EXPECT_EQ(stream.str(), "/foo\u001Dbar"); } TEST(JSONPointer_stringify, escape_1E) { const sourcemeta::core::Pointer pointer{"foo\u001ebar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u001Ebar"); + EXPECT_EQ(stream.str(), "/foo\u001Ebar"); } TEST(JSONPointer_stringify, escape_1F) { const sourcemeta::core::Pointer pointer{"foo\u001fbar"}; std::ostringstream stream; sourcemeta::core::stringify(pointer, stream); - EXPECT_EQ(stream.str(), "/foo\\u001Fbar"); + EXPECT_EQ(stream.str(), "/foo\u001Fbar"); } TEST(JSONPointer_stringify, no_uri_escape) { diff --git a/test/jsonpointer/jsonpointer_to_uri_test.cc b/test/jsonpointer/jsonpointer_to_uri_test.cc index 4ac0e7a1f6..d6649fb78b 100644 --- a/test/jsonpointer/jsonpointer_to_uri_test.cc +++ 
b/test/jsonpointer/jsonpointer_to_uri_test.cc @@ -67,224 +67,224 @@ TEST(JSONPointer_to_uri, escape_00) { using namespace std::string_literals; const sourcemeta::core::Pointer pointer{"foo\0bar"s}; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0000bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%00bar"); } TEST(JSONPointer_to_uri, escape_01) { const sourcemeta::core::Pointer pointer{"foo\u0001bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0001bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%01bar"); } TEST(JSONPointer_to_uri, escape_02) { const sourcemeta::core::Pointer pointer{"foo\u0002bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0002bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%02bar"); } TEST(JSONPointer_to_uri, escape_03) { const sourcemeta::core::Pointer pointer{"foo\u0003bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0003bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%03bar"); } TEST(JSONPointer_to_uri, escape_04) { const sourcemeta::core::Pointer pointer{"foo\u0004bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0004bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%04bar"); } TEST(JSONPointer_to_uri, escape_05) { const sourcemeta::core::Pointer pointer{"foo\u0005bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0005bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%05bar"); } TEST(JSONPointer_to_uri, escape_06) { const sourcemeta::core::Pointer pointer{"foo\u0006bar"}; 
std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0006bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%06bar"); } TEST(JSONPointer_to_uri, escape_07) { const sourcemeta::core::Pointer pointer{"foo\u0007bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0007bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%07bar"); } TEST(JSONPointer_to_uri, escape_08) { const sourcemeta::core::Pointer pointer{"foo\u0008bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cbbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%08bar"); } TEST(JSONPointer_to_uri, escape_09) { const sourcemeta::core::Pointer pointer{"foo\u0009bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Ctbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%09bar"); } TEST(JSONPointer_to_uri, escape_0A) { const sourcemeta::core::Pointer pointer{"foo\u000abar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cnbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%0Abar"); } TEST(JSONPointer_to_uri, escape_0B) { const sourcemeta::core::Pointer pointer{"foo\u000bbar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu000Bbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%0Bbar"); } TEST(JSONPointer_to_uri, escape_0C) { const sourcemeta::core::Pointer pointer{"foo\u000cbar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cfbar"); + EXPECT_EQ(fragment.recompose(), 
"#/foo%0Cbar"); } TEST(JSONPointer_to_uri, escape_0D) { const sourcemeta::core::Pointer pointer{"foo\u000dbar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Crbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%0Dbar"); } TEST(JSONPointer_to_uri, escape_0E) { const sourcemeta::core::Pointer pointer{"foo\u000ebar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu000Ebar"); + EXPECT_EQ(fragment.recompose(), "#/foo%0Ebar"); } TEST(JSONPointer_to_uri, escape_0F) { const sourcemeta::core::Pointer pointer{"foo\u000fbar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu000Fbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%0Fbar"); } TEST(JSONPointer_to_uri, escape_10) { const sourcemeta::core::Pointer pointer{"foo\u0010bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0010bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%10bar"); } TEST(JSONPointer_to_uri, escape_11) { const sourcemeta::core::Pointer pointer{"foo\u0011bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0011bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%11bar"); } TEST(JSONPointer_to_uri, escape_12) { const sourcemeta::core::Pointer pointer{"foo\u0012bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0012bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%12bar"); } TEST(JSONPointer_to_uri, escape_13) { const sourcemeta::core::Pointer pointer{"foo\u0013bar"}; std::ostringstream stream; const sourcemeta::core::URI 
fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0013bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%13bar"); } TEST(JSONPointer_to_uri, escape_14) { const sourcemeta::core::Pointer pointer{"foo\u0014bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0014bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%14bar"); } TEST(JSONPointer_to_uri, escape_15) { const sourcemeta::core::Pointer pointer{"foo\u0015bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0015bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%15bar"); } TEST(JSONPointer_to_uri, escape_16) { const sourcemeta::core::Pointer pointer{"foo\u0016bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0016bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%16bar"); } TEST(JSONPointer_to_uri, escape_17) { const sourcemeta::core::Pointer pointer{"foo\u0017bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0017bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%17bar"); } TEST(JSONPointer_to_uri, escape_18) { const sourcemeta::core::Pointer pointer{"foo\u0018bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0018bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%18bar"); } TEST(JSONPointer_to_uri, escape_19) { const sourcemeta::core::Pointer pointer{"foo\u0019bar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu0019bar"); + EXPECT_EQ(fragment.recompose(), "#/foo%19bar"); } TEST(JSONPointer_to_uri, escape_1A) 
{ const sourcemeta::core::Pointer pointer{"foo\u001abar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu001Abar"); + EXPECT_EQ(fragment.recompose(), "#/foo%1Abar"); } TEST(JSONPointer_to_uri, escape_1B) { const sourcemeta::core::Pointer pointer{"foo\u001bbar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu001Bbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%1Bbar"); } TEST(JSONPointer_to_uri, escape_1C) { const sourcemeta::core::Pointer pointer{"foo\u001cbar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu001Cbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%1Cbar"); } TEST(JSONPointer_to_uri, escape_1D) { const sourcemeta::core::Pointer pointer{"foo\u001dbar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu001Dbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%1Dbar"); } TEST(JSONPointer_to_uri, escape_1E) { const sourcemeta::core::Pointer pointer{"foo\u001ebar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu001Ebar"); + EXPECT_EQ(fragment.recompose(), "#/foo%1Ebar"); } TEST(JSONPointer_to_uri, escape_1F) { const sourcemeta::core::Pointer pointer{"foo\u001fbar"}; std::ostringstream stream; const sourcemeta::core::URI fragment{sourcemeta::core::to_uri(pointer)}; - EXPECT_EQ(fragment.recompose(), "#/foo%5Cu001Fbar"); + EXPECT_EQ(fragment.recompose(), "#/foo%1Fbar"); } TEST(JSONPointer_to_uri, with_absolute_base) { diff --git a/test/markdown/markdown_test.cc b/test/markdown/markdown_test.cc index e393764a53..0a69d1c280 100644 --- a/test/markdown/markdown_test.cc +++ 
b/test/markdown/markdown_test.cc @@ -46,6 +46,18 @@ TEST(Markdown_to_html, simple_paragraph) { EXPECT_EQ(result, "

Hello world

\n"); } +TEST(Markdown_to_html, safe_mode_renders_plain_content) { + const auto result{ + sourcemeta::core::markdown_to_html("Hello **world**", true)}; + EXPECT_EQ(result, "

Hello world

\n"); +} + +TEST(Markdown_to_html, safe_mode_omits_raw_html) { + const auto result{ + sourcemeta::core::markdown_to_html("
hi
", true)}; + EXPECT_EQ(result, "\n"); +} + TEST(Markdown_to_html, multiple_paragraphs) { const auto result{sourcemeta::core::markdown_to_html( "First paragraph\n\nSecond paragraph")}; diff --git a/test/numeric/numeric_decimal_test.cc b/test/numeric/numeric_decimal_test.cc index 2a65543e76..c30f259aa3 100644 --- a/test/numeric/numeric_decimal_test.cc +++ b/test/numeric/numeric_decimal_test.cc @@ -3698,3 +3698,35 @@ TEST(Numeric_decimal, is_integer_implies_is_integral) { EXPECT_TRUE(value.is_integer()); EXPECT_TRUE(value.is_integral()); } + +TEST(Numeric_decimal, parse_digit_string_longer_than_inline_buffer) { + const std::string digits(1100, '9'); + const sourcemeta::core::Decimal value{digits}; + EXPECT_EQ(value.to_string(), digits); + const sourcemeta::core::Decimal again{value.to_string()}; + EXPECT_EQ(value, again); +} + +TEST(Numeric_decimal, big_integral_with_negative_exponent_to_uint64) { + const sourcemeta::core::Decimal value{"10000000000000000000e-1"}; + EXPECT_TRUE(value.is_integral()); + EXPECT_TRUE(value.is_uint64()); + EXPECT_EQ(value.to_uint64(), 1000000000000000000ULL); +} + +TEST(Numeric_decimal, big_integral_with_negative_exponent_to_int64) { + const sourcemeta::core::Decimal value{"-10000000000000000000e-1"}; + EXPECT_TRUE(value.is_integral()); + EXPECT_EQ(value.to_int64(), -1000000000000000000LL); +} + +TEST(Numeric_decimal, nan_with_payload_longer_than_storage_saturates) { + const sourcemeta::core::Decimal value{"NaN99999999999999999999999"}; + EXPECT_TRUE(value.is_nan()); + EXPECT_EQ(value.nan_payload(), std::numeric_limits::max()); +} + +TEST(Numeric_decimal, parse_extreme_negative_exponent_does_not_overflow) { + const sourcemeta::core::Decimal value{"1.5e-2147483648"}; + EXPECT_TRUE(value.is_finite()); +} diff --git a/test/numeric/numeric_util_test.cc b/test/numeric/numeric_util_test.cc index 9701a25e20..ca81121d30 100644 --- a/test/numeric/numeric_util_test.cc +++ b/test/numeric/numeric_util_test.cc @@ -497,6 +497,21 @@ TEST(Numeric_util, 
count_multiples_negative_range) { std::uint64_t{3}); } +TEST(Numeric_util, count_multiples_minimum_is_int64_min) { + EXPECT_EQ(sourcemeta::core::count_multiples( + std::numeric_limits::min(), + std::numeric_limits::min(), std::int64_t{1}), + std::uint64_t{1}); +} + +TEST(Numeric_util, count_multiples_range_from_int64_min) { + // Multiples of 2 in [INT64_MIN, INT64_MIN + 5]: INT64_MIN, +2, +4 = 3 + EXPECT_EQ(sourcemeta::core::count_multiples( + std::numeric_limits::min(), + std::numeric_limits::min() + 5, std::int64_t{2}), + std::uint64_t{3}); +} + TEST(Numeric_util, count_multiples_mixed_range) { // Multiples of 3 in [-7, 5]: -6, -3, 0, 3 = 4 EXPECT_EQ(sourcemeta::core::count_multiples(std::int64_t{-7}, std::int64_t{5}, diff --git a/test/process/process_spawn_test_unix.cc b/test/process/process_spawn_test_unix.cc index 0ffc5d8975..ba48c200d7 100644 --- a/test/process/process_spawn_test_unix.cc +++ b/test/process/process_spawn_test_unix.cc @@ -58,6 +58,21 @@ TEST(Process_spawn, echo_with_arguments) { EXPECT_EQ(exit_code, 0); } +TEST(Process_spawn, argument_with_spaces_is_a_single_argument) { + const int exit_code{sourcemeta::core::spawn( + "/bin/sh", + {"-c", "test \"$#\" -eq 1 && test \"$1\" = \"alpha beta gamma\"", "sh", + "alpha beta gamma"})}; + EXPECT_EQ(exit_code, 0); +} + +TEST(Process_spawn, empty_argument_is_preserved) { + const int exit_code{sourcemeta::core::spawn( + "/bin/sh", + {"-c", "test \"$#\" -eq 2 && test -z \"$1\"", "sh", "", "second"})}; + EXPECT_EQ(exit_code, 0); +} + TEST(Process_spawn, pwd_with_custom_directory) { const int exit_code{ sourcemeta::core::spawn("pwd", {}, std::filesystem::path{"/tmp"})}; diff --git a/test/punycode/punycode_decode_test.cc b/test/punycode/punycode_decode_test.cc index 633cb68c8c..fe4f8f2277 100644 --- a/test/punycode/punycode_decode_test.cc +++ b/test/punycode/punycode_decode_test.cc @@ -265,6 +265,21 @@ TEST(Punycode_decode, error_non_basic_before_delimiter) { sourcemeta::core::PunycodeError); } 
+TEST(Punycode_decode, error_leading_delimiter_only) { + EXPECT_THROW(sourcemeta::core::punycode_to_utf32("-"), + sourcemeta::core::PunycodeError); +} + +TEST(Punycode_decode, error_leading_delimiter_with_body) { + EXPECT_THROW(sourcemeta::core::punycode_to_utf32("-abc"), + sourcemeta::core::PunycodeError); +} + +TEST(Punycode_decode, trailing_delimiter_basic_only) { + const std::u32string expected{0x0061}; + EXPECT_EQ(sourcemeta::core::punycode_to_utf32("a-"), expected); +} + TEST(Punycode_decode, case_insensitive_basic_portion_preserved) { // The basic portion (before delimiter) preserves original case const std::u32string expected_lower{0x0061, 0x0062, 0x0063}; diff --git a/test/punycode/punycode_encode_test.cc b/test/punycode/punycode_encode_test.cc index c373edcae9..3270e0acd5 100644 --- a/test/punycode/punycode_encode_test.cc +++ b/test/punycode/punycode_encode_test.cc @@ -245,6 +245,18 @@ TEST(Punycode_encode, long_string_non_ascii) { EXPECT_EQ(sourcemeta::core::utf32_to_punycode(input), "fiqaaaa8796ababbb"); } +TEST(Punycode_encode, error_surrogate_code_point) { + const std::u32string input{0xD800}; + EXPECT_THROW(sourcemeta::core::utf32_to_punycode(input), + sourcemeta::core::PunycodeError); +} + +TEST(Punycode_encode, error_code_point_above_maximum) { + const std::u32string input{0x110000}; + EXPECT_THROW(sourcemeta::core::utf32_to_punycode(input), + sourcemeta::core::PunycodeError); +} + TEST(Punycode_encode, error_utf8_bad_start_byte) { // 0xFF is never valid as a UTF-8 start byte EXPECT_THROW(sourcemeta::core::utf8_to_punycode("\xFF"), diff --git a/test/regex/regex_test.cc b/test/regex/regex_test.cc index 9c5dd14b90..f11e688376 100644 --- a/test/regex/regex_test.cc +++ b/test/regex/regex_test.cc @@ -82,3 +82,10 @@ TEST(Regex, to_regex_default_string_view_does_not_invoke_ub) { const auto regex{sourcemeta::core::to_regex(pattern)}; EXPECT_TRUE(regex.has_value()); } + +TEST(Regex, catastrophic_backtracking_terminates) { + const auto 
regex{sourcemeta::core::to_regex("(a+)+$")}; + EXPECT_TRUE(regex.has_value()); + const std::string value{std::string(64, 'a') + "!"}; + EXPECT_FALSE(sourcemeta::core::matches(regex.value(), value)); +} diff --git a/test/text/text_to_title_case_test.cc b/test/text/text_to_title_case_test.cc index 86eb57ee76..be150d3b45 100644 --- a/test/text/text_to_title_case_test.cc +++ b/test/text/text_to_title_case_test.cc @@ -177,3 +177,14 @@ TEST(Text_to_title_case, space_in_input_is_not_a_separator) { sourcemeta::core::to_title_case(value); EXPECT_EQ(value, "Hello world"); } + +TEST(Text_to_title_case, non_ascii_byte_is_not_uppercased) { + std::string value{}; + value.push_back(static_cast(0xE9)); + value.append("hello"); + sourcemeta::core::to_title_case(value); + std::string expected{}; + expected.push_back(static_cast(0xE9)); + expected.append("Hello"); + EXPECT_EQ(value, expected); +} diff --git a/test/time/asctime_test.cc b/test/time/asctime_test.cc index c1d02d4c47..0cd74b01d3 100644 --- a/test/time/asctime_test.cc +++ b/test/time/asctime_test.cc @@ -102,6 +102,42 @@ TEST(Time_asctime, parse_rejects_sign_prefix_in_day) { sourcemeta::core::from_asctime("Sun Nov +6 08:49:37 1994").has_value()); } +// RFC 9110 §5.6.7: the day-of-month must be valid for the given month and year +TEST(Time_asctime, parse_rejects_february_thirtieth) { + EXPECT_FALSE( + sourcemeta::core::from_asctime("Sun Feb 30 08:49:37 2015").has_value()); +} + +// RFC 9110 §5.6.7: 2015 is not a leap year so February has only 28 days +TEST(Time_asctime, parse_rejects_february_twenty_ninth_non_leap) { + EXPECT_FALSE( + sourcemeta::core::from_asctime("Sun Feb 29 08:49:37 2015").has_value()); +} + +// RFC 9110 §5.6.7: the day-of-month must be at least one +TEST(Time_asctime, parse_rejects_zero_day) { + EXPECT_FALSE( + sourcemeta::core::from_asctime("Sun Nov 00 08:49:37 1994").has_value()); +} + +// RFC 9110 §5.6.7: the hour must be in the range 00-23 +TEST(Time_asctime, parse_rejects_hour_twenty_four) { + 
EXPECT_FALSE( + sourcemeta::core::from_asctime("Sun Nov 6 24:49:37 1994").has_value()); +} + +// RFC 9110 §5.6.7: the minute must be in the range 00-59 +TEST(Time_asctime, parse_rejects_minute_sixty) { + EXPECT_FALSE( + sourcemeta::core::from_asctime("Sun Nov 6 08:60:37 1994").has_value()); +} + +// RFC 9110 §5.6.7: the second must not exceed a leap second +TEST(Time_asctime, parse_rejects_second_sixty_one) { + EXPECT_FALSE( + sourcemeta::core::from_asctime("Sun Nov 6 08:49:61 1994").has_value()); +} + TEST(Time_asctime, format_output_length_is_24) { const auto point{std::chrono::system_clock::from_time_t(0)}; EXPECT_EQ(sourcemeta::core::to_asctime(point).size(), 24u); diff --git a/test/time/imf_fixdate_test.cc b/test/time/imf_fixdate_test.cc index 4332852a27..b6df91b9b2 100644 --- a/test/time/imf_fixdate_test.cc +++ b/test/time/imf_fixdate_test.cc @@ -173,6 +173,62 @@ TEST(Time_imf_fixdate, parse_rejects_asctime_shape) { .has_value()); } +// RFC 9110 §5.6.7: the day-of-month must be valid for the given month and year +TEST(Time_imf_fixdate, parse_rejects_february_thirtieth) { + EXPECT_FALSE( + sourcemeta::core::from_imf_fixdate("Mon, 30 Feb 2015 11:28:00 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: April has only 30 days +TEST(Time_imf_fixdate, parse_rejects_april_thirty_first) { + EXPECT_FALSE( + sourcemeta::core::from_imf_fixdate("Wed, 31 Apr 2015 11:28:00 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: 2015 is not a leap year so February has only 28 days +TEST(Time_imf_fixdate, parse_rejects_february_twenty_ninth_non_leap) { + EXPECT_FALSE( + sourcemeta::core::from_imf_fixdate("Sun, 29 Feb 2015 11:28:00 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: 2020 is a leap year so February has 29 days +TEST(Time_imf_fixdate, parse_accepts_february_twenty_ninth_leap) { + EXPECT_TRUE( + sourcemeta::core::from_imf_fixdate("Sat, 29 Feb 2020 11:28:00 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: the day-of-month must be at least one +TEST(Time_imf_fixdate, 
parse_rejects_zero_day) { + EXPECT_FALSE( + sourcemeta::core::from_imf_fixdate("Wed, 00 Oct 2015 11:28:00 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: the hour must be in the range 00-23 +TEST(Time_imf_fixdate, parse_rejects_hour_twenty_four) { + EXPECT_FALSE( + sourcemeta::core::from_imf_fixdate("Wed, 21 Oct 2015 24:28:00 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: the minute must be in the range 00-59 +TEST(Time_imf_fixdate, parse_rejects_minute_sixty) { + EXPECT_FALSE( + sourcemeta::core::from_imf_fixdate("Wed, 21 Oct 2015 11:60:00 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: the second must not exceed a leap second +TEST(Time_imf_fixdate, parse_rejects_second_sixty_one) { + EXPECT_FALSE( + sourcemeta::core::from_imf_fixdate("Wed, 21 Oct 2015 11:28:61 GMT") + .has_value()); +} + TEST(Time_imf_fixdate, format_epoch) { const auto point{std::chrono::system_clock::from_time_t(0)}; EXPECT_EQ(sourcemeta::core::to_imf_fixdate(point), diff --git a/test/time/rfc850_date_test.cc b/test/time/rfc850_date_test.cc index 3c2f9c6a32..1514cd6b73 100644 --- a/test/time/rfc850_date_test.cc +++ b/test/time/rfc850_date_test.cc @@ -137,6 +137,41 @@ TEST(Time_rfc850_date, parse_rejects_lowercase_gmt) { .has_value()); } +// RFC 9110 §5.6.7: the day-of-month must be valid for the given month and year +TEST(Time_rfc850_date, parse_rejects_february_thirtieth) { + EXPECT_FALSE( + sourcemeta::core::from_rfc850_date("Monday, 30-Feb-15 11:28:00 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: 2015 is not a leap year so February has only 28 days +TEST(Time_rfc850_date, parse_rejects_february_twenty_ninth_non_leap) { + EXPECT_FALSE( + sourcemeta::core::from_rfc850_date("Sunday, 29-Feb-15 11:28:00 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: the hour must be in the range 00-23 +TEST(Time_rfc850_date, parse_rejects_hour_twenty_four) { + EXPECT_FALSE( + sourcemeta::core::from_rfc850_date("Sunday, 06-Nov-94 24:49:37 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: the minute must 
be in the range 00-59 +TEST(Time_rfc850_date, parse_rejects_minute_sixty) { + EXPECT_FALSE( + sourcemeta::core::from_rfc850_date("Sunday, 06-Nov-94 08:60:37 GMT") + .has_value()); +} + +// RFC 9110 §5.6.7: the second must not exceed a leap second +TEST(Time_rfc850_date, parse_rejects_second_sixty_one) { + EXPECT_FALSE( + sourcemeta::core::from_rfc850_date("Sunday, 06-Nov-94 08:49:61 GMT") + .has_value()); +} + TEST(Time_rfc850_date, parse_y2k_boundary_at_threshold) { const auto point{ sourcemeta::core::from_rfc850_date("Friday, 01-Jan-76 00:00:00 GMT")}; diff --git a/test/uri/uri_canonicalize_test.cc b/test/uri/uri_canonicalize_test.cc index 58b5b8ce22..a86898525d 100644 --- a/test/uri/uri_canonicalize_test.cc +++ b/test/uri/uri_canonicalize_test.cc @@ -55,6 +55,13 @@ TEST(URI_canonicalize, example_8) { EXPECT_EQ(uri.recompose(), "http://example.com/case-insensitive-host"); } +// A percent-encoded uppercase letter in the host folds to lowercase +TEST(URI_canonicalize, percent_encoded_uppercase_host_letter) { + sourcemeta::core::URI uri{"http://%41EXAMPLE.com/path"}; + uri.canonicalize(); + EXPECT_EQ(uri.recompose(), "http://aexample.com/path"); +} + // Paths are case sensitive TEST(URI_canonicalize, example_9) { sourcemeta::core::URI uri{"hTtP://exAmpLe.com/case-SENSITIVE-path"}; @@ -284,7 +291,8 @@ TEST(URI_canonicalize, complex_case) { TEST(URI_canonicalize, component_aware_decode) { sourcemeta::core::URI uri{"http://example.com/%3a%3b%2f?foo%3dbar#baz%2fqux"}; uri.canonicalize(); - EXPECT_EQ(uri.recompose(), "http://example.com/:;%2F?foo=bar#baz/qux"); + EXPECT_EQ(uri.recompose(), + "http://example.com/%3A%3B%2F?foo%3Dbar#baz%2Fqux"); } TEST(URI_canonicalize, fragment_encoded_colon) { @@ -293,7 +301,7 @@ TEST(URI_canonicalize, fragment_encoded_colon) { uri.canonicalize(); EXPECT_EQ( uri.recompose(), - "https://www.example.com#/$defs/https:~1~1example.com~1schema/type"); + "https://www.example.com#/$defs/https%3A~1~1example.com~1schema/type"); } 
TEST(URI_canonicalize, relative_path_no_canonicalize) { diff --git a/test/uri/uri_path_test.cc b/test/uri/uri_path_test.cc index 37ac7ff244..108490aec9 100644 --- a/test/uri/uri_path_test.cc +++ b/test/uri/uri_path_test.cc @@ -2,6 +2,7 @@ #include +#include #include // Getter @@ -171,40 +172,30 @@ TEST(URI_path_setter, set_relative_path) { TEST(URI_path_setter, set_path_with_query) { sourcemeta::core::URI uri{"https://example.com"}; - uri.path("/foo%20bar?query=value#fragment"); - EXPECT_EQ(uri.path().value(), "/foo%20bar"); - EXPECT_EQ(uri.recompose(), "https://example.com/foo%20bar"); + EXPECT_THROW(uri.path("/foo%20bar?query=value#fragment"), + sourcemeta::core::URIError); std::string path{"/fooz%20bar?query=value#fragment"}; - uri.path(std::move(path)); - EXPECT_EQ(uri.path().value(), "/fooz%20bar"); - EXPECT_EQ(uri.recompose(), "https://example.com/fooz%20bar"); + EXPECT_THROW(uri.path(std::move(path)), sourcemeta::core::URIError); } TEST(URI_path_setter, set_path_with_fragment) { sourcemeta::core::URI uri{"https://example.com"}; - uri.path("/foo%20bar#fragment"); - EXPECT_EQ(uri.path().value(), "/foo%20bar"); - EXPECT_EQ(uri.recompose(), "https://example.com/foo%20bar"); + EXPECT_THROW(uri.path("/foo%20bar#fragment"), sourcemeta::core::URIError); std::string path{"/fooz%20bar#fragment"}; - uri.path(std::move(path)); - EXPECT_EQ(uri.path().value(), "/fooz%20bar"); - EXPECT_EQ(uri.recompose(), "https://example.com/fooz%20bar"); + EXPECT_THROW(uri.path(std::move(path)), sourcemeta::core::URIError); } TEST(URI_path_setter, set_path_with_query_and_fragment) { sourcemeta::core::URI uri{"https://example.com/old?query=value#fragment"}; - uri.path("/new?query=value#fragment"); - EXPECT_EQ(uri.path().value(), "/new"); - EXPECT_EQ(uri.recompose(), "https://example.com/new?query=value#fragment"); + EXPECT_THROW(uri.path("/new?query=value#fragment"), + sourcemeta::core::URIError); std::string path{"/newer?query=value#fragment"}; - uri.path(std::move(path)); - 
EXPECT_EQ(uri.path().value(), "/newer"); - EXPECT_EQ(uri.recompose(), "https://example.com/newer?query=value#fragment"); + EXPECT_THROW(uri.path(std::move(path)), sourcemeta::core::URIError); } TEST(URI_path_setter_no_scheme, set_path_on_host_only) { @@ -690,7 +681,7 @@ TEST(URI_path_setter, getter_setter_invariant_relative_with_leading_slash) { EXPECT_EQ(uri.path().value(), "/test/no-serve/schema"); EXPECT_EQ(uri.recompose(), "/test/no-serve/schema"); - const auto original_path{uri.path().value()}; + const std::string original_path{uri.path().value()}; const auto original_recompose{uri.recompose()}; uri.path(original_path); @@ -704,7 +695,7 @@ TEST(URI_path_setter, getter_setter_invariant_relative_without_leading_slash) { EXPECT_EQ(uri.path().value(), "test/no-serve/schema"); EXPECT_EQ(uri.recompose(), "test/no-serve/schema"); - const auto original_path{uri.path().value()}; + const std::string original_path{uri.path().value()}; const auto original_recompose{uri.recompose()}; uri.path(original_path); @@ -718,7 +709,7 @@ TEST(URI_path_setter, getter_setter_invariant_absolute_uri) { EXPECT_EQ(uri.path().value(), "/foo/bar"); EXPECT_EQ(uri.recompose(), "http://example.com/foo/bar"); - const auto original_path{uri.path().value()}; + const std::string original_path{uri.path().value()}; const auto original_recompose{uri.recompose()}; uri.path(original_path); @@ -732,7 +723,7 @@ TEST(URI_path_setter, getter_setter_invariant_relative_dotdot) { EXPECT_EQ(uri.path().value(), "../foo/bar"); EXPECT_EQ(uri.recompose(), "../foo/bar"); - const auto original_path{uri.path().value()}; + const std::string original_path{uri.path().value()}; const auto original_recompose{uri.recompose()}; uri.path(original_path); diff --git a/test/yaml/yaml_parse_test.cc b/test/yaml/yaml_parse_test.cc index 7d9ecdcec5..b001675b80 100644 --- a/test/yaml/yaml_parse_test.cc +++ b/test/yaml/yaml_parse_test.cc @@ -473,3 +473,23 @@ TEST(YAML_parse, invalid_unicode_escape_8) { FAIL() << "Expected 
YAMLParseError, got different exception"; } } + +TEST(YAML_parse, exponential_alias_expansion_is_bounded) { + const std::string input{"a: &a [ x, x, x, x, x, x, x, x, x, x ]\n" + "b: &b [ *a, *a, *a, *a, *a, *a, *a, *a, *a, *a ]\n" + "c: &c [ *b, *b, *b, *b, *b, *b, *b, *b, *b, *b ]\n" + "d: &d [ *c, *c, *c, *c, *c, *c, *c, *c, *c, *c ]\n" + "e: &e [ *d, *d, *d, *d, *d, *d, *d, *d, *d, *d ]\n" + "f: &f [ *e, *e, *e, *e, *e, *e, *e, *e, *e, *e ]\n" + "g: &g [ *f, *f, *f, *f, *f, *f, *f, *f, *f, *f ]\n" + "h: &h [ *g, *g, *g, *g, *g, *g, *g, *g, *g, *g ]\n" + "i: &i [ *h, *h, *h, *h, *h, *h, *h, *h, *h, *h ]\n"}; + try { + sourcemeta::core::parse_yaml(input); + FAIL() << "Expected YAMLParseError to be thrown"; + } catch (const sourcemeta::core::YAMLParseError &) { + SUCCEED(); + } catch (...) { + FAIL() << "Expected YAMLParseError, got different exception"; + } +}