From a7f3592f5efa7cbe1c397983f7506fb63427ace5 Mon Sep 17 00:00:00 2001 From: uwezkhan06 Date: Tue, 28 Apr 2026 19:26:59 +0530 Subject: [PATCH] Validate Python buffer inputs before converting them to byte views --- python/_re2.cc | 174 +++++++++++++++++++++++++++---------------------- 1 file changed, 97 insertions(+), 77 deletions(-) diff --git a/python/_re2.cc b/python/_re2.cc index 22f092b23..9934a323b 100644 --- a/python/_re2.cc +++ b/python/_re2.cc @@ -17,7 +17,7 @@ #include "pybind11/gil.h" #include "pybind11/pybind11.h" #include "pybind11/pytypes.h" -#include "pybind11/stl.h" // IWYU pragma: keep +#include "pybind11/stl.h" // IWYU pragma: keep #include "re2/filtered_re2.h" #include "re2/re2.h" #include "re2/set.h" @@ -37,21 +37,43 @@ namespace py = pybind11; // a py::buffer_info in order to access the actual bytes. Under the hood, // the py::buffer_info manages a reference count to the py::buffer, so it // must be constructed and subsequently destructed while holding the GIL. -static inline absl::string_view FromBytes(const py::buffer_info& bytes) { - char* data = reinterpret_cast(bytes.ptr); +static inline void ValidateBytesBuffer(const py::buffer_info &bytes) { + if (bytes.itemsize != 1) { + throw py::value_error("buffer must have single-byte elements"); + } + if (bytes.ndim != 1) { + throw py::value_error("buffer must be one-dimensional"); + } + if (bytes.shape.size() != 1 || bytes.strides.size() != 1 || + bytes.strides[0] != 1) { + throw py::value_error("buffer must be contiguous"); + } +} + +static inline absl::string_view FromBytes(const py::buffer_info &bytes) { + ValidateBytesBuffer(bytes); + char *data = reinterpret_cast(bytes.ptr); ssize_t size = bytes.size; return absl::string_view(data, size); } -static inline int OneCharLen(const char* ptr) { +static inline int OneCharLen(const char *ptr) { return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*ptr & 0xFF) >> 4]; } +static inline void ValidateSpan(absl::string_view text, ssize_t pos, + ssize_t endpos) { + if (pos < 0 || endpos < pos || endpos > static_cast(text.size())) { + throw py::value_error("invalid byte offsets"); + } +} + // Helper function for when Python encodes str to bytes and then needs to // convert str offsets to bytes offsets. Assumes that text is valid UTF-8. ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) { auto bytes = buffer.request(); auto text = FromBytes(bytes); + ValidateSpan(text, pos, pos); auto ptr = text.data() + pos; auto end = text.data() + text.size(); while (ptr < end && len > 0) { @@ -66,6 +88,7 @@ ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) { ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) { auto bytes = buffer.request(); auto text = FromBytes(bytes); + ValidateSpan(text, pos, endpos); auto ptr = text.data() + pos; auto end = text.data() + endpos; ssize_t len = 0; @@ -77,67 +100,66 @@ ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) { } std::unique_ptr RE2InitShim(py::buffer buffer, - const RE2::Options& options) { + const RE2::Options &options) { auto bytes = buffer.request(); auto pattern = FromBytes(bytes); return std::make_unique(pattern, options); } -py::bytes RE2ErrorShim(const RE2& self) { +py::bytes RE2ErrorShim(const RE2 &self) { // Return std::string as bytes. That is, without decoding to str. return self.error(); } -std::vector> RE2NamedCapturingGroupsShim( - const RE2& self) { +std::vector> +RE2NamedCapturingGroupsShim(const RE2 &self) { const int num_groups = self.NumberOfCapturingGroups(); std::vector> groups; groups.reserve(num_groups); - for (const auto& it : self.NamedCapturingGroups()) { + for (const auto &it : self.NamedCapturingGroups()) { groups.emplace_back(it.first, it.second); } return groups; } -std::vector RE2ProgramFanoutShim(const RE2& self) { +std::vector RE2ProgramFanoutShim(const RE2 &self) { std::vector histogram; self.ProgramFanout(&histogram); return histogram; } -std::vector RE2ReverseProgramFanoutShim(const RE2& self) { +std::vector RE2ReverseProgramFanoutShim(const RE2 &self) { std::vector histogram; self.ReverseProgramFanout(&histogram); return histogram; } -std::tuple RE2PossibleMatchRangeShim( - const RE2& self, int maxlen) { +std::tuple +RE2PossibleMatchRangeShim(const RE2 &self, int maxlen) { std::string min, max; // Return std::string as bytes. That is, without decoding to str. return {self.PossibleMatchRange(&min, &max, maxlen), min, max}; } -std::vector> RE2MatchShim(const RE2& self, - RE2::Anchor anchor, - py::buffer buffer, - ssize_t pos, - ssize_t endpos) { +std::vector> +RE2MatchShim(const RE2 &self, RE2::Anchor anchor, py::buffer buffer, + ssize_t pos, ssize_t endpos) { auto bytes = buffer.request(); auto text = FromBytes(bytes); - const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0 + ValidateSpan(text, pos, endpos); + const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0 std::vector groups; groups.resize(num_groups); py::gil_scoped_release release_gil; if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) { // Ensure that groups are null before converting to spans! - for (auto& it : groups) { + for (auto &it : groups) { it = absl::string_view(); } } std::vector> spans; spans.reserve(num_groups); - for (const auto& it : groups) { + for (const auto &it : groups) { if (it.data() == NULL) { spans.emplace_back(-1, -1); } else { @@ -156,20 +178,20 @@ py::bytes RE2QuoteMetaShim(py::buffer buffer) { } class Set { - public: - Set(RE2::Anchor anchor, const RE2::Options& options) +public: + Set(RE2::Anchor anchor, const RE2::Options &options) : set_(options, anchor) {} ~Set() = default; // Not copyable or movable. - Set(const Set&) = delete; - Set& operator=(const Set&) = delete; + Set(const Set &) = delete; + Set &operator=(const Set &) = delete; int Add(py::buffer buffer) { auto bytes = buffer.request(); auto pattern = FromBytes(bytes); - int index = set_.Add(pattern, /*error=*/NULL); // -1 on error + int index = set_.Add(pattern, /*error=*/NULL); // -1 on error return index; } @@ -187,23 +209,23 @@ class Set { return matches; } - private: +private: RE2::Set set_; }; class Filter { - public: +public: Filter() = default; ~Filter() = default; // Not copyable or movable. - Filter(const Filter&) = delete; - Filter& operator=(const Filter&) = delete; + Filter(const Filter &) = delete; + Filter &operator=(const Filter &) = delete; - int Add(py::buffer buffer, const RE2::Options& options) { + int Add(py::buffer buffer, const RE2::Options &options) { auto bytes = buffer.request(); auto pattern = FromBytes(bytes); - int index = -1; // not clobbered on error + int index = -1; // not clobbered on error filter_.Add(pattern, options, &index); return index; } @@ -244,11 +266,9 @@ class Filter { return matches; } - const RE2& GetRE2(int index) const { - return filter_.GetRE2(index); - } + const RE2 &GetRE2(int index) const { return filter_.GetRE2(index); } - private: +private: re2::FilteredRE2 filter_; std::unique_ptr set_; }; @@ -282,45 +302,45 @@ PYBIND11_MODULE(_re2, module) { encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1); options.def(py::init<>()) - .def_property("max_mem", // - &RE2::Options::max_mem, // - &RE2::Options::set_max_mem) // - .def_property("encoding", // - &RE2::Options::encoding, // - &RE2::Options::set_encoding) // - .def_property("posix_syntax", // - &RE2::Options::posix_syntax, // - &RE2::Options::set_posix_syntax) // - .def_property("longest_match", // - &RE2::Options::longest_match, // - &RE2::Options::set_longest_match) // - .def_property("log_errors", // - &RE2::Options::log_errors, // - &RE2::Options::set_log_errors) // - .def_property("literal", // - &RE2::Options::literal, // - &RE2::Options::set_literal) // - .def_property("never_nl", // - &RE2::Options::never_nl, // - &RE2::Options::set_never_nl) // - .def_property("dot_nl", // - &RE2::Options::dot_nl, // - &RE2::Options::set_dot_nl) // - .def_property("never_capture", // - &RE2::Options::never_capture, // - &RE2::Options::set_never_capture) // - .def_property("case_sensitive", // - &RE2::Options::case_sensitive, // - &RE2::Options::set_case_sensitive) // - .def_property("perl_classes", // - &RE2::Options::perl_classes, // - &RE2::Options::set_perl_classes) // - .def_property("word_boundary", // - &RE2::Options::word_boundary, // - &RE2::Options::set_word_boundary) // - .def_property("one_line", // - &RE2::Options::one_line, // - &RE2::Options::set_one_line); // + .def_property("max_mem", // + &RE2::Options::max_mem, // + &RE2::Options::set_max_mem) // + .def_property("encoding", // + &RE2::Options::encoding, // + &RE2::Options::set_encoding) // + .def_property("posix_syntax", // + &RE2::Options::posix_syntax, // + &RE2::Options::set_posix_syntax) // + .def_property("longest_match", // + &RE2::Options::longest_match, // + &RE2::Options::set_longest_match) // + .def_property("log_errors", // + &RE2::Options::log_errors, // + &RE2::Options::set_log_errors) // + .def_property("literal", // + &RE2::Options::literal, // + &RE2::Options::set_literal) // + .def_property("never_nl", // + &RE2::Options::never_nl, // + &RE2::Options::set_never_nl) // + .def_property("dot_nl", // + &RE2::Options::dot_nl, // + &RE2::Options::set_dot_nl) // + .def_property("never_capture", // + &RE2::Options::never_capture, // + &RE2::Options::set_never_capture) // + .def_property("case_sensitive", // + &RE2::Options::case_sensitive, // + &RE2::Options::set_case_sensitive) // + .def_property("perl_classes", // + &RE2::Options::perl_classes, // + &RE2::Options::set_perl_classes) // + .def_property("word_boundary", // + &RE2::Options::word_boundary, // + &RE2::Options::set_word_boundary) // + .def_property("one_line", // + &RE2::Options::one_line, // + &RE2::Options::set_one_line); // re2.def(py::init(&RE2InitShim)) .def("ok", &RE2::ok) @@ -336,7 +356,7 @@ PYBIND11_MODULE(_re2, module) { .def("Match", &RE2MatchShim) .def_static("QuoteMeta", &RE2QuoteMetaShim); - set.def(py::init()) + set.def(py::init()) .def("Add", &Set::Add) .def("Compile", &Set::Compile) .def("Match", &Set::Match); @@ -349,4 +369,4 @@ PYBIND11_MODULE(_re2, module) { py::return_value_policy::reference_internal); } -} // namespace re2_python +} // namespace re2_python \ No newline at end of file