From 2167c4fcf3b3411fabeca5dce64dfd89d932d399 Mon Sep 17 00:00:00 2001 From: Vizonex Date: Wed, 24 Dec 2025 19:12:41 -0600 Subject: [PATCH 1/3] in lexbor use python's heap instead of C's heap as an extra performance enhancement --- selectolax/lexbor.pxd | 17 +++++++++++++++++ selectolax/lexbor.pyx | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/selectolax/lexbor.pxd b/selectolax/lexbor.pxd index 361eb53..07fb3c3 100644 --- a/selectolax/lexbor.pxd +++ b/selectolax/lexbor.pxd @@ -1,6 +1,9 @@ from libc.stdint cimport uint8_t, uint32_t, uintptr_t + + + cdef extern from "lexbor/core/core.h" nogil: ctypedef uint32_t lxb_codepoint_t ctypedef unsigned char lxb_char_t @@ -32,6 +35,20 @@ cdef extern from "lexbor/core/core.h" nogil: lexbor_str_t* lexbor_str_create() lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str) +cdef extern from "lexbor/core/lexbor.h" nogil: + + ctypedef void *(*lexbor_memory_malloc_f)(size_t size) nogil + ctypedef void *(*lexbor_memory_realloc_f)(void *dst, size_t size) nogil + ctypedef void *(*lexbor_memory_calloc_f)(size_t num, size_t size) nogil + ctypedef void (*lexbor_memory_free_f)(void *dst) nogil + lxb_status_t lexbor_memory_setup( + lexbor_memory_malloc_f new_malloc, + lexbor_memory_realloc_f new_realloc, + lexbor_memory_calloc_f new_calloc, + lexbor_memory_free_f new_free + ) + + cdef extern from "lexbor/html/html.h" nogil: ctypedef unsigned int lxb_html_document_opt_t diff --git a/selectolax/lexbor.pyx b/selectolax/lexbor.pyx index 6d036ca..d3a305c 100644 --- a/selectolax/lexbor.pyx +++ b/selectolax/lexbor.pyx @@ -1,5 +1,13 @@ from cpython.bool cimport bool - +from cpython.buffer cimport PyBUF_SIMPLE, PyBuffer_Release, PyObject_GetBuffer +from cpython.bytes cimport PyBytes_FromStringAndSize +from cpython.exc cimport PyErr_SetObject +from cpython.mem cimport ( + PyMem_RawCalloc, + PyMem_RawFree, + PyMem_RawMalloc, + PyMem_RawRealloc +) _ENCODING = 'UTF-8' include "base.pxi" @@ -24,7 +32,7 @@ cdef class LexborHTMLParser: html : str (unicode) or bytes """ - def __init__(self, html: str | bytes, is_fragment: bool = False): + def __init__(self, html: str | bytes | bytearray | memoryview, is_fragment: bool = False): """Create a parser and load HTML. Parameters @@ -47,6 +55,7 @@ cdef class LexborHTMLParser: """ cdef size_t html_len cdef object bytes_html + self._is_fragment = is_fragment self._fragment_document = NULL self._selector = None @@ -55,6 +64,7 @@ cdef class LexborHTMLParser: self._parse_html(bytes_html, html_len) self.raw_html = bytes_html + cdef inline lxb_html_document_t* main_document(self) nogil: if self._is_fragment: return self._fragment_document @@ -752,3 +762,21 @@ cdef class LexborHTMLParser: dom_node = element return LexborNode.new(dom_node, self) + + + +# Putting lexbor on python's heap is better than putting it +# onto C's Heap, because python's Garbage collector can collect +# this memory after use and has the bonus of gaining access to +# mimalloc which python uses under the hood... +if lexbor_memory_setup( + PyMem_RawMalloc, + PyMem_RawRealloc, + PyMem_RawCalloc, + PyMem_RawFree +) != LXB_STATUS_OK: + # This will almost never happen due to the code in both the windows and posix versions + # but if something were to happen this excecption on import should be triggered... + raise SelectolaxError("Can't initalize allocators from lexbor_memory_setup(...)") + + From 7e2b1f2d2c3121da8ec446a4dae0659e2ca0258e Mon Sep 17 00:00:00 2001 From: Vizonex <114684698+Vizonex@users.noreply.github.com> Date: Wed, 24 Dec 2025 19:20:02 -0600 Subject: [PATCH 2/3] Remove bytearray and memoryview typehints from constructor --- selectolax/lexbor.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selectolax/lexbor.pyx b/selectolax/lexbor.pyx index d3a305c..ffe2e54 100644 --- a/selectolax/lexbor.pyx +++ b/selectolax/lexbor.pyx @@ -32,7 +32,7 @@ cdef class LexborHTMLParser: html : str (unicode) or bytes """ - def __init__(self, html: str | bytes | bytearray | memoryview, is_fragment: bool = False): + def __init__(self, html: str | bytes, is_fragment: bool = False): """Create a parser and load HTML. Parameters From 1463b0f34db233ae759edc888268309821132072 Mon Sep 17 00:00:00 2001 From: Vizonex Date: Wed, 24 Dec 2025 19:27:07 -0600 Subject: [PATCH 3/3] fix linting --- selectolax/lexbor.pxd | 13 ++++--------- selectolax/lexbor.pyx | 19 ++++++------------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/selectolax/lexbor.pxd b/selectolax/lexbor.pxd index 07fb3c3..5a7ef1d 100644 --- a/selectolax/lexbor.pxd +++ b/selectolax/lexbor.pxd @@ -1,9 +1,5 @@ from libc.stdint cimport uint8_t, uint32_t, uintptr_t - - - - cdef extern from "lexbor/core/core.h" nogil: ctypedef uint32_t lxb_codepoint_t ctypedef unsigned char lxb_char_t @@ -36,15 +32,14 @@ cdef extern from "lexbor/core/core.h" nogil: lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str) cdef extern from "lexbor/core/lexbor.h" nogil: - ctypedef void *(*lexbor_memory_malloc_f)(size_t size) nogil ctypedef void *(*lexbor_memory_realloc_f)(void *dst, size_t size) nogil ctypedef void *(*lexbor_memory_calloc_f)(size_t num, size_t size) nogil - ctypedef void (*lexbor_memory_free_f)(void *dst) nogil + ctypedef void (*lexbor_memory_free_f)(void *dst) nogil lxb_status_t lexbor_memory_setup( - lexbor_memory_malloc_f new_malloc, - lexbor_memory_realloc_f new_realloc, - lexbor_memory_calloc_f new_calloc, + lexbor_memory_malloc_f new_malloc, + lexbor_memory_realloc_f new_realloc, + lexbor_memory_calloc_f new_calloc, lexbor_memory_free_f new_free ) diff --git a/selectolax/lexbor.pyx b/selectolax/lexbor.pyx index d3a305c..e47b323 100644 --- a/selectolax/lexbor.pyx +++ b/selectolax/lexbor.pyx @@ -1,10 +1,8 @@ from cpython.bool cimport bool -from cpython.buffer cimport PyBUF_SIMPLE, PyBuffer_Release, PyObject_GetBuffer -from cpython.bytes cimport PyBytes_FromStringAndSize from cpython.exc cimport PyErr_SetObject from cpython.mem cimport ( - PyMem_RawCalloc, - PyMem_RawFree, + PyMem_RawCalloc, + PyMem_RawFree, PyMem_RawMalloc, PyMem_RawRealloc ) @@ -55,7 +53,7 @@ cdef class LexborHTMLParser: """ cdef size_t html_len cdef object bytes_html - + self._is_fragment = is_fragment self._fragment_document = NULL self._selector = None @@ -64,7 +62,6 @@ cdef class LexborHTMLParser: self._parse_html(bytes_html, html_len) self.raw_html = bytes_html - cdef inline lxb_html_document_t* main_document(self) nogil: if self._is_fragment: return self._fragment_document @@ -763,10 +760,8 @@ cdef class LexborHTMLParser: return LexborNode.new(dom_node, self) - - -# Putting lexbor on python's heap is better than putting it -# onto C's Heap, because python's Garbage collector can collect +# Putting lexbor on python's heap is better than putting it +# onto C's Heap, because python's Garbage collector can collect # this memory after use and has the bonus of gaining access to # mimalloc which python uses under the hood... if lexbor_memory_setup( @@ -775,8 +770,6 @@ if lexbor_memory_setup( PyMem_RawCalloc, PyMem_RawFree ) != LXB_STATUS_OK: - # This will almost never happen due to the code in both the windows and posix versions + # This will almost never happen due to the code in both the windows and posix versions # but if something were to happen this excecption on import should be triggered... raise SelectolaxError("Can't initalize allocators from lexbor_memory_setup(...)") - -