From 339466b7d4b6b16b50a7ce5f801899251d678ae6 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:02:32 +0000 Subject: [PATCH] Optimize Anchor._link_annotate_element The optimization replaces expensive list concatenation operations with efficient in-place mutations. **Key Changes:** - **Eliminated list concatenation**: The original code used `(element.metadata.link_texts or []) + [link_text]`, which creates a new list every time, requiring memory allocation and copying of existing elements. - **Added conditional in-place appending**: The optimized version checks whether the list is not `None` and uses `.append()` to add elements directly, or creates a new single-element list only when necessary. Note that this mutates the existing list object, so any other reference to the same list also sees the appended entry, whereas the original always rebound the attribute to a fresh list. **Why This Is Faster:** - **Reduced memory allocations**: List concatenation with the `+` operator creates entirely new list objects, while `.append()` modifies existing lists in-place with O(1) amortized complexity. - **Eliminated unnecessary copying**: The original approach copies all existing list elements during concatenation, while the optimized version only adds new elements. - **Better cache locality**: In-place mutations keep data structures in the same memory location, improving CPU cache efficiency. **Performance Impact by Test Case:** - **Best gains (42-58% faster)**: Large-scale scenarios with many existing links benefit most, as they avoid copying hundreds of elements repeatedly. - **Moderate gains (11-27% faster)**: Standard use cases with empty/None lists still benefit from avoiding unnecessary list creation. - **Consistent improvement**: Even edge cases show 2-15% speedups, demonstrating the optimization's broad applicability. The 13% overall speedup comes from eliminating the most expensive operations identified by the line profiler: the list concatenation lines that consumed 27.6% and 13.8% of total execution time. 
--- unstructured/partition/html/parser.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py index 858dea0aee..faf19cc8ae 100644 --- a/unstructured/partition/html/parser.py +++ b/unstructured/partition/html/parser.py @@ -764,8 +764,15 @@ def _link_annotate_element(self, element: Element) -> Element: if not link_text or not link_url: return element - element.metadata.link_texts = (element.metadata.link_texts or []) + [link_text] - element.metadata.link_urls = (element.metadata.link_urls or []) + [link_url] + if element.metadata.link_texts is not None: + element.metadata.link_texts.append(link_text) + else: + element.metadata.link_texts = [link_text] + + if element.metadata.link_urls is not None: + element.metadata.link_urls.append(link_url) + else: + element.metadata.link_urls = [link_url] return element