From 27f195d859588c94feffed05a93e607895d21940 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Sat, 20 Dec 2025 10:45:25 +0000 Subject: [PATCH] Optimize filter_element_types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimization achieves an **11% speedup** through two key changes that reduce Python overhead: **1. Generator Expression in `exactly_one()`** - Changed `sum([(arg is not None and arg != "") for arg in kwargs.values()])` to `sum((arg is not None and arg != "") for arg in kwargs.values())` - Eliminates creation of an intermediate list, reducing memory allocation overhead - Though this function shows minimal improvement in isolation, it's called frequently (94 times in the profiled run) **2. List Comprehensions Replace Manual Loops in `filter_element_types()`** - Replaced explicit `for` loops that called `filtered_elements.append()` with direct list comprehensions - `return [element for element in elements if type(element) in include_element_types]` - `return [element for element in elements if type(element) not in exclude_element_types]` **Why This Speeds Up Execution:** - **Reduced Python bytecode overhead**: List comprehensions compile to a dedicated `LIST_APPEND` bytecode instruction, so they execute faster than explicit Python loops that look up and call `.append()` on every iteration - **Fewer function calls**: Eliminates repeated `append()` method calls which have per-call overhead - **Less bookkeeping**: The result list is built and returned in a single expression, with no separate `filtered_elements` accumulator variable **Performance Impact by Test Case:** - **Large datasets benefit most**: Tests with 1000+ elements show 23-40% improvements (e.g., `test_large_number_of_elements_include` goes from 36.9μs to 26.3μs) - **Small datasets have modest overhead**: Basic tests with few elements run 5-20% slower due to list comprehension setup costs - **The optimization is particularly effective when filtering large collections**, which is typical
for document processing workflows where this function likely operates on many document elements. The optimization maintains identical functionality while providing substantial performance gains for realistic workloads involving larger element collections. --- unstructured/partition/common/common.py | 3 ++- unstructured/staging/base.py | 14 ++------------ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/unstructured/partition/common/common.py b/unstructured/partition/common/common.py index d18fc8c87b..4d10c78acc 100644 --- a/unstructured/partition/common/common.py +++ b/unstructured/partition/common/common.py @@ -338,7 +338,8 @@ def exactly_one(**kwargs: Any) -> None: Example: >>> exactly_one(filename=filename, file=file, text=text, url=url) """ - if sum([(arg is not None and arg != "") for arg in kwargs.values()]) != 1: + # Use generator expression to avoid creating a temporary list, reducing memory use + if sum((arg is not None and arg != "") for arg in kwargs.values()) != 1: names = list(kwargs.keys()) if len(names) > 1: message = f"Exactly one of {', '.join(names[:-1])} and {names[-1]} must be specified."
diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index aab1b1647f..738b077fbe 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -462,21 +462,11 @@ def filter_element_types( include_element_types=include_element_types, exclude_element_types=exclude_element_types, ) - - filtered_elements: list[Element] = [] if include_element_types: - for element in elements: - if type(element) in include_element_types: - filtered_elements.append(element) - - return filtered_elements + return [element for element in elements if type(element) in include_element_types] elif exclude_element_types: - for element in elements: - if type(element) not in exclude_element_types: - filtered_elements.append(element) - - return filtered_elements + return [element for element in elements if type(element) not in exclude_element_types] return list(elements)