Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ ignore = [
"ANN101", # Missing type annotation for self in method
"ANN102", # Missing type annotation for cls in classmethod
"PLR0913", # Too many arguments to function call (> 5)
"E501", # Line too long
"C901", # Function is too complex
"PLR0912", # Too many branches (>= 12)
]
Expand Down
12 changes: 6 additions & 6 deletions sec_parser/processing_engine/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,19 @@
HighlightedTextClassifier,
)
from sec_parser.processing_steps.image_classifier import ImageClassifier
from sec_parser.processing_steps.individual_semantic_element_extractor.individual_semantic_element_extractor import (
from sec_parser.processing_steps.individual_semantic_element_extractor.individual_semantic_element_extractor import ( # noqa: E501
IndividualSemanticElementExtractor,
)
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.image_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.image_check import ( # noqa: E501
ImageCheck,
)
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.table_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.table_check import ( # noqa: E501
TableCheck,
)
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.top_section_title_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.top_section_title_check import ( # noqa: E501
TopSectionTitleCheck,
)
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.xbrl_tag_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.xbrl_tag_check import ( # noqa: E501
XbrlTagCheck,
)
from sec_parser.processing_steps.introductory_section_classifier import (
Expand Down Expand Up @@ -62,7 +62,7 @@
from sec_parser.processing_steps.abstract_classes.abstract_processing_step import (
AbstractProcessingStep,
)
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import ( # noqa: E501
AbstractSingleElementCheck,
)
from sec_parser.semantic_elements.abstract_semantic_element import (
Expand Down
4 changes: 3 additions & 1 deletion sec_parser/processing_engine/html_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,9 @@ def get_approx_table_metrics(self) -> ApproxTableMetrics | None:
self._approx_table_metrics,
ApproxTableMetrics,
):
msg = f"Invalid type for _approx_table_metrics: {type(self._approx_table_metrics).__name__}"
msg = (f"Invalid type for _approx_table_metrics: "
f"{type(self._approx_table_metrics).__name__}"
)
raise ValueError(msg)

return self._approx_table_metrics
Expand Down
4 changes: 2 additions & 2 deletions sec_parser/processing_steps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
section identification, title parsing, and text extraction, etc.
"""

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
)
from sec_parser.processing_steps.abstract_classes.abstract_processing_step import (
AbstractProcessingStep,
)
from sec_parser.processing_steps.empty_element_classifier import EmptyElementClassifier
from sec_parser.processing_steps.image_classifier import ImageClassifier
from sec_parser.processing_steps.individual_semantic_element_extractor.individual_semantic_element_extractor import (
from sec_parser.processing_steps.individual_semantic_element_extractor.individual_semantic_element_extractor import ( # noqa: E501
IndividualSemanticElementExtractor,
)
from sec_parser.processing_steps.supplementary_text_classifier import (
Expand Down
2 changes: 1 addition & 1 deletion sec_parser/processing_steps/empty_element_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down
2 changes: 1 addition & 1 deletion sec_parser/processing_steps/highlighted_text_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down
2 changes: 1 addition & 1 deletion sec_parser/processing_steps/image_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import TYPE_CHECKING, Callable

from sec_parser.exceptions import SecParserValueError
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand All @@ -13,7 +13,7 @@
from sec_parser.semantic_elements.semantic_elements import NotYetClassifiedElement

if TYPE_CHECKING: # pragma: no cover
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import ( # noqa: E501
AbstractSingleElementCheck,
)
from sec_parser.semantic_elements.abstract_semantic_element import (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@ class AbstractSingleElementCheck(ABC):
def contains_single_element(self, element: AbstractSemanticElement) -> bool | None:
"""
Designed to work as a series of subsequent checks.
- Returning None means that the check is inconclusive, and the next check should be performed.
- Returning True means that no further checks are necessary, and the HTML element will be
- Returning None means that the check is inconclusive, and the next check should
be performed.
- Returning True means that no further checks are necessary, and the HTML
element will be
able to be converted later into a semantic element without any splits.
- Returning False means that the HTML element will be split into multiple semantic elements
- Returning False means that the HTML element will be split into multiple
semantic elements
of type NotYetClassifiedElement.
"""
raise NotImplementedError # pragma: no cover
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import ( # noqa: E501
AbstractSingleElementCheck,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import ( # noqa: E501
AbstractSingleElementCheck,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import ( # noqa: E501
AbstractSingleElementCheck,
)
from sec_parser.processing_steps.top_section_manager_for_10q import (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import (
from sec_parser.processing_steps.individual_semantic_element_extractor.single_element_checks.abstract_single_element_check import ( # noqa: E501
AbstractSingleElementCheck,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down
2 changes: 1 addition & 1 deletion sec_parser/processing_steps/page_header_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down
7 changes: 5 additions & 2 deletions sec_parser/processing_steps/page_number_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from enum import Enum, auto
from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down Expand Up @@ -95,7 +95,10 @@ def _classify_elements(
return element

element.processing_log.add_item(
message=f"Matches the most common (x{self._most_common_candidate_count}) candidate: {candidate}",
message = (f"Matches the most common (x"
f"{self._most_common_candidate_count}) "
f"candidate: {candidate}"
),
log_origin=self.__class__.__name__,
)
return PageNumberElement.create_from_element(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down
2 changes: 1 addition & 1 deletion sec_parser/processing_steps/table_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down
5 changes: 3 additions & 2 deletions sec_parser/processing_steps/table_of_contents_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand All @@ -18,7 +18,8 @@

class TableOfContentsClassifier(AbstractElementwiseProcessingStep):
"""
TableOfContentsClassifier class for converting elements into TableOfContentsElement instances.
TableOfContentsClassifier class for converting elements into TableOfContentsElement
instances.

This step scans through a list of semantic elements and changes it,
primarily by replacing suitable candidates with TableOfContentsElement instances.
Expand Down
2 changes: 1 addition & 1 deletion sec_parser/processing_steps/text_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down
24 changes: 16 additions & 8 deletions sec_parser/processing_steps/text_element_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import TYPE_CHECKING, cast

from sec_parser.processing_engine.html_tag import HtmlTag
from sec_parser.processing_steps.abstract_classes.abstract_element_batch_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_element_batch_processing_step import ( # noqa: E501
AbstractElementBatchProcessingStep,
)
from sec_parser.semantic_elements.abstract_semantic_element import (
Expand All @@ -28,9 +28,14 @@ class TextElementMerger(AbstractElementBatchProcessingStep):
into a single TextElement(<span></span><span></span>).

Intended to fix weird formatting artifacts, such as:
<ix:nonnumeric contextref="c-1" name="us-gaap:PropertyPlantAndEquipmentTextBlock" id="f-989" escape="true">
<span style="background-color:#ffffff;color:#000000;font-family:'Arial',sans-serif;font-size:10pt;font-weight:400;line-height:120%">Property and equipment, net, co</span>
<span style="color:#000000;font-family:'Arial',sans-serif;font-size:10pt;font-weight:400;line-height:120%">nsisted of the following (in millions):</span>
<ix:nonnumeric contextref="c-1"
name="us-gaap:PropertyPlantAndEquipmentTextBlock" id="f-989" escape="true">
<span style="background-color:#ffffff;color:#000000;font-family:'Arial',
sans-serif;font-size:10pt;font-weight:400;line-height:120%">Property and
equipment, net, co</span>
<span style="color:#000000;font-family:'Arial',sans-serif;font-size:10pt;
font-weight:400;line-height:120%">nsisted of the following (in millions):
</span>
</ix:nonnumeric>
Notice how the text is split into two spans, even though it's a single sentence.
Source: https://www.sec.gov/Archives/edgar/data/1652044/000165204423000094/goog-20230930.htm
Expand Down Expand Up @@ -77,13 +82,16 @@ def _merge(
[e.html_tag for e in elements],
)
merged_processing_log = elements[0].processing_log.copy()
# After merging, we retain the processing log of the first element and drop the logs of the others.
# This is because the merged text element now represents a single entity, and we want to avoid
# log duplication or confusion about which part of the merged text the logs refer to.
# After merging, we retain the processing log of the first element and drop the
# logs of the others. This is because the merged text element now represents a
# single entity, and we want to avoid log duplication or confusion about which
# part of the merged text the logs refer to.
dropped_logs = [e.processing_log for e in elements[1:]]
if any(dropped_logs):
merged_processing_log.add_item(
message="Merged multiple TextElements. Processing logs from subsequent elements are dropped.",
message = ("Merged multiple TextElements. "
"Processing logs from subsequent elements are dropped."
),
log_origin=cls.__name__,
)
return TextElement(
Expand Down
2 changes: 1 addition & 1 deletion sec_parser/processing_steps/title_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( # noqa: E501
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
Expand Down
Loading