Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def register_converters(markitdown: MarkItDown, **kwargs):

class RtfConverter(DocumentConverter):
"""
Converts an RTF file to in the simplest possible way.
Converts an RTF file in the simplest possible way.
"""

def accepts(
Expand All @@ -60,7 +60,7 @@ def convert(
stream_info: StreamInfo,
**kwargs: Any,
) -> DocumentConverterResult:
# Read the file stream into an str using hte provided charset encoding, or using the system default
# Read the file stream into a str using the provided charset encoding, or using the system default
encoding = stream_info.charset or locale.getpreferredencoding()
stream_data = file_stream.read().decode(encoding)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


def test_converter() -> None:
"""Tests the RTF converter dirctly."""
"""Tests the RTF converter directly."""
with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
converter = RtfConverter()
result = converter.convert(
Expand Down
6 changes: 3 additions & 3 deletions packages/markitdown/src/markitdown/_base_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def accepts(
"""
Return a quick determination on if the converter should attempt converting the document.
This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
In cases where the data is retrieved via HTTP, the `steam_info.url` might also be referenced to
In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
make a determination (e.g., special converters for Wikipedia, YouTube etc).
Finally, it is conceivable that the `stream_info.filename` might be used in cases
where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc)
Expand All @@ -71,7 +71,7 @@ def accepts(

Parameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, set)
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset)
- kwargs: Additional keyword arguments for the converter.

Returns:
Expand All @@ -92,7 +92,7 @@ def convert(

Parameters:
- file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, set)
- stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset)
- kwargs: Additional keyword arguments for the converter.

Returns:
Expand Down
2 changes: 1 addition & 1 deletion packages/markitdown/src/markitdown/_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class UnsupportedFormatException(MarkItDownException):

class FailedConversionAttempt(object):
"""
Represents an a single attempt to convert a file.
Represents a single attempt to convert a file.
"""

def __init__(self, converter: Any, exc_info: Optional[tuple] = None):
Expand Down
4 changes: 2 additions & 2 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ def _convert(
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._converters

# Add legaxy kwargs
# Add legacy kwargs
if stream_info is not None:
if stream_info.extension is not None:
_kwargs["file_extension"] = stream_info.extension
Expand Down Expand Up @@ -631,7 +631,7 @@ def _convert(
)

def register_page_converter(self, converter: DocumentConverter) -> None:
"""DEPRECATED: User register_converter instead."""
"""DEPRECATED: Use register_converter instead."""
warn(
"register_page_converter is deprecated. Use register_converter instead.",
DeprecationWarning,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def accepts(
def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
"""
Helper needed to determine which analysis features to use.
Certain document analysis features are not availiable for
Certain document analysis features are not available for
office filetypes (.xlsx, .pptx, .html, .docx)
"""
mimetype = (stream_info.mimetype or "").lower()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

class DocxConverter(HtmlConverter):
"""
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""

def __init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

class EpubConverter(HtmlConverter):
"""
Converts EPUB files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""

def __init__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def accepts(
finally:
file_stream.seek(cur_pos)

# Brue force, check if it's an Outlook file
# Brute force, check if it's an Outlook file
try:
if olefile is not None:
msg = olefile.OleFileIO(file_stream)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str
# Check for installed dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
"Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
"Speech transcription requires installing MarkItDown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
Expand Down
4 changes: 2 additions & 2 deletions packages/markitdown/tests/test_cli_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"


# Prepare CLI test vectors (remove vectors that require mockig the url)
# Prepare CLI test vectors (remove vectors that require mocking the url)
CLI_TEST_VECTORS: List[FileTestVector] = []
for test_vector in GENERAL_TEST_VECTORS:
if test_vector.url is not None:
Expand Down Expand Up @@ -96,7 +96,7 @@ def test_output_to_file(shared_tmp_dir, test_vector) -> None:

@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
"""Test that the CLI readds from stdin correctly."""
"""Test that the CLI reads from stdin correctly."""

test_input = b""
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
Expand Down
4 changes: 2 additions & 2 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def test_stream_info_operations() -> None:
**{keyword: f"{keyword}.2"}
)

# Make sure the targted attribute is updated
# Make sure the targeted attribute is updated
assert getattr(updated_stream_info, keyword) == f"{keyword}.2"

# Make sure the other attributes are unchanged
Expand All @@ -143,7 +143,7 @@ def test_stream_info_operations() -> None:
StreamInfo(**{keyword: f"{keyword}.2"})
)

# Make sure the targted attribute is updated
# Make sure the targeted attribute is updated
assert getattr(updated_stream_info, keyword) == f"{keyword}.2"

# Make sure the other attributes are unchanged
Expand Down