Merge pull request #5 from openize-com/muhammadumar-patch

muhammadumargroupdocs · web-flow · commit b41ebd90212e · 2025-07-09T19:45:47.000+05:00
Publishing the hotfix 25.6.1
diff --git a/packages/markitdown/README.md b/packages/markitdown/README.md
@@ -100,6 +100,47 @@ $env:CLAUDE_API_KEY = "your-claude-key"
 $env:GEMINI_API_KEY = "your-gemini-key"
 $env:MISTRAL_API_KEY = "your-mistral-key"
 ```
+## Running Tests
+
+To run unit tests for **Openize.MarkItDown**, follow these steps:
+
+### 1. Navigate to the package directory
+
+From the root of the repository, change into the package directory:
+
+```bash
+cd packages/markitdown
+```
+
+### 2. Install test dependencies
+
+Make sure `pytest` and `pytest-mock` are installed:
+
+```bash
+pip install pytest pytest-mock
+```
+
+### 3. Run tests using `pytest`
+
+To run all tests:
+
+```bash
+pytest
+```
+
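+To run a specific test file (pass `tests/test.py` explicitly: its name does not match pytest's default `test_*.py` / `*_test.py` discovery patterns, so a bare `pytest` will not collect it):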
+
+```bash
+pytest tests/test.py
+```
+
+### Tip
+
+Use `-v` for more detailed test output:
+
+```bash
+pytest -v
+```
 
 ## License
 
diff --git a/packages/markitdown/pytest.ini b/packages/markitdown/pytest.ini
@@ -1,2 +1,2 @@
 [pytest]
-pythonpath = packages/markitdown/src
+pythonpath = src
diff --git a/packages/markitdown/setup.cfg b/packages/markitdown/setup.cfg
@@ -1,7 +1,7 @@
 
 [metadata]
 name = openize-markitdown-python
-version = 25.6.0
+version = 25.6.1
 
 author = Openize
 author_email = packages@openize.com
diff --git a/packages/markitdown/src/openize/markitdown/__init__.py b/packages/markitdown/src/openize/markitdown/__init__.py
@@ -8,7 +8,7 @@
 from .processor import DocumentProcessor
 from .converters import WordConverter, PDFConverter, ExcelConverter, PowerPointConverter
 from .factory import ConverterFactory
-from .llm_strategy import SaveLocally, InsertIntoLLM
+from .llm_strategy import SaveLocally, LLMFactory
 from .license_manager import LicenseManager
 
 __all__ = [
@@ -19,6 +19,6 @@
     'PowerPointConverter',
     'ConverterFactory',
     'SaveLocally',
-    'InsertIntoLLM',
+    'LLMFactory',
     'LicenseManager',
 ]
diff --git a/packages/markitdown/src/openize/markitdown/llm_strategy.py b/packages/markitdown/src/openize/markitdown/llm_strategy.py
@@ -1,5 +1,6 @@
 import logging
 import os
+from anthropic import Anthropic
 from abc import ABC, abstractmethod
 import openai
 
@@ -55,32 +56,33 @@ def process(self, md_file):
 class ClaudeClient(LLMStrategy):
     def __init__(self):
         self.api_key = os.getenv("CLAUDE_API_KEY")
-        self.model = os.getenv("CLAUDE_MODEL", "claude-v1")
+        self.model = os.getenv("CLAUDE_MODEL", "claude-3-opus-20240229")  # update to your model
 
         if not self.api_key:
             raise ValueError("Missing Claude API key. Please set it in the environment.")
 
-        # Initialize Claude client here (replace with actual SDK code)
-        # self.client = ClaudeAPIClient(api_key=self.api_key)
+        self.client = Anthropic(api_key=self.api_key)
 
     def process(self, md_file):
         try:
             with open(md_file, "r", encoding="utf-8") as file:
                 content = file.read()
 
-            # Replace with actual Claude API call
-            # response = self.client.complete(prompt=content, model=self.model)
-
-            # Dummy placeholder response
-            response_text = f"Simulated Claude response for {md_file}"
+            response = self.client.messages.create(
+                model=self.model,
+                max_tokens=1024,
+                messages=[
+                    {"role": "user", "content": content}
+                ]
+            )
 
-            logging.info(f"Claude Response for {md_file}: {response_text}")
+            message = response.content[0].text if response.content else ""
+            logging.info(f"Claude Response for {md_file}: {message}")
 
         except FileNotFoundError:
             logging.error(f"Markdown file not found: {md_file}")
         except Exception as e:
             logging.exception(f"Unexpected error processing {md_file}: {e}")
-
 class GeminiClient(LLMStrategy):
     def __init__(self):
         self.api_key = os.getenv("GEMINI_API_KEY")
diff --git a/packages/markitdown/src/openize/markitdown/processor.py b/packages/markitdown/src/openize/markitdown/processor.py
@@ -1,13 +1,14 @@
 import logging
 from pathlib import Path
 from factory import ConverterFactory
-from llm_strategy import SaveLocally, InsertIntoLLM
+from llm_strategy import SaveLocally, LLMFactory
 
 
 class DocumentProcessor:
-    def __init__(self, output_dir=Path("converted_md")):
+    def __init__(self, output_dir=Path("converted_md"), llm_client_name="openai"):
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.llm_client_name = llm_client_name
 
     def process_document(self, file_path, insert_into_llm=False):
         file_path = Path(file_path)
@@ -20,6 +21,12 @@ def process_document(self, file_path, insert_into_llm=False):
 
         md_file = converter.convert_to_md(file_path, self.output_dir)
         if md_file:
-            strategy = InsertIntoLLM() if insert_into_llm else SaveLocally()
-            strategy.process(md_file)
-
+            try:
+                strategy = (
+                    LLMFactory.get_llm(self.llm_client_name)
+                    if insert_into_llm
+                    else SaveLocally()
+                )
+                strategy.process(md_file)
+            except ValueError as e:
+                logging.error(f"Failed to initialize strategy: {e}")
diff --git a/packages/markitdown/test_input/sample.docx b/packages/markitdown/test_input/sample.docx
diff --git a/packages/markitdown/test_output/sample.001.png b/packages/markitdown/test_output/sample.001.png
diff --git a/packages/markitdown/test_output/sample.md b/packages/markitdown/test_output/sample.md
@@ -0,0 +1,3 @@
+# Sample Markdown File
+
+This is a test.
diff --git a/packages/markitdown/tests/test.py b/packages/markitdown/tests/test.py
@@ -2,10 +2,10 @@
 from pathlib import Path
 import os
 
-from ..src.openize.markitdown.converters import WordConverter, PDFConverter, ExcelConverter, PowerPointConverter
-from ..src.openize.markitdown.factory import ConverterFactory
-from ..src.openize.markitdown.llm_strategy import SaveLocally, LLMFactory, OpenAIClient, ClaudeClient
-from ..src.openize.markitdown.processor import DocumentProcessor
+from openize.markitdown.converters import WordConverter, PDFConverter, ExcelConverter, PowerPointConverter
+from openize.markitdown.factory import ConverterFactory
+from openize.markitdown.llm_strategy import SaveLocally, LLMFactory, OpenAIClient, ClaudeClient,MistralClient, GeminiClient
+from openize.markitdown.processor import DocumentProcessor
 
 
 @pytest.fixture
@@ -60,14 +60,14 @@ def test_insert_into_llm_openai(mocker, sample_md_file):
     mocker.patch("openai.ChatCompletion.create", return_value={
         "choices": [{"message": {"content": "Mocked OpenAI Response"}}]
     })
-    strategy = OpenAIClient(provider="openai")
+    strategy = OpenAIClient()
     strategy.process(sample_md_file)
 
 def test_insert_into_llm_claude(mocker, sample_md_file):
     mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic")
     mock_client = mock_anthropic.return_value
     mock_client.messages.create.return_value.content = "Mocked Claude Response"
-    strategy = ClaudeClient(provider="claude")
+    strategy = ClaudeClient()
     strategy.process(sample_md_file)
 
 
@@ -76,7 +76,7 @@ def test_insert_into_llm_claude(mocker, sample_md_file):
 def test_document_processor_local_conversion(mocker, sample_output_dir):
     mock_converter = mocker.patch("openize.markitdown.factory.ConverterFactory.get_converter", return_value=WordConverter())
     processor = DocumentProcessor(output_dir=sample_output_dir)
-    processor.process_document("sample.docx", insert_into_llm=False)
+    processor.process_document("test_input/sample.docx", insert_into_llm=False)
     output_file = sample_output_dir / "sample.md"
     assert output_file.exists()
 
@@ -85,8 +85,8 @@ def test_document_processor_with_llm_openai(mocker, sample_output_dir):
     mocker.patch("openai.ChatCompletion.create", return_value={
         "choices": [{"message": {"content": "LLM Output"}}]
     })
-    processor = DocumentProcessor(output_dir=sample_output_dir)
-    processor.process_document("sample.docx", insert_into_llm=True, llm_provider="openai")
+    processor = DocumentProcessor(output_dir=sample_output_dir, llm_client_name="openai")
+    processor.process_document("test_input/sample.docx", insert_into_llm=True)
     output_file = sample_output_dir / "sample.md"
     assert output_file.exists()
 
@@ -95,8 +95,44 @@ def test_document_processor_with_llm_claude(mocker, sample_output_dir):
     mock_anthropic = mocker.patch("openize.markitdown.llm_strategy.Anthropic")
     mock_client = mock_anthropic.return_value
     mock_client.messages.create.return_value.content = "LLM Claude Output"
-    processor = DocumentProcessor(output_dir=sample_output_dir)
-    processor.process_document("sample.docx", insert_into_llm=True, llm_provider="claude")
+    processor = DocumentProcessor(output_dir=sample_output_dir, llm_client_name="claude")
+    processor.process_document("test_input/sample.docx", insert_into_llm=True)
     output_file = sample_output_dir / "sample.md"
     assert output_file.exists()
 
+def test_insert_into_llm_gemini(mocker, sample_md_file):
+    mock_response = mocker.Mock()
+    mock_response.raise_for_status.return_value = None
+    mock_response.json.return_value = {
+        "candidates": [
+            {"content": {"parts": [{"text": "Mocked Gemini Response"}]}}
+        ]
+    }
+
+    mocker.patch("requests.post", return_value=mock_response)
+    mocker.patch.dict(os.environ, {
+        "GEMINI_API_KEY": "dummy_key",
+        "GEMINI_MODEL": "gemini-pro"
+    })
+
+    client = GeminiClient()
+    client.process(sample_md_file)
+def test_insert_into_llm_mistral(mocker, sample_md_file):
+    mock_response = mocker.Mock()
+    mock_response.raise_for_status.return_value = None
+    mock_response.json.return_value = {
+        "choices": [
+            {"message": {"content": "Mocked Mistral Response"}}
+        ]
+    }
+
+    mocker.patch("requests.post", return_value=mock_response)
+    mocker.patch.dict(os.environ, {
+        "MISTRAL_API_KEY": "dummy_key",
+        "MISTRAL_MODEL": "mistral-medium"
+    })
+
+    client = MistralClient()
+    client.process(sample_md_file)
+
+

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`[pytest]`
`2`		`-pythonpath = packages/markitdown/src`
	`2`	`+pythonpath = src`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Sample Markdown File`
	`2`	`+`
	`3`	`+This is a test.`