From 8feccb807fc5dc812c0b684b4b48cf27fa17267b Mon Sep 17 00:00:00 2001
From: Howard van Rooijen <Howard.vanRooijen@endjin.com>
Date: Thu, 29 Jan 2026 11:44:48 +0000
Subject: [PATCH 1/4] Enhance WordDocumentReader to preserve semantic structure

- Modified `WordDocumentReader` to detect "Heading1" through "Heading6" styles and prepend Markdown-style headers (`#` to `######`) to the text.
- Added code comments explaining the heading detection logic.
- Removed unused `HeaderRegex` from `SemanticChunker`.
- Added `WordDocumentReaderTests` to verify that document structure is preserved.

Fixes #6
---
 .../Core/Documents/WordDocumentReaderTests.cs | 112 ++++++++++++++++++
 .../Rlm.Cli/Core/Chunking/SemanticChunker.cs  |   3 -
 .../Core/Documents/WordDocumentReader.cs      |  19 ++-
 3 files changed, 130 insertions(+), 4 deletions(-)
 create mode 100644 Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs

diff --git a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
new file mode 100644
index 0000000..1e601af
--- /dev/null
+++ b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
@@ -0,0 +1,112 @@
+using DocumentFormat.OpenXml;
+using DocumentFormat.OpenXml.Packaging;
+using DocumentFormat.OpenXml.Wordprocessing;
+using Rlm.Cli.Core.Documents;
+using Shouldly;
+using Spectre.IO;
+using Spectre.IO.Testing;
+using System.Text;
+
+namespace Rlm.Cli.Tests.Core.Documents;
+
+[TestClass]
+public sealed class WordDocumentReaderTests
+{
+    private FakeFileSystem fakeFileSystem = null!;
+    private WordDocumentReader reader = null!;
+    private string tempFilePath = null!;
+
+    [TestInitialize]
+    public void Setup()
+    {
+        // Setup fake file system for the reader's dependency
+        // But note: WordDocumentReader uses System.IO to open the file via OpenXml SDK
+        // So we need to ensure we pass checks but use a real file for OpenXml
+        
+        // We will use a real file path for the test execution because OpenXml SDK works with real files
+        tempFilePath = System.IO.Path.GetTempFileName() + ".docx";
+        
+        // We need the fake file system to "know" about this file so CanRead/ReadAsync checks pass
+        FakeEnvironment environment = FakeEnvironment.CreateLinuxEnvironment();
+        fakeFileSystem = new(environment);
+        
+        // Populate fake file system so checks pass
+        // The reader implementation checks if file exists using IFileSystem
+        // So we must mock that
+        fakeFileSystem.CreateFile(tempFilePath);
+
+        reader = new(fakeFileSystem);
+    }
+
+    [TestCleanup]
+    public void Cleanup()
+    {
+        if (File.Exists(tempFilePath))
+        {
+            File.Delete(tempFilePath);
+        }
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_DocumentWithHeadings_PreservesStructure()
+    {
+        // Arrange
+        CreateDocxWithHeadings(tempFilePath);
+        Uri uri = new(tempFilePath);
+
+        // Act
+        RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken);
+
+        // Assert
+        document.ShouldNotBeNull();
+        
+        // This assertion is expected to FAIL currently
+        // We want to see:
+        // # Heading Level 1
+        // Normal text
+        // ## Heading Level 2
+        // More text
+        
+        string content = document.Content;
+        content.ShouldContain("# Heading Level 1");
+        content.ShouldContain("## Heading Level 2");
+    }
+
+    private static void CreateDocxWithHeadings(string filePath)
+    {
+        using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
+        MainDocumentPart mainPart = doc.AddMainDocumentPart();
+        mainPart.Document = new Document();
+        Body body = mainPart.Document.AppendChild(new Body());
+
+        // Heading 1
+        Paragraph p1 = body.AppendChild(new Paragraph());
+        ParagraphProperties pPr1 = new();
+        pPr1.ParagraphStyleId = new ParagraphStyleId() { Val = "Heading1" };
+        p1.AppendChild(pPr1);
+        Run r1 = p1.AppendChild(new Run());
+        r1.AppendChild(new Text("Heading Level 1"));
+
+        // Normal text
+        Paragraph p2 = body.AppendChild(new Paragraph());
+        Run r2 = p2.AppendChild(new Run());
+        r2.AppendChild(new Text("Normal text under H1"));
+
+        // Heading 2
+        Paragraph p3 = body.AppendChild(new Paragraph());
+        ParagraphProperties pPr3 = new();
+        pPr3.ParagraphStyleId = new ParagraphStyleId() { Val = "Heading2" };
+        p3.AppendChild(pPr3);
+        Run r3 = p3.AppendChild(new Run());
+        r3.AppendChild(new Text("Heading Level 2"));
+
+        // More text
+        Paragraph p4 = body.AppendChild(new Paragraph());
+        Run r4 = p4.AppendChild(new Run());
+        r4.AppendChild(new Text("More text under H2"));
+        
+        doc.Save();
+    }
+
+    public TestContext TestContext { get; set; }
+}
\ No newline at end of file
diff --git a/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs b/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs
index f3a9c9f..57d020b 100644
--- a/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs
+++ b/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs
@@ -27,9 +27,6 @@ public sealed partial class SemanticChunker(
     bool mergeSmall = false,
     string? filterPattern = null) : IChunker
 {
-    [GeneratedRegex(@"^(#{1,6})\s+(.+)$", RegexOptions.Multiline)]
-    private static partial Regex HeaderRegex();
-
     public async IAsyncEnumerable<ContentChunk> ChunkAsync(
         RlmDocument document,
         [EnumeratorCancellation] CancellationToken cancellationToken = default)
diff --git a/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs b/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs
index a45e918..a0b63be 100644
--- a/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs
+++ b/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs
@@ -73,7 +73,24 @@ public async IAsyncEnumerable<RlmDocument> ReadManyAsync(
             StringBuilder content = new();
             foreach (Paragraph para in body.Elements<Paragraph>())
             {
-                content.AppendLine(para.InnerText);
+                string paragraphText = para.InnerText;
+                string? styleId = para.ParagraphProperties?.ParagraphStyleId?.Val?.Value;
+
+                // Standard Word heading styles use the IDs "Heading1", "Heading2", etc.
+                // When the ID is "Heading" followed by a level from 1 to 6, prefix the paragraph
+                // with that many '#' characters so the text carries Markdown-style headers.
+                // The suffix is parsed as plain digits with the invariant culture, so signs,
+                // whitespace and the machine's regional settings can never change the outcome.
+                if (!string.IsNullOrEmpty(styleId) &&
+                    styleId.StartsWith("Heading", StringComparison.OrdinalIgnoreCase) &&
+                    int.TryParse(styleId.AsSpan(7), System.Globalization.NumberStyles.None,
+                        System.Globalization.CultureInfo.InvariantCulture, out int level) &&
+                    level is >= 1 and <= 6)
+                {
+                    content.Append('#', level).Append(' ');
+                }
+
+                content.AppendLine(paragraphText);
             }
 
             string text = content.ToString();

From e678b5097ce1e57f307a7a8921f48d25b2580442 Mon Sep 17 00:00:00 2001
From: Howard van Rooijen <Howard.vanRooijen@endjin.com>
Date: Thu, 29 Jan 2026 12:07:34 +0000
Subject: [PATCH 2/4] Add tests for WordDocumentReader to verify heading
 preservation in complex documents

---
 .../Core/Documents/WordDocumentReaderTests.cs | 84 +++++++++++++++++--
 1 file changed, 76 insertions(+), 8 deletions(-)

diff --git a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
index 1e601af..aac2626 100644
--- a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
+++ b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
@@ -1,11 +1,13 @@
+// <copyright file="WordDocumentReaderTests.cs" company="Endjin Limited">
+// Copyright (c) Endjin Limited. All rights reserved.
+// </copyright>
+
 using DocumentFormat.OpenXml;
 using DocumentFormat.OpenXml.Packaging;
 using DocumentFormat.OpenXml.Wordprocessing;
 using Rlm.Cli.Core.Documents;
 using Shouldly;
-using Spectre.IO;
 using Spectre.IO.Testing;
-using System.Text;
 
 namespace Rlm.Cli.Tests.Core.Documents;
 
@@ -60,18 +62,48 @@ public async Task ReadAsync_DocumentWithHeadings_PreservesStructure()
         // Assert
         document.ShouldNotBeNull();
         
-        // This assertion is expected to FAIL currently
-        // We want to see:
-        // # Heading Level 1
-        // Normal text
-        // ## Heading Level 2
-        // More text
+        // This test verifies that WordDocumentReader correctly detects headings
+        // and preserves them as Markdown-style headings in the document content.
         
         string content = document.Content;
         content.ShouldContain("# Heading Level 1");
         content.ShouldContain("## Heading Level 2");
     }
 
+    [TestMethod]
+    public async Task ReadAsync_ComplexDocument_PreservesStructureCorrectly()
+    {
+        // Arrange
+        CreateComplexDocx(tempFilePath);
+        Uri uri = new(tempFilePath);
+
+        // Act
+        RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken);
+
+        // Assert
+        document.ShouldNotBeNull();
+        string content = document.Content;
+        
+        // H6 should be detected
+        content.ShouldContain("###### Heading Level 6");
+        
+        // H7 should NOT be detected (treated as normal text)
+        content.ShouldContain("Heading Level 7");
+        content.ShouldNotContain("####### Heading Level 7");
+        
+        // Heading0 should NOT be detected
+        content.ShouldContain("Heading Level 0");
+        content.ShouldNotContain("# Heading Level 0");
+        
+        // Case insensitive check (heading1 vs Heading1)
+        content.ShouldContain("# Lowercase Heading 1");
+        
+        // Empty paragraph with heading style should be just the hashes? Or maybe handled gracefully?
+        // Implementation: content.Append(new string('#', level)).Append(' '); then append text.
+        // So empty paragraph -> "# \n"
+        content.ShouldContain("# \n");
+    }
+
     private static void CreateDocxWithHeadings(string filePath)
     {
         using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
@@ -108,5 +140,41 @@ private static void CreateDocxWithHeadings(string filePath)
         doc.Save();
     }
 
+    private static void CreateComplexDocx(string filePath)
+    {
+        using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
+        MainDocumentPart mainPart = doc.AddMainDocumentPart();
+        mainPart.Document = new Document();
+        Body body = mainPart.Document.AppendChild(new Body());
+
+        // Helper to add styled paragraph
+        void AddStyledPara(string text, string styleId)
+        {
+            Paragraph p = body.AppendChild(new Paragraph());
+            if (styleId != null)
+            {
+                ParagraphProperties pPr = new();
+                pPr.ParagraphStyleId = new ParagraphStyleId() { Val = styleId };
+                p.AppendChild(pPr);
+            }
+            Run r = p.AppendChild(new Run());
+            r.AppendChild(new Text(text));
+        }
+
+        AddStyledPara("Heading Level 6", "Heading6");
+        AddStyledPara("Heading Level 7", "Heading7");
+        AddStyledPara("Heading Level 0", "Heading0");
+        AddStyledPara("Lowercase Heading 1", "heading1");
+        
+        // Empty paragraph with Heading1 style
+        Paragraph pEmpty = body.AppendChild(new Paragraph());
+        ParagraphProperties pPrEmpty = new();
+        pPrEmpty.ParagraphStyleId = new ParagraphStyleId() { Val = "Heading1" };
+        pEmpty.AppendChild(pPrEmpty);
+        // No text run
+
+        doc.Save();
+    }
+
     public TestContext TestContext { get; set; }
 }
\ No newline at end of file

From d0269e462c124eee862680e40b3f1613479c9ba0 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Thu, 29 Jan 2026 12:38:42 +0000
Subject: [PATCH 3/4] Fix platform-dependent newline in WordDocumentReader test
 assertion (#8)

* Initial plan

* Fix platform-dependent newline assertion in WordDocumentReaderTests

Co-authored-by: HowardvanRooijen <128664+HowardvanRooijen@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: HowardvanRooijen <128664+HowardvanRooijen@users.noreply.github.com>
---
 .../Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
index aac2626..741d00e 100644
--- a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
+++ b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
@@ -100,8 +100,8 @@ public async Task ReadAsync_ComplexDocument_PreservesStructureCorrectly()
         
         // Empty paragraph with heading style should be just the hashes? Or maybe handled gracefully?
         // Implementation: content.Append(new string('#', level)).Append(' '); then append text.
-        // So empty paragraph -> "# \n"
-        content.ShouldContain("# \n");
+        // So empty paragraph -> "# " + Environment.NewLine
+        content.ShouldContain("# " + Environment.NewLine);
     }
 
     private static void CreateDocxWithHeadings(string filePath)

From 95b1a4b7094ea9c6b092a31e85bcb7aba8e93975 Mon Sep 17 00:00:00 2001
From: Howard van Rooijen <Howard.vanRooijen@endjin.com>
Date: Fri, 30 Jan 2026 06:55:33 +0000
Subject: [PATCH 4/4] Add comprehensive tests for WordDocumentReader
 functionality and metadata extraction

---
 .../Core/Documents/WordDocumentReaderTests.cs | 227 ++++++++++++++++--
 1 file changed, 208 insertions(+), 19 deletions(-)

diff --git a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
index 741d00e..2658ae7 100644
--- a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
+++ b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
@@ -24,14 +24,14 @@ public void Setup()
         // Setup fake file system for the reader's dependency
         // But note: WordDocumentReader uses System.IO to open the file via OpenXml SDK
         // So we need to ensure we pass checks but use a real file for OpenXml
-        
+
         // We will use a real file path for the test execution because OpenXml SDK works with real files
         tempFilePath = System.IO.Path.GetTempFileName() + ".docx";
-        
+
         // We need the fake file system to "know" about this file so CanRead/ReadAsync checks pass
         FakeEnvironment environment = FakeEnvironment.CreateLinuxEnvironment();
         fakeFileSystem = new(environment);
-        
+
         // Populate fake file system so checks pass
         // The reader implementation checks if file exists using IFileSystem
         // So we must mock that
@@ -49,6 +49,160 @@ public void Cleanup()
         }
     }
 
+    #region CanRead Tests
+
+    [TestMethod]
+    public void CanRead_DocxFile_ReturnsTrue()
+    {
+        Uri uri = new("file:///test/document.docx");
+        reader.CanRead(uri).ShouldBeTrue();
+    }
+
+    [TestMethod]
+    public void CanRead_NonDocxFile_ReturnsFalse()
+    {
+        Uri uri = new("file:///test/document.pdf");
+        reader.CanRead(uri).ShouldBeFalse();
+    }
+
+    [TestMethod]
+    public void CanRead_NonFileScheme_ReturnsFalse()
+    {
+        Uri uri = new("http://example.com/document.docx");
+        reader.CanRead(uri).ShouldBeFalse();
+    }
+
+    [TestMethod]
+    public void CanRead_DocxExtensionCaseInsensitive_ReturnsTrue()
+    {
+        Uri uri = new("file:///test/document.DOCX");
+        reader.CanRead(uri).ShouldBeTrue();
+    }
+
+    #endregion
+
+    #region ReadAsync Null/Error Path Tests
+
+    [TestMethod]
+    public async Task ReadAsync_NonDocxFile_ReturnsNull()
+    {
+        Uri uri = new("file:///test/document.pdf");
+        RlmDocument? result = await reader.ReadAsync(uri, TestContext.CancellationToken);
+        result.ShouldBeNull();
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_FileDoesNotExist_ReturnsNull()
+    {
+        Uri uri = new("file:///nonexistent/document.docx");
+        RlmDocument? result = await reader.ReadAsync(uri, TestContext.CancellationToken);
+        result.ShouldBeNull();
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_CorruptedDocument_ReturnsNull()
+    {
+        // Create an invalid docx file (just text, not a valid ZIP/OpenXml)
+        File.WriteAllText(tempFilePath, "not a valid docx");
+        Uri uri = new(tempFilePath);
+
+        RlmDocument? result = await reader.ReadAsync(uri, TestContext.CancellationToken);
+        result.ShouldBeNull();
+    }
+
+    #endregion
+
+    #region ReadAsync Metadata Tests
+
+    [TestMethod]
+    public async Task ReadAsync_DocumentWithMetadata_ExtractsProperties()
+    {
+        CreateDocxWithMetadata(tempFilePath, title: "Test Title", author: "Test Author");
+        Uri uri = new(tempFilePath);
+
+        RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken);
+
+        document.ShouldNotBeNull();
+        document.Id.ShouldBe("Test Title");
+        document.Metadata.Title.ShouldBe("Test Title");
+        document.Metadata.Author.ShouldBe("Test Author");
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_DocumentWithoutTitle_UsesFilenameAsId()
+    {
+        CreateDocxWithHeadings(tempFilePath);
+        Uri uri = new(tempFilePath);
+
+        RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken);
+
+        document.ShouldNotBeNull();
+        document.Id.ShouldContain(".docx");
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_Document_CalculatesWordCount()
+    {
+        CreateDocxWithKnownContent(tempFilePath, "one two three four five");
+        Uri uri = new(tempFilePath);
+
+        RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken);
+
+        document.ShouldNotBeNull();
+        document.Metadata.WordCount.ShouldBe(5);
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_Document_CalculatesReadingTime()
+    {
+        // 400 words are expected to be reported as 2 minutes (i.e. 200 words per minute)
+        string content = string.Join(" ", Enumerable.Repeat("word", 400));
+        CreateDocxWithKnownContent(tempFilePath, content);
+        Uri uri = new(tempFilePath);
+
+        RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken);
+
+        document.ShouldNotBeNull();
+        document.Metadata.EstimatedReadingTimeMinutes.ShouldBe(2);
+    }
+
+    #endregion
+
+    #region ReadManyAsync Tests
+
+    [TestMethod]
+    public async Task ReadManyAsync_ValidDocument_YieldsOneDocument()
+    {
+        CreateDocxWithHeadings(tempFilePath);
+        Uri uri = new(tempFilePath);
+
+        List<RlmDocument> documents = new();
+        await foreach (RlmDocument doc in reader.ReadManyAsync(uri, null, TestContext.CancellationToken))
+        {
+            documents.Add(doc);
+        }
+
+        documents.Count.ShouldBe(1);
+    }
+
+    [TestMethod]
+    public async Task ReadManyAsync_InvalidDocument_YieldsNothing()
+    {
+        Uri uri = new("file:///nonexistent.docx");
+
+        List<RlmDocument> documents = new();
+        await foreach (RlmDocument doc in reader.ReadManyAsync(uri, null, TestContext.CancellationToken))
+        {
+            documents.Add(doc);
+        }
+
+        documents.ShouldBeEmpty();
+    }
+
+    #endregion
+
+    #region ReadAsync Structure Tests
+
     [TestMethod]
     public async Task ReadAsync_DocumentWithHeadings_PreservesStructure()
     {
@@ -61,13 +215,15 @@ public async Task ReadAsync_DocumentWithHeadings_PreservesStructure()
 
         // Assert
         document.ShouldNotBeNull();
-        
-        // This test verifies that WordDocumentReader correctly detects headings
-        // and preserves them as Markdown-style headings in the document content.
-        
+
         string content = document.Content;
         content.ShouldContain("# Heading Level 1");
         content.ShouldContain("## Heading Level 2");
+
+        // Verify exact heading count: 1 H1 + 1 H2 = 2 headings total
+        string[] lines = content.Split('\n');
+        int headingCount = lines.Count(line => line.TrimStart().StartsWith('#'));
+        headingCount.ShouldBe(2);
     }
 
     [TestMethod]
@@ -83,27 +239,29 @@ public async Task ReadAsync_ComplexDocument_PreservesStructureCorrectly()
         // Assert
         document.ShouldNotBeNull();
         string content = document.Content;
-        
+
         // H6 should be detected
         content.ShouldContain("###### Heading Level 6");
-        
+
         // H7 should NOT be detected (treated as normal text)
         content.ShouldContain("Heading Level 7");
         content.ShouldNotContain("####### Heading Level 7");
-        
+
         // Heading0 should NOT be detected
         content.ShouldContain("Heading Level 0");
         content.ShouldNotContain("# Heading Level 0");
-        
+
         // Case insensitive check (heading1 vs Heading1)
         content.ShouldContain("# Lowercase Heading 1");
-        
-        // Empty paragraph with heading style should be just the hashes? Or maybe handled gracefully?
-        // Implementation: content.Append(new string('#', level)).Append(' '); then append text.
-        // So empty paragraph -> "# " + Environment.NewLine
+
+        // Empty paragraph with heading style produces "# " followed by newline
         content.ShouldContain("# " + Environment.NewLine);
     }
 
+    #endregion
+
+    #region Helper Methods
+
     private static void CreateDocxWithHeadings(string filePath)
     {
         using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
@@ -136,7 +294,7 @@ private static void CreateDocxWithHeadings(string filePath)
         Paragraph p4 = body.AppendChild(new Paragraph());
         Run r4 = p4.AppendChild(new Run());
         r4.AppendChild(new Text("More text under H2"));
-        
+
         doc.Save();
     }
 
@@ -148,7 +306,7 @@ private static void CreateComplexDocx(string filePath)
         Body body = mainPart.Document.AppendChild(new Body());
 
         // Helper to add styled paragraph
-        void AddStyledPara(string text, string styleId)
+        void AddStyledPara(string text, string? styleId)
         {
             Paragraph p = body.AppendChild(new Paragraph());
             if (styleId != null)
@@ -157,6 +315,7 @@ void AddStyledPara(string text, string styleId)
                 pPr.ParagraphStyleId = new ParagraphStyleId() { Val = styleId };
                 p.AppendChild(pPr);
             }
+
             Run r = p.AppendChild(new Run());
             r.AppendChild(new Text(text));
         }
@@ -165,7 +324,7 @@ void AddStyledPara(string text, string styleId)
         AddStyledPara("Heading Level 7", "Heading7");
         AddStyledPara("Heading Level 0", "Heading0");
         AddStyledPara("Lowercase Heading 1", "heading1");
-        
+
         // Empty paragraph with Heading1 style
         Paragraph pEmpty = body.AppendChild(new Paragraph());
         ParagraphProperties pPrEmpty = new();
@@ -176,5 +335,35 @@ void AddStyledPara(string text, string styleId)
         doc.Save();
     }
 
-    public TestContext TestContext { get; set; }
+    private static void CreateDocxWithMetadata(string filePath, string? title, string? author)
+    {
+        using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
+        doc.PackageProperties.Title = title;
+        doc.PackageProperties.Creator = author;
+
+        MainDocumentPart mainPart = doc.AddMainDocumentPart();
+        mainPart.Document = new Document();
+        Body body = mainPart.Document.AppendChild(new Body());
+
+        // Add minimal content
+        Paragraph p = body.AppendChild(new Paragraph());
+        p.AppendChild(new Run()).AppendChild(new Text("Content"));
+        doc.Save();
+    }
+
+    private static void CreateDocxWithKnownContent(string filePath, string content)
+    {
+        using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
+        MainDocumentPart mainPart = doc.AddMainDocumentPart();
+        mainPart.Document = new Document();
+        Body body = mainPart.Document.AppendChild(new Body());
+
+        Paragraph p = body.AppendChild(new Paragraph());
+        p.AppendChild(new Run()).AppendChild(new Text(content));
+        doc.Save();
+    }
+
+    #endregion
+
+    public TestContext TestContext { get; set; } = null!;
 }
\ No newline at end of file