diff --git a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
new file mode 100644
index 0000000..2658ae7
--- /dev/null
+++ b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs
@@ -0,0 +1,369 @@
+//
+// Copyright (c) Endjin Limited. All rights reserved.
+//
+
+using DocumentFormat.OpenXml;
+using DocumentFormat.OpenXml.Packaging;
+using DocumentFormat.OpenXml.Wordprocessing;
+using Rlm.Cli.Core.Documents;
+using Shouldly;
+using Spectre.IO.Testing;
+
+namespace Rlm.Cli.Tests.Core.Documents;
+
+[TestClass]
+public sealed class WordDocumentReaderTests
+{
+ private FakeFileSystem fakeFileSystem = null!; // fake file system handed to the reader; assigned in Setup
+ private WordDocumentReader reader = null!; // system under test; assigned in Setup
+ private string tempFilePath = null!; // real on-disk .docx path; assigned in Setup, deleted in Cleanup
+
+    [TestInitialize]
+    public void Setup()
+    {
+        // The reader checks for the file through IFileSystem, while the Open XML SDK opens it via
+        // System.IO. Each test therefore uses a real temp path that is also registered with the fake.
+        //
+        // Path.GetTempFileName() creates a zero-byte ".tmp" file as a side effect, so appending ".docx"
+        // to its result orphaned that file on every run. Build a unique name without creating anything.
+        tempFilePath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), System.IO.Path.GetRandomFileName() + ".docx");
+
+        // Fake file system on a Linux-style fake environment.
+        FakeEnvironment environment = FakeEnvironment.CreateLinuxEnvironment();
+        fakeFileSystem = new(environment);
+
+        // Register the path with the fake file system so the reader's existence check passes.
+        // This only creates the fake entry; tests that need real content write the file on disk
+        // themselves.
+        fakeFileSystem.CreateFile(tempFilePath);
+
+        reader = new(fakeFileSystem);
+    }
+
+    [TestCleanup]
+    public void Cleanup()
+    {
+        // Not every test writes the real file, so only delete it when it is actually there.
+        if (!File.Exists(tempFilePath)) { return; }
+
+        File.Delete(tempFilePath);
+    }
+
+ #region CanRead Tests
+
+    [TestMethod]
+    public void CanRead_DocxFile_ReturnsTrue()
+    {
+        // file: scheme with a lower-case ".docx" extension.
+        reader.CanRead(new Uri("file:///test/document.docx")).ShouldBeTrue();
+    }
+
+    [TestMethod]
+    public void CanRead_NonDocxFile_ReturnsFalse()
+    {
+        bool canRead = reader.CanRead(new Uri("file:///test/document.pdf")); // file: scheme, wrong extension
+        canRead.ShouldBeFalse();
+    }
+
+    [TestMethod]
+    public void CanRead_NonFileScheme_ReturnsFalse()
+    {
+        bool canRead = reader.CanRead(new Uri("http://example.com/document.docx")); // right extension, http scheme
+        canRead.ShouldBeFalse();
+    }
+
+    [TestMethod]
+    public void CanRead_DocxExtensionCaseInsensitive_ReturnsTrue()
+    {
+        // Same as the lower-case case, but with the extension written as ".DOCX".
+        reader.CanRead(new Uri("file:///test/document.DOCX")).ShouldBeTrue();
+    }
+
+ #endregion
+
+ #region ReadAsync Null/Error Path Tests
+
+    [TestMethod]
+    public async Task ReadAsync_NonDocxFile_ReturnsNull()
+    {
+        // Wrong extension: the reader is expected to hand back null rather than throw.
+        RlmDocument? document = await reader.ReadAsync(new Uri("file:///test/document.pdf"), TestContext.CancellationToken);
+        document.ShouldBeNull();
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_FileDoesNotExist_ReturnsNull()
+    {
+        // Right extension, but this path is never created by the test.
+        RlmDocument? document = await reader.ReadAsync(new Uri("file:///nonexistent/document.docx"), TestContext.CancellationToken);
+        document.ShouldBeNull();
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_CorruptedDocument_ReturnsNull()
+    {
+        // Plain text on disk under a .docx name: not a valid ZIP/Open XML package.
+        const string garbage = "not a valid docx";
+        File.WriteAllText(tempFilePath, garbage);
+
+        RlmDocument? document = await reader.ReadAsync(new Uri(tempFilePath), TestContext.CancellationToken);
+        document.ShouldBeNull();
+    }
+
+ #endregion
+
+ #region ReadAsync Metadata Tests
+
+    [TestMethod]
+    public async Task ReadAsync_DocumentWithMetadata_ExtractsProperties()
+    {
+        (string expectedTitle, string expectedAuthor) = ("Test Title", "Test Author");
+        CreateDocxWithMetadata(tempFilePath, title: expectedTitle, author: expectedAuthor);
+
+        RlmDocument? document = await reader.ReadAsync(new Uri(tempFilePath), TestContext.CancellationToken);
+
+        document.ShouldNotBeNull();
+        document.Id.ShouldBe(expectedTitle); // the title doubles as the document id
+        document.Metadata.Title.ShouldBe(expectedTitle);
+        document.Metadata.Author.ShouldBe(expectedAuthor);
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_DocumentWithoutTitle_UsesFilenameAsId()
+    {
+        // The headings fixture sets no package title, so the id has to come from somewhere else.
+        CreateDocxWithHeadings(tempFilePath);
+        Uri source = new(tempFilePath);
+        RlmDocument? document = await reader.ReadAsync(source, TestContext.CancellationToken);
+
+        document.ShouldNotBeNull();
+        document.Id.ShouldContain(".docx");
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_Document_CalculatesWordCount()
+    {
+        const string fiveWords = "one two three four five";
+        CreateDocxWithKnownContent(tempFilePath, fiveWords);
+
+        RlmDocument? document = await reader.ReadAsync(new Uri(tempFilePath), TestContext.CancellationToken);
+
+        document.ShouldNotBeNull();
+        document.Metadata.WordCount.ShouldBe(5);
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_Document_CalculatesReadingTime()
+    {
+        // 400 words at the 200-words-per-minute rate this expectation assumes gives 2 minutes.
+        const int wordCount = 400;
+        string content = string.Join(" ", Enumerable.Repeat("word", wordCount));
+        CreateDocxWithKnownContent(tempFilePath, content);
+
+        RlmDocument? document = await reader.ReadAsync(new Uri(tempFilePath), TestContext.CancellationToken);
+
+        document.ShouldNotBeNull();
+        document.Metadata.EstimatedReadingTimeMinutes.ShouldBe(2);
+    }
+
+ #endregion
+
+ #region ReadManyAsync Tests
+
+    [TestMethod]
+    public async Task ReadManyAsync_ValidDocument_YieldsOneDocument()
+    {
+        // A readable .docx is expected to surface as exactly one document in the stream.
+        CreateDocxWithHeadings(tempFilePath);
+        Uri uri = new(tempFilePath);
+        List<RlmDocument> documents = new(); // List requires its element type; a bare List() does not compile
+        await foreach (var doc in reader.ReadManyAsync(uri, null, TestContext.CancellationToken))
+        {
+            documents.Add(doc);
+        }
+
+        documents.Count.ShouldBe(1);
+    }
+
+    [TestMethod]
+    public async Task ReadManyAsync_InvalidDocument_YieldsNothing()
+    {
+        // This path is never created, so the stream is expected to be empty rather than throw.
+        Uri uri = new("file:///nonexistent.docx");
+        List<RlmDocument> documents = new(); // List requires its element type; a bare List() does not compile
+        await foreach (var doc in reader.ReadManyAsync(uri, null, TestContext.CancellationToken))
+        {
+            documents.Add(doc);
+        }
+
+        documents.ShouldBeEmpty();
+    }
+
+ #endregion
+
+ #region ReadAsync Structure Tests
+
+    [TestMethod]
+    public async Task ReadAsync_DocumentWithHeadings_PreservesStructure()
+    {
+        // Arrange: the fixture holds one Heading1 and one Heading2 paragraph, each followed by plain text.
+        CreateDocxWithHeadings(tempFilePath);
+        Uri source = new(tempFilePath);
+
+        // Act
+        RlmDocument? document = await reader.ReadAsync(source, TestContext.CancellationToken);
+
+        // Assert
+        document.ShouldNotBeNull();
+        string markdown = document.Content;
+
+        markdown.ShouldContain("# Heading Level 1");
+        markdown.ShouldContain("## Heading Level 2");
+
+        // Only the two styled paragraphs may turn into heading lines; plain paragraphs stay unprefixed.
+        int headingLines = markdown.Split('\n')
+            .Count(line => line.TrimStart().StartsWith('#'));
+        headingLines.ShouldBe(2);
+    }
+
+    [TestMethod]
+    public async Task ReadAsync_ComplexDocument_PreservesStructureCorrectly()
+    {
+        // Arrange
+        CreateComplexDocx(tempFilePath);
+
+        // Act
+        Uri source = new(tempFilePath);
+        RlmDocument? document = await reader.ReadAsync(source, TestContext.CancellationToken);
+
+        // Assert
+        document.ShouldNotBeNull();
+        string markdown = document.Content;
+
+        // Level 6 must still be rendered as a heading.
+        markdown.ShouldContain("###### Heading Level 6");
+
+        // Level 7 is out of range: the text is kept, but without a seven-hash prefix.
+        markdown.ShouldContain("Heading Level 7");
+        markdown.ShouldNotContain("####### Heading Level 7");
+
+        // Level 0 is out of range as well.
+        markdown.ShouldContain("Heading Level 0");
+        markdown.ShouldNotContain("# Heading Level 0");
+
+        // The style id is matched case-insensitively ("heading1" behaves like "Heading1").
+        markdown.ShouldContain("# Lowercase Heading 1");
+
+        // A heading-styled paragraph without any run still emits "# " followed by the line break.
+        markdown.ShouldContain("# " + Environment.NewLine);
+    }
+
+ #endregion
+
+ #region Helper Methods
+
+    private static void CreateDocxWithHeadings(string filePath)
+    {
+        // Fixture layout, in document order: a Heading1 paragraph, a plain paragraph,
+        // a Heading2 paragraph, and another plain paragraph. No package properties
+        // are set here.
+        using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
+        MainDocumentPart mainPart = doc.AddMainDocumentPart();
+        mainPart.Document = new Document();
+        Body body = mainPart.Document.AppendChild(new Body());
+
+        // Heading 1
+        body.AppendChild(
+            new Paragraph(
+                new ParagraphProperties(new ParagraphStyleId { Val = "Heading1" }),
+                new Run(new Text("Heading Level 1"))));
+
+        // Normal text
+        body.AppendChild(
+            new Paragraph(
+                new Run(new Text("Normal text under H1"))));
+
+        // Heading 2
+        body.AppendChild(
+            new Paragraph(
+                new ParagraphProperties(new ParagraphStyleId { Val = "Heading2" }),
+                new Run(new Text("Heading Level 2"))));
+
+        // More text
+        body.AppendChild(
+            new Paragraph(
+                new Run(new Text("More text under H2"))));
+
+        // Save explicitly before the using declaration disposes the package.
+        doc.Save();
+    }
+
+    private static void CreateComplexDocx(string filePath)
+    {
+        using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
+        MainDocumentPart mainPart = doc.AddMainDocumentPart();
+        mainPart.Document = new Document();
+        Body body = mainPart.Document.AppendChild(new Body());
+
+        // Boundary cases for heading detection as (paragraph text, style id) pairs: level 6,
+        // level 7, level 0, and a lower-cased "heading1" style id.
+        (string Content, string StyleId)[] styledParagraphs =
+        {
+            ("Heading Level 6", "Heading6"),
+            ("Heading Level 7", "Heading7"),
+            ("Heading Level 0", "Heading0"),
+            ("Lowercase Heading 1", "heading1"),
+        };
+
+        foreach ((string text, string styleId) in styledParagraphs)
+        {
+            ParagraphProperties properties = new()
+            {
+                ParagraphStyleId = new ParagraphStyleId { Val = styleId },
+            };
+            Paragraph styled = body.AppendChild(new Paragraph());
+            styled.AppendChild(properties);
+            styled.AppendChild(new Run()).AppendChild(new Text(text));
+        }
+
+        // Heading1-styled paragraph that deliberately gets no run, so it has no text at all.
+        Paragraph emptyHeading = body.AppendChild(new Paragraph());
+        emptyHeading.AppendChild(
+            new ParagraphProperties { ParagraphStyleId = new ParagraphStyleId { Val = "Heading1" } });
+
+        // Save explicitly before the using declaration disposes the package.
+        doc.Save();
+    }
+
+    private static void CreateDocxWithMetadata(string filePath, string? title, string? author)
+    {
+        using WordprocessingDocument package = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
+
+        // Package-level properties; the author is written to the Creator property.
+        package.PackageProperties.Title = title;
+        package.PackageProperties.Creator = author;
+
+        // Minimal body: a single paragraph with one run holding the word "Content".
+        MainDocumentPart mainPart = package.AddMainDocumentPart();
+        mainPart.Document = new Document();
+        Body body = mainPart.Document.AppendChild(new Body());
+        body.AppendChild(new Paragraph()).AppendChild(new Run()).AppendChild(new Text("Content"));
+        package.Save();
+    }
+
+    private static void CreateDocxWithKnownContent(string filePath, string content)
+    {
+        // Writes a document whose body is one paragraph with one run holding the given content.
+        using WordprocessingDocument package = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document);
+        MainDocumentPart mainPart = package.AddMainDocumentPart();
+        mainPart.Document = new Document();
+        Body body = mainPart.Document.AppendChild(new Body());
+        Run run = body.AppendChild(new Paragraph()).AppendChild(new Run());
+        run.AppendChild(new Text(content));
+        package.Save();
+    }
+
+ #endregion
+
+ public TestContext TestContext { get; set; } = null!; // never assigned in this file; presumably injected by MSTest (TODO confirm). Supplies the CancellationToken passed to the reader.
+}
\ No newline at end of file
diff --git a/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs b/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs
index f3a9c9f..57d020b 100644
--- a/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs
+++ b/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs
@@ -27,9 +27,6 @@ public sealed partial class SemanticChunker(
bool mergeSmall = false,
string? filterPattern = null) : IChunker
{
- [GeneratedRegex(@"^(#{1,6})\s+(.+)$", RegexOptions.Multiline)]
- private static partial Regex HeaderRegex();
-
public async IAsyncEnumerable ChunkAsync(
RlmDocument document,
[EnumeratorCancellation] CancellationToken cancellationToken = default)
diff --git a/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs b/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs
index a45e918..a0b63be 100644
--- a/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs
+++ b/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs
@@ -73,7 +73,24 @@ public async IAsyncEnumerable ReadManyAsync(
StringBuilder content = new();
foreach (Paragraph para in body.Elements())
{
- content.AppendLine(para.InnerText);
+ string paragraphText = para.InnerText;
+ string? styleId = para.ParagraphProperties?.ParagraphStyleId?.Val?.Value;
+
+ // Check if the paragraph has a style ID that indicates it is a heading.
+ // Standard Word heading styles are named "Heading1", "Heading2", etc.
+ // We extract the level number and prepend the corresponding number of '#' characters
+ // to convert it into a Markdown-style header. This allows downstream components
+ // (like the SemanticChunker) to understand the document structure.
+ if (!string.IsNullOrEmpty(styleId) &&
+ styleId.StartsWith("Heading", StringComparison.OrdinalIgnoreCase) &&
+ styleId.Length > 7 &&
+ int.TryParse(styleId.AsSpan(7), out int level) &&
+ level >= 1 && level <= 6)
+ {
+ content.Append(new string('#', level)).Append(' ');
+ }
+
+ content.AppendLine(paragraphText);
}
string text = content.ToString();