From 8feccb807fc5dc812c0b684b4b48cf27fa17267b Mon Sep 17 00:00:00 2001 From: Howard van Rooijen Date: Thu, 29 Jan 2026 11:44:48 +0000 Subject: [PATCH 1/4] Enhance WordDocumentReader to preserve semantic structure - Modified `WordDocumentReader` to detect "Heading1" through "Heading6" styles and prepend Markdown-style headers (`#` to `######`) to the text. - Added code comments explaining the heading detection logic. - Removed unused `HeaderRegex` from `SemanticChunker`. - Added `WordDocumentReaderTests` to verify that document structure is preserved. Fixes #6 --- .../Core/Documents/WordDocumentReaderTests.cs | 112 ++++++++++++++++++ .../Rlm.Cli/Core/Chunking/SemanticChunker.cs | 3 - .../Core/Documents/WordDocumentReader.cs | 19 ++- 3 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs diff --git a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs new file mode 100644 index 0000000..1e601af --- /dev/null +++ b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs @@ -0,0 +1,112 @@ +using DocumentFormat.OpenXml; +using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml.Wordprocessing; +using Rlm.Cli.Core.Documents; +using Shouldly; +using Spectre.IO; +using Spectre.IO.Testing; +using System.Text; + +namespace Rlm.Cli.Tests.Core.Documents; + +[TestClass] +public sealed class WordDocumentReaderTests +{ + private FakeFileSystem fakeFileSystem = null!; + private WordDocumentReader reader = null!; + private string tempFilePath = null!; + + [TestInitialize] + public void Setup() + { + // Setup fake file system for the reader's dependency + // But note: WordDocumentReader uses System.IO to open the file via OpenXml SDK + // So we need to ensure we pass checks but use a real file for OpenXml + + // We will use a real file path for the test execution because OpenXml SDK works with real 
files + tempFilePath = System.IO.Path.GetTempFileName() + ".docx"; + + // We need the fake file system to "know" about this file so CanRead/ReadAsync checks pass + FakeEnvironment environment = FakeEnvironment.CreateLinuxEnvironment(); + fakeFileSystem = new(environment); + + // Populate fake file system so checks pass + // The reader implementation checks if file exists using IFileSystem + // So we must mock that + fakeFileSystem.CreateFile(tempFilePath); + + reader = new(fakeFileSystem); + } + + [TestCleanup] + public void Cleanup() + { + if (File.Exists(tempFilePath)) + { + File.Delete(tempFilePath); + } + } + + [TestMethod] + public async Task ReadAsync_DocumentWithHeadings_PreservesStructure() + { + // Arrange + CreateDocxWithHeadings(tempFilePath); + Uri uri = new(tempFilePath); + + // Act + RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken); + + // Assert + document.ShouldNotBeNull(); + + // This assertion is expected to FAIL currently + // We want to see: + // # Heading Level 1 + // Normal text + // ## Heading Level 2 + // More text + + string content = document.Content; + content.ShouldContain("# Heading Level 1"); + content.ShouldContain("## Heading Level 2"); + } + + private static void CreateDocxWithHeadings(string filePath) + { + using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document); + MainDocumentPart mainPart = doc.AddMainDocumentPart(); + mainPart.Document = new Document(); + Body body = mainPart.Document.AppendChild(new Body()); + + // Heading 1 + Paragraph p1 = body.AppendChild(new Paragraph()); + ParagraphProperties pPr1 = new(); + pPr1.ParagraphStyleId = new ParagraphStyleId() { Val = "Heading1" }; + p1.AppendChild(pPr1); + Run r1 = p1.AppendChild(new Run()); + r1.AppendChild(new Text("Heading Level 1")); + + // Normal text + Paragraph p2 = body.AppendChild(new Paragraph()); + Run r2 = p2.AppendChild(new Run()); + r2.AppendChild(new Text("Normal text under 
H1")); + + // Heading 2 + Paragraph p3 = body.AppendChild(new Paragraph()); + ParagraphProperties pPr3 = new(); + pPr3.ParagraphStyleId = new ParagraphStyleId() { Val = "Heading2" }; + p3.AppendChild(pPr3); + Run r3 = p3.AppendChild(new Run()); + r3.AppendChild(new Text("Heading Level 2")); + + // More text + Paragraph p4 = body.AppendChild(new Paragraph()); + Run r4 = p4.AppendChild(new Run()); + r4.AppendChild(new Text("More text under H2")); + + doc.Save(); + } + + public TestContext TestContext { get; set; } +} \ No newline at end of file diff --git a/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs b/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs index f3a9c9f..57d020b 100644 --- a/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs +++ b/Solutions/Rlm.Cli/Core/Chunking/SemanticChunker.cs @@ -27,9 +27,6 @@ public sealed partial class SemanticChunker( bool mergeSmall = false, string? filterPattern = null) : IChunker { - [GeneratedRegex(@"^(#{1,6})\s+(.+)$", RegexOptions.Multiline)] - private static partial Regex HeaderRegex(); - public async IAsyncEnumerable ChunkAsync( RlmDocument document, [EnumeratorCancellation] CancellationToken cancellationToken = default) diff --git a/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs b/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs index a45e918..a0b63be 100644 --- a/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs +++ b/Solutions/Rlm.Cli/Core/Documents/WordDocumentReader.cs @@ -73,7 +73,24 @@ public async IAsyncEnumerable<RlmDocument> ReadManyAsync( StringBuilder content = new(); foreach (Paragraph para in body.Elements<Paragraph>()) { - content.AppendLine(para.InnerText); + string paragraphText = para.InnerText; + string? styleId = para.ParagraphProperties?.ParagraphStyleId?.Val?.Value; + + // Check if the paragraph has a style ID that indicates it is a heading. + // Standard Word heading styles are named "Heading1", "Heading2", etc. 
+ // We extract the level number and prepend the corresponding number of '#' characters + // to convert it into a Markdown-style header. This allows downstream components + // (like the SemanticChunker) to understand the document structure. + if (!string.IsNullOrEmpty(styleId) && + styleId.StartsWith("Heading", StringComparison.OrdinalIgnoreCase) && + styleId.Length > 7 && + int.TryParse(styleId.AsSpan(7), out int level) && + level >= 1 && level <= 6) + { + content.Append(new string('#', level)).Append(' '); + } + + content.AppendLine(paragraphText); } string text = content.ToString(); From e678b5097ce1e57f307a7a8921f48d25b2580442 Mon Sep 17 00:00:00 2001 From: Howard van Rooijen Date: Thu, 29 Jan 2026 12:07:34 +0000 Subject: [PATCH 2/4] Add tests for WordDocumentReader to verify heading preservation in complex documents --- .../Core/Documents/WordDocumentReaderTests.cs | 84 +++++++++++++++++-- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs index 1e601af..aac2626 100644 --- a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs +++ b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs @@ -1,11 +1,13 @@ +// +// Copyright (c) Endjin Limited. All rights reserved. 
+// + using DocumentFormat.OpenXml; using DocumentFormat.OpenXml.Packaging; using DocumentFormat.OpenXml.Wordprocessing; using Rlm.Cli.Core.Documents; using Shouldly; -using Spectre.IO; using Spectre.IO.Testing; -using System.Text; namespace Rlm.Cli.Tests.Core.Documents; @@ -60,18 +62,48 @@ public async Task ReadAsync_DocumentWithHeadings_PreservesStructure() // Assert document.ShouldNotBeNull(); - // This assertion is expected to FAIL currently - // We want to see: - // # Heading Level 1 - // Normal text - // ## Heading Level 2 - // More text + // This test verifies that WordDocumentReader correctly detects headings + // and preserves them as Markdown-style headings in the document content. string content = document.Content; content.ShouldContain("# Heading Level 1"); content.ShouldContain("## Heading Level 2"); } + [TestMethod] + public async Task ReadAsync_ComplexDocument_PreservesStructureCorrectly() + { + // Arrange + CreateComplexDocx(tempFilePath); + Uri uri = new(tempFilePath); + + // Act + RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken); + + // Assert + document.ShouldNotBeNull(); + string content = document.Content; + + // H6 should be detected + content.ShouldContain("###### Heading Level 6"); + + // H7 should NOT be detected (treated as normal text) + content.ShouldContain("Heading Level 7"); + content.ShouldNotContain("####### Heading Level 7"); + + // Heading0 should NOT be detected + content.ShouldContain("Heading Level 0"); + content.ShouldNotContain("# Heading Level 0"); + + // Case insensitive check (heading1 vs Heading1) + content.ShouldContain("# Lowercase Heading 1"); + + // Empty paragraph with heading style should be just the hashes? Or maybe handled gracefully? + // Implementation: content.Append(new string('#', level)).Append(' '); then append text. 
+ // So empty paragraph -> "# \n" + content.ShouldContain("# \n"); + } + private static void CreateDocxWithHeadings(string filePath) { using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document); @@ -108,5 +140,41 @@ private static void CreateDocxWithHeadings(string filePath) doc.Save(); } + private static void CreateComplexDocx(string filePath) + { + using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document); + MainDocumentPart mainPart = doc.AddMainDocumentPart(); + mainPart.Document = new Document(); + Body body = mainPart.Document.AppendChild(new Body()); + + // Helper to add styled paragraph + void AddStyledPara(string text, string styleId) + { + Paragraph p = body.AppendChild(new Paragraph()); + if (styleId != null) + { + ParagraphProperties pPr = new(); + pPr.ParagraphStyleId = new ParagraphStyleId() { Val = styleId }; + p.AppendChild(pPr); + } + Run r = p.AppendChild(new Run()); + r.AppendChild(new Text(text)); + } + + AddStyledPara("Heading Level 6", "Heading6"); + AddStyledPara("Heading Level 7", "Heading7"); + AddStyledPara("Heading Level 0", "Heading0"); + AddStyledPara("Lowercase Heading 1", "heading1"); + + // Empty paragraph with Heading1 style + Paragraph pEmpty = body.AppendChild(new Paragraph()); + ParagraphProperties pPrEmpty = new(); + pPrEmpty.ParagraphStyleId = new ParagraphStyleId() { Val = "Heading1" }; + pEmpty.AppendChild(pPrEmpty); + // No text run + + doc.Save(); + } + public TestContext TestContext { get; set; } } \ No newline at end of file From d0269e462c124eee862680e40b3f1613479c9ba0 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Thu, 29 Jan 2026 12:38:42 +0000 Subject: [PATCH 3/4] Fix platform-dependent newline in WordDocumentReader test assertion (#8) * Initial plan * Fix platform-dependent newline assertion in WordDocumentReaderTests Co-authored-by: HowardvanRooijen 
<128664+HowardvanRooijen@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: HowardvanRooijen <128664+HowardvanRooijen@users.noreply.github.com> --- .../Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs index aac2626..741d00e 100644 --- a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs +++ b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs @@ -100,8 +100,8 @@ public async Task ReadAsync_ComplexDocument_PreservesStructureCorrectly() // Empty paragraph with heading style should be just the hashes? Or maybe handled gracefully? // Implementation: content.Append(new string('#', level)).Append(' '); then append text. - // So empty paragraph -> "# \n" - content.ShouldContain("# \n"); + // So empty paragraph -> "# " + Environment.NewLine + content.ShouldContain("# " + Environment.NewLine); } private static void CreateDocxWithHeadings(string filePath) From 95b1a4b7094ea9c6b092a31e85bcb7aba8e93975 Mon Sep 17 00:00:00 2001 From: Howard van Rooijen Date: Fri, 30 Jan 2026 06:55:33 +0000 Subject: [PATCH 4/4] Add comprehensive tests for WordDocumentReader functionality and metadata extraction --- .../Core/Documents/WordDocumentReaderTests.cs | 227 ++++++++++++++++-- 1 file changed, 208 insertions(+), 19 deletions(-) diff --git a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs index 741d00e..2658ae7 100644 --- a/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs +++ b/Solutions/Rlm.Cli.Tests/Core/Documents/WordDocumentReaderTests.cs @@ -24,14 +24,14 @@ public void Setup() // Setup fake file system for the reader's dependency // But note: WordDocumentReader 
uses System.IO to open the file via OpenXml SDK // So we need to ensure we pass checks but use a real file for OpenXml - + // We will use a real file path for the test execution because OpenXml SDK works with real files tempFilePath = System.IO.Path.GetTempFileName() + ".docx"; - + // We need the fake file system to "know" about this file so CanRead/ReadAsync checks pass FakeEnvironment environment = FakeEnvironment.CreateLinuxEnvironment(); fakeFileSystem = new(environment); - + // Populate fake file system so checks pass // The reader implementation checks if file exists using IFileSystem // So we must mock that @@ -49,6 +49,160 @@ public void Cleanup() } } + #region CanRead Tests + + [TestMethod] + public void CanRead_DocxFile_ReturnsTrue() + { + Uri uri = new("file:///test/document.docx"); + reader.CanRead(uri).ShouldBeTrue(); + } + + [TestMethod] + public void CanRead_NonDocxFile_ReturnsFalse() + { + Uri uri = new("file:///test/document.pdf"); + reader.CanRead(uri).ShouldBeFalse(); + } + + [TestMethod] + public void CanRead_NonFileScheme_ReturnsFalse() + { + Uri uri = new("http://example.com/document.docx"); + reader.CanRead(uri).ShouldBeFalse(); + } + + [TestMethod] + public void CanRead_DocxExtensionCaseInsensitive_ReturnsTrue() + { + Uri uri = new("file:///test/document.DOCX"); + reader.CanRead(uri).ShouldBeTrue(); + } + + #endregion + + #region ReadAsync Null/Error Path Tests + + [TestMethod] + public async Task ReadAsync_NonDocxFile_ReturnsNull() + { + Uri uri = new("file:///test/document.pdf"); + RlmDocument? result = await reader.ReadAsync(uri, TestContext.CancellationToken); + result.ShouldBeNull(); + } + + [TestMethod] + public async Task ReadAsync_FileDoesNotExist_ReturnsNull() + { + Uri uri = new("file:///nonexistent/document.docx"); + RlmDocument? 
result = await reader.ReadAsync(uri, TestContext.CancellationToken); + result.ShouldBeNull(); + } + + [TestMethod] + public async Task ReadAsync_CorruptedDocument_ReturnsNull() + { + // Create an invalid docx file (just text, not a valid ZIP/OpenXml) + File.WriteAllText(tempFilePath, "not a valid docx"); + Uri uri = new(tempFilePath); + + RlmDocument? result = await reader.ReadAsync(uri, TestContext.CancellationToken); + result.ShouldBeNull(); + } + + #endregion + + #region ReadAsync Metadata Tests + + [TestMethod] + public async Task ReadAsync_DocumentWithMetadata_ExtractsProperties() + { + CreateDocxWithMetadata(tempFilePath, title: "Test Title", author: "Test Author"); + Uri uri = new(tempFilePath); + + RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken); + + document.ShouldNotBeNull(); + document.Id.ShouldBe("Test Title"); + document.Metadata.Title.ShouldBe("Test Title"); + document.Metadata.Author.ShouldBe("Test Author"); + } + + [TestMethod] + public async Task ReadAsync_DocumentWithoutTitle_UsesFilenameAsId() + { + CreateDocxWithHeadings(tempFilePath); + Uri uri = new(tempFilePath); + + RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken); + + document.ShouldNotBeNull(); + document.Id.ShouldContain(".docx"); + } + + [TestMethod] + public async Task ReadAsync_Document_CalculatesWordCount() + { + CreateDocxWithKnownContent(tempFilePath, "one two three four five"); + Uri uri = new(tempFilePath); + + RlmDocument? document = await reader.ReadAsync(uri, TestContext.CancellationToken); + + document.ShouldNotBeNull(); + document.Metadata.WordCount.ShouldBe(5); + } + + [TestMethod] + public async Task ReadAsync_Document_CalculatesReadingTime() + { + // 400 words = 2 minutes reading time (200 words per minute) + string content = string.Join(" ", Enumerable.Range(1, 400).Select(i => "word")); + CreateDocxWithKnownContent(tempFilePath, content); + Uri uri = new(tempFilePath); + + RlmDocument? 
document = await reader.ReadAsync(uri, TestContext.CancellationToken); + + document.ShouldNotBeNull(); + document.Metadata.EstimatedReadingTimeMinutes.ShouldBe(2); + } + + #endregion + + #region ReadManyAsync Tests + + [TestMethod] + public async Task ReadManyAsync_ValidDocument_YieldsOneDocument() + { + CreateDocxWithHeadings(tempFilePath); + Uri uri = new(tempFilePath); + + var documents = new List<RlmDocument>(); + await foreach (var doc in reader.ReadManyAsync(uri, null, TestContext.CancellationToken)) + { + documents.Add(doc); + } + + documents.Count.ShouldBe(1); + } + + [TestMethod] + public async Task ReadManyAsync_InvalidDocument_YieldsNothing() + { + Uri uri = new("file:///nonexistent.docx"); + + var documents = new List<RlmDocument>(); + await foreach (var doc in reader.ReadManyAsync(uri, null, TestContext.CancellationToken)) + { + documents.Add(doc); + } + + documents.ShouldBeEmpty(); + } + + #endregion + + #region ReadAsync Structure Tests + [TestMethod] public async Task ReadAsync_DocumentWithHeadings_PreservesStructure() { @@ -61,13 +215,15 @@ public async Task ReadAsync_DocumentWithHeadings_PreservesStructure() // Assert document.ShouldNotBeNull(); - - // This test verifies that WordDocumentReader correctly detects headings - // and preserves them as Markdown-style headings in the document content. 
- + string content = document.Content; content.ShouldContain("# Heading Level 1"); content.ShouldContain("## Heading Level 2"); + + // Verify exact heading count: 1 H1 + 1 H2 = 2 headings total + string[] lines = content.Split('\n'); + int headingCount = lines.Count(line => line.TrimStart().StartsWith('#')); + headingCount.ShouldBe(2); } [TestMethod] @@ -83,27 +239,29 @@ public async Task ReadAsync_ComplexDocument_PreservesStructureCorrectly() // Assert document.ShouldNotBeNull(); string content = document.Content; - + // H6 should be detected content.ShouldContain("###### Heading Level 6"); - + // H7 should NOT be detected (treated as normal text) content.ShouldContain("Heading Level 7"); content.ShouldNotContain("####### Heading Level 7"); - + // Heading0 should NOT be detected content.ShouldContain("Heading Level 0"); content.ShouldNotContain("# Heading Level 0"); - + // Case insensitive check (heading1 vs Heading1) content.ShouldContain("# Lowercase Heading 1"); - - // Empty paragraph with heading style should be just the hashes? Or maybe handled gracefully? - // Implementation: content.Append(new string('#', level)).Append(' '); then append text. 
- // So empty paragraph -> "# " + Environment.NewLine + + // Empty paragraph with heading style produces "# " followed by newline content.ShouldContain("# " + Environment.NewLine); } + #endregion + + #region Helper Methods + private static void CreateDocxWithHeadings(string filePath) { using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document); @@ -136,7 +294,7 @@ private static void CreateDocxWithHeadings(string filePath) Paragraph p4 = body.AppendChild(new Paragraph()); Run r4 = p4.AppendChild(new Run()); r4.AppendChild(new Text("More text under H2")); - + doc.Save(); } @@ -148,7 +306,7 @@ private static void CreateComplexDocx(string filePath) Body body = mainPart.Document.AppendChild(new Body()); // Helper to add styled paragraph - void AddStyledPara(string text, string styleId) + void AddStyledPara(string text, string? styleId) { Paragraph p = body.AppendChild(new Paragraph()); if (styleId != null) @@ -157,6 +315,7 @@ void AddStyledPara(string text, string styleId) pPr.ParagraphStyleId = new ParagraphStyleId() { Val = styleId }; p.AppendChild(pPr); } + Run r = p.AppendChild(new Run()); r.AppendChild(new Text(text)); } @@ -165,7 +324,7 @@ void AddStyledPara(string text, string styleId) AddStyledPara("Heading Level 7", "Heading7"); AddStyledPara("Heading Level 0", "Heading0"); AddStyledPara("Lowercase Heading 1", "heading1"); - + // Empty paragraph with Heading1 style Paragraph pEmpty = body.AppendChild(new Paragraph()); ParagraphProperties pPrEmpty = new(); @@ -176,5 +335,35 @@ void AddStyledPara(string text, string styleId) doc.Save(); } - public TestContext TestContext { get; set; } + private static void CreateDocxWithMetadata(string filePath, string? title, string? 
author) + { + using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document); + doc.PackageProperties.Title = title; + doc.PackageProperties.Creator = author; + + MainDocumentPart mainPart = doc.AddMainDocumentPart(); + mainPart.Document = new Document(); + Body body = mainPart.Document.AppendChild(new Body()); + + // Add minimal content + Paragraph p = body.AppendChild(new Paragraph()); + p.AppendChild(new Run()).AppendChild(new Text("Content")); + doc.Save(); + } + + private static void CreateDocxWithKnownContent(string filePath, string content) + { + using WordprocessingDocument doc = WordprocessingDocument.Create(filePath, WordprocessingDocumentType.Document); + MainDocumentPart mainPart = doc.AddMainDocumentPart(); + mainPart.Document = new Document(); + Body body = mainPart.Document.AppendChild(new Body()); + + Paragraph p = body.AppendChild(new Paragraph()); + p.AppendChild(new Run()).AppendChild(new Text(content)); + doc.Save(); + } + + #endregion + + public TestContext TestContext { get; set; } = null!; } \ No newline at end of file