diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ccd5af..aa63ab0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,11 +6,11 @@ All notable changes to this project will be documented in this file. ### Fixed - **Move markup Word compatibility (Issue #96)** - Documents with move operations no longer cause Word "unreadable content" warnings - - Added `SimplifyMoveMarkup` setting to convert native move markup (`w:moveFrom`/`w:moveTo`) to simple `w:del`/`w:ins` - - Changed `DetectMoves` default from `true` to `false` until the underlying ID collision bug is fixed in Phase II - - Root cause identified: `FixUpRevMarkIds()` was overwriting IDs of `w:del`/`w:ins` after `FixUpRevisionIds()` had assigned unique IDs, causing collisions with move element IDs - - Users who want move detection with Word compatibility should set both `DetectMoves = true` and `SimplifyMoveMarkup = true` - - Trade-off: With `SimplifyMoveMarkup = true`, users lose the visual "moved" distinction (green double-underline) but get guaranteed Word compatibility + - Root cause: `FixUpRevMarkIds()` was overwriting IDs of `w:del`/`w:ins` after `FixUpRevisionIds()` had already assigned unique IDs, causing collisions with move element IDs + - Fix: Removed redundant `FixUpRevMarkIds()` call - `FixUpRevisionIds()` already handles all revision element IDs correctly + - Added `SimplifyMoveMarkup` setting to optionally convert move markup to simple `w:del`/`w:ins` if desired + - Added comprehensive ID uniqueness tests to prevent regression + - `DetectMoves` now defaults to `true` (move detection is safe to use) - **Footnote/endnote numbering** - Fixed footnotes and endnotes displaying raw XML IDs instead of sequential display numbers - Per ECMA-376, `w:id` is a reference identifier, not the display number - Added `FootnoteNumberingTracker` class to scan document and build XML ID → display number mapping diff --git a/CLAUDE.md b/CLAUDE.md index 0ed021c..4d13e3c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -139,17 
+139,12 @@ DocumentBuilder.BuildDocument(sources, outputPath); - `AuthorForRevisions` - Author name for tracked changes - `DetailThreshold` - 0.0-1.0, lower = more detailed comparison (default: 0.15) - `CaseInsensitive` - Case-insensitive comparison -- `DetectMoves` - Enable move detection in `GetRevisions()` (default: false - see warning below) -- `SimplifyMoveMarkup` - Convert move markup to del/ins for Word compatibility (default: false) +- `DetectMoves` - Enable move detection in `GetRevisions()` (default: true) +- `SimplifyMoveMarkup` - Convert move markup to del/ins (default: false) - `MoveSimilarityThreshold` - Jaccard similarity threshold for moves (default: 0.8) - `MoveMinimumWordCount` - Minimum words for move detection (default: 3) - `DetectFormatChanges` - Enable format change detection (default: true) -**WARNING: Move Detection Known Issue (Issue #96)** - Move markup can cause Word to display "unreadable content" warnings due to an ID collision bug. Until Phase II of the fix is complete: -- `DetectMoves` defaults to `false` to avoid the issue -- If you need move detection, set both `DetectMoves = true` AND `SimplifyMoveMarkup = true` -- With `SimplifyMoveMarkup = true`, moves are converted to regular del/ins (loses green move styling but ensures Word compatibility) - Move detection produces **native Word move markup** (`w:moveFrom`/`w:moveTo`) when `DetectMoves` is enabled: - The comparer analyzes deleted/inserted content blocks for similarity after LCS comparison - Matching pairs (≥80% Jaccard similarity by default) are converted to move markup diff --git a/Docxodus.Tests/WmlComparerMoveDetectionTests.cs b/Docxodus.Tests/WmlComparerMoveDetectionTests.cs index 97317b9..fce2808 100644 --- a/Docxodus.Tests/WmlComparerMoveDetectionTests.cs +++ b/Docxodus.Tests/WmlComparerMoveDetectionTests.cs @@ -8,6 +8,7 @@ using System.Xml.Linq; using DocumentFormat.OpenXml; using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml.Validation; using 
DocumentFormat.OpenXml.Wordprocessing; using Docxodus; using Xunit; @@ -1175,13 +1176,13 @@ public void SimplifyMoveMarkup_WhenFalse_ShouldPreserveMoveElements() } /// - /// Verifies that DetectMoves defaults to false for safety. + /// Verifies that DetectMoves defaults to true. /// [Fact] - public void DetectMoves_ShouldDefaultToFalse() + public void DetectMoves_ShouldDefaultToTrue() { var settings = new WmlComparerSettings(); - Assert.False(settings.DetectMoves, "DetectMoves should default to false until Phase II fix is complete"); + Assert.True(settings.DetectMoves, "DetectMoves should default to true"); } /// @@ -1195,5 +1196,413 @@ public void SimplifyMoveMarkup_ShouldDefaultToFalse() } #endregion + + #region ID Uniqueness Tests (Issue #96 Phase II) + + /// + /// Verifies that all revision IDs are unique across the document when moves are present. + /// This is the core test for Issue #96 - duplicate IDs cause Word "unreadable content" warnings. + /// + [Fact] + public void MoveMarkup_AllRevisionIdsShouldBeUnique() + { + // Arrange: Create documents with moved content + var doc1 = CreateDocumentWithParagraphs( + "This is paragraph A with enough words for move detection.", + "This is paragraph B with sufficient content here.", + "This is paragraph C that stays in place.", + "This is paragraph D with additional content." + ); + var doc2 = CreateDocumentWithParagraphs( + "This is paragraph B with sufficient content here.", + "This is paragraph A with enough words for move detection.", + "This is paragraph C that stays in place but modified slightly.", + "This is paragraph D with additional content." 
+ ); + + var settings = new WmlComparerSettings + { + DetectMoves = true, + SimplifyMoveMarkup = false, + MoveSimilarityThreshold = 0.8, + MoveMinimumWordCount = 3 + }; + + // Act + var compared = WmlComparer.Compare(doc1, doc2, settings); + + // Extract all revision IDs from all content parts + using var stream = new MemoryStream(compared.DocumentByteArray); + using var doc = WordprocessingDocument.Open(stream, false); + + var allIds = new List<string>(); + XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + var revisionElements = new[] { "ins", "del", "moveFrom", "moveTo", + "moveFromRangeStart", "moveFromRangeEnd", "moveToRangeStart", "moveToRangeEnd", "rPrChange" }; + + // Check main document + var mainXDoc = doc.MainDocumentPart.GetXDocument(); + foreach (var elemName in revisionElements) + { + allIds.AddRange(mainXDoc.Descendants(w + elemName) + .Select(e => e.Attribute(w + "id")?.Value) + .Where(id => id != null)); + } + + // Check footnotes if present + if (doc.MainDocumentPart.FootnotesPart != null) + { + var fnXDoc = doc.MainDocumentPart.FootnotesPart.GetXDocument(); + foreach (var elemName in revisionElements) + { + allIds.AddRange(fnXDoc.Descendants(w + elemName) + .Select(e => e.Attribute(w + "id")?.Value) + .Where(id => id != null)); + } + } + + // Check endnotes if present + if (doc.MainDocumentPart.EndnotesPart != null) + { + var enXDoc = doc.MainDocumentPart.EndnotesPart.GetXDocument(); + foreach (var elemName in revisionElements) + { + allIds.AddRange(enXDoc.Descendants(w + elemName) + .Select(e => e.Attribute(w + "id")?.Value) + .Where(id => id != null)); + } + } + + // Assert: no ID is used more than twice. + // For range elements, start and end share the same ID by design, so pairs are tolerated. + // NOTE: this also tolerates two unrelated elements sharing an ID; strict uniqueness of + // ins/del/moveFrom/moveTo is asserted in MoveMarkup_WithMixedChanges_ShouldHaveUniqueIds. + var duplicates = allIds.GroupBy(x => x) + .Where(g => g.Count() > 2) // Allow pairs (start/end) but not more + .Select(g => new { Id =
g.Key, Count = g.Count() }) + .ToList(); + + Assert.True(duplicates.Count == 0, + $"Found revision IDs used more than twice (only range pairs should share IDs): " + + $"{string.Join(", ", duplicates.Select(d => $"id={d.Id} count={d.Count}"))}"); + } + + /// + /// Verifies that move names properly pair moveFrom and moveTo elements. + /// Each move name should appear exactly once in moveFromRangeStart and once in moveToRangeStart. + /// Note: Consecutive paragraphs may be grouped as a single move block. + /// + [Fact] + public void MoveMarkup_MoveNamesShouldProperlyPairSourceAndDestination() + { + // Arrange: Create documents with moved content + var doc1 = CreateDocumentWithParagraphs( + "This is paragraph A with enough words for move detection.", + "This is paragraph B with sufficient content here." + ); + var doc2 = CreateDocumentWithParagraphs( + "This is paragraph B with sufficient content here.", + "This is paragraph A with enough words for move detection." + ); + + var settings = new WmlComparerSettings + { + DetectMoves = true, + SimplifyMoveMarkup = false, + MoveSimilarityThreshold = 0.8, + MoveMinimumWordCount = 3 + }; + + // Act + var compared = WmlComparer.Compare(doc1, doc2, settings); + + // Extract move names + using var stream = new MemoryStream(compared.DocumentByteArray); + using var doc = WordprocessingDocument.Open(stream, false); + + XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + var mainXDoc = doc.MainDocumentPart.GetXDocument(); + + var moveFromNames = mainXDoc.Descendants(w + "moveFromRangeStart") + .Select(e => e.Attribute(w + "name")?.Value) + .Where(n => n != null) + .ToList(); + + var moveToNames = mainXDoc.Descendants(w + "moveToRangeStart") + .Select(e => e.Attribute(w + "name")?.Value) + .Where(n => n != null) + .ToList(); + + // Assert: Should have at least one move detected + Assert.True(moveFromNames.Count > 0, "Expected at least one moveFromRangeStart with w:name"); + Assert.True(moveToNames.Count > 
0, "Expected at least one moveToRangeStart with w:name"); + + // Assert: moveFrom and moveTo names should match (same names, same count) + Assert.True(moveFromNames.OrderBy(x => x).SequenceEqual(moveToNames.OrderBy(x => x)), + $"Move names should match between moveFrom and moveTo. " + + $"From: [{string.Join(", ", moveFromNames)}], To: [{string.Join(", ", moveToNames)}]"); + + // Assert: No empty or null move names + Assert.DoesNotContain("", moveFromNames); + Assert.DoesNotContain("", moveToNames); + Assert.True(moveFromNames.All(n => n.StartsWith("move")), + "All move names should follow the 'moveN' pattern"); + } + + /// + /// Verifies that a document with moves and other changes has unique IDs. + /// This specifically tests the scenario that caused Issue #96. + /// + [Fact] + public void MoveMarkup_WithMixedChanges_ShouldHaveUniqueIds() + { + // Arrange: Create documents with moves AND other ins/del changes + var doc1 = CreateDocumentWithParagraphs( + "This paragraph will be moved to a new location.", + "This paragraph stays but will be modified here.", + "This paragraph will be deleted entirely from doc.", + "This is static content that does not change." + ); + var doc2 = CreateDocumentWithParagraphs( + "This paragraph stays but has been changed now.", + "This is static content that does not change.", + "This paragraph will be moved to a new location.", + "This is a completely new paragraph inserted." 
+ ); + + var settings = new WmlComparerSettings + { + DetectMoves = true, + SimplifyMoveMarkup = false, + MoveSimilarityThreshold = 0.8, + MoveMinimumWordCount = 3 + }; + + // Act + var compared = WmlComparer.Compare(doc1, doc2, settings); + + // Extract all revision IDs + using var stream = new MemoryStream(compared.DocumentByteArray); + using var doc = WordprocessingDocument.Open(stream, false); + + XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + var mainXDoc = doc.MainDocumentPart.GetXDocument(); + + // Get IDs from different element types + var insIds = mainXDoc.Descendants(w + "ins") + .Select(e => e.Attribute(w + "id")?.Value).Where(id => id != null).ToList(); + var delIds = mainXDoc.Descendants(w + "del") + .Select(e => e.Attribute(w + "id")?.Value).Where(id => id != null).ToList(); + var moveFromIds = mainXDoc.Descendants(w + "moveFrom") + .Select(e => e.Attribute(w + "id")?.Value).Where(id => id != null).ToList(); + var moveToIds = mainXDoc.Descendants(w + "moveTo") + .Select(e => e.Attribute(w + "id")?.Value).Where(id => id != null).ToList(); + + // Combine non-range IDs (these should all be unique) + var nonRangeIds = insIds.Concat(delIds).Concat(moveFromIds).Concat(moveToIds).ToList(); + + // Check for duplicates + var duplicates = nonRangeIds.GroupBy(x => x) + .Where(g => g.Count() > 1) + .ToList(); + + Assert.True(duplicates.Count == 0, + $"Found duplicate IDs among ins/del/moveFrom/moveTo elements: " + + $"{string.Join(", ", duplicates.Select(g => $"id={g.Key}"))}. " + + $"This is the Issue #96 bug - FixUpRevMarkIds was overwriting IDs."); + } + + #endregion + + #region Stress Tests (Issue #96) + + /// + /// Generates a paragraph with unique content for stress testing. + /// + private static string GenerateStressTestParagraph(int index) + { + var templates = new[] + { + "Paragraph {0}: This document section contains important information about the project requirements and specifications. 
Reference ID: {1}", + "Section {0}: The following content describes the technical implementation details for the proposed system architecture. Doc: {1}", + "Item {0}: According to the agreement dated herein, the parties shall comply with all terms and conditions specified. Contract: {1}", + "Clause {0}: The licensee agrees to use the software only for purposes permitted under this license agreement. License: {1}", + "Article {0}: This paragraph establishes the fundamental principles governing the relationship between the entities. Ref: {1}", + "Point {0}: The data processing activities shall be conducted in accordance with applicable privacy regulations. GDPR: {1}", + "Note {0}: All modifications to this document must be tracked and approved by the designated review committee. Rev: {1}", + "Entry {0}: The financial statements have been prepared in accordance with generally accepted accounting principles. GAAP: {1}", + "Record {0}: This memorandum summarizes the key decisions made during the executive committee meeting. Minutes: {1}", + "Statement {0}: The undersigned hereby certifies that all information provided is true and accurate. Cert: {1}", + "Provision {0}: Notwithstanding the foregoing, the obligations set forth herein shall survive termination. Legal: {1}", + "Stipulation {0}: The contractor shall deliver all work products by the specified deadline. Deadline: {1}", + "Requirement {0}: The system shall support concurrent users and maintain response times under load. Perf: {1}", + "Specification {0}: All API endpoints must implement proper authentication and authorization. Security: {1}", + "Definition {0}: For purposes of this agreement, the following terms shall have the meanings ascribed. 
Terms: {1}", + }; + + var template = templates[index % templates.Length]; + return string.Format(template, index, $"DOC-{index:D4}-{Guid.NewGuid().ToString().Substring(0, 8).ToUpper()}"); + } + + /// + /// Stress test for Issue #96: Validates that revision IDs remain unique even with + /// dozens of moves and up to ~100 other changes. The Random seed is fixed so the edit + /// positions are the same on every run; paragraph text embeds Guid.NewGuid() fragments, + /// so the exact document content still differs between runs. + /// + [Theory] + [InlineData(50, 15, 30, "Small")] // 50 paragraphs, ~15 moves, ~30 other changes + [InlineData(100, 25, 50, "Medium")] // 100 paragraphs, ~25 moves, ~50 other changes + [InlineData(200, 40, 100, "Large")] // 200 paragraphs, ~40 moves, ~100 other changes + public void StressTest_ManyMovesAndChanges_ShouldHaveUniqueIds( + int paragraphCount, int moveCount, int changeCount, string testName) + { + // Arrange: fixed seed so the random edit positions are the same on every run + var rng = new Random(42); + + // Generate original document with numbered paragraphs + var originalParagraphs = Enumerable.Range(1, paragraphCount) + .Select(i => GenerateStressTestParagraph(i)) + .ToList(); + + // Create modified version with moves and changes + var modifiedParagraphs = new List<string>(originalParagraphs); + + // Apply moves: pick random paragraphs and move them to new positions + var availableForMove = Enumerable.Range(0, modifiedParagraphs.Count).ToList(); + for (int i = 0; i < moveCount && availableForMove.Count > 2; i++) + { + int fromIdx = availableForMove[rng.Next(availableForMove.Count)]; + availableForMove.Remove(fromIdx); + + var para = modifiedParagraphs[fromIdx]; + modifiedParagraphs.RemoveAt(fromIdx); + + // Adjust available indices after removal + availableForMove = availableForMove.Select(x => x > fromIdx ? x - 1 : x).ToList(); + + int toIdx = rng.Next(modifiedParagraphs.Count + 1); + modifiedParagraphs.Insert(toIdx, para); + + // Adjust available indices after insertion + availableForMove = availableForMove.Select(x => x >= toIdx ?
x + 1 : x).ToList(); + } + + // Apply deletions + int deleteCount = changeCount / 3; + for (int i = 0; i < deleteCount && modifiedParagraphs.Count > paragraphCount / 2; i++) + { + int idx = rng.Next(modifiedParagraphs.Count); + modifiedParagraphs.RemoveAt(idx); + } + + // Apply insertions + int insertCount = changeCount / 3; + for (int i = 0; i < insertCount; i++) + { + int idx = rng.Next(modifiedParagraphs.Count + 1); + modifiedParagraphs.Insert(idx, $"[NEW-{i + 1}] This is a newly inserted paragraph with enough words to be meaningful. " + + $"It contains various content including technical terms, legal jargon, and general prose. " + + $"The purpose is to test the comparison engine with substantial insertions. Reference: INS-{Guid.NewGuid():N}"); + } + + // Apply modifications (change words in existing paragraphs) + int modifyCount = changeCount / 3; + for (int i = 0; i < modifyCount && modifiedParagraphs.Count > 0; i++) + { + int idx = rng.Next(modifiedParagraphs.Count); + var para = modifiedParagraphs[idx]; + para = para.Replace("paragraph", "section") + .Replace("content", "material") + .Replace("document", "file"); + if (!para.Contains("[MODIFIED]")) + { + para = "[MODIFIED] " + para; + } + modifiedParagraphs[idx] = para; + } + + // Create documents + var doc1 = CreateDocumentWithParagraphs(originalParagraphs.ToArray()); + var doc2 = CreateDocumentWithParagraphs(modifiedParagraphs.ToArray()); + + var settings = new WmlComparerSettings + { + DetectMoves = true, + SimplifyMoveMarkup = false, + MoveSimilarityThreshold = 0.75, + MoveMinimumWordCount = 5, + AuthorForRevisions = "StressTest" + }; + + // Act + var compared = WmlComparer.Compare(doc1, doc2, settings); + + // Assert: Analyze results + using var stream = new MemoryStream(compared.DocumentByteArray); + using var wDoc = WordprocessingDocument.Open(stream, false); + var mainXDoc = wDoc.MainDocumentPart.GetXDocument(); + + XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + + // 
Collect all revision IDs + var revisionElements = new[] { "ins", "del", "moveFrom", "moveTo", "rPrChange" }; + var allIds = new List<(string Id, string Type)>(); + + foreach (var elemName in revisionElements) + { + foreach (var elem in mainXDoc.Descendants(w + elemName)) + { + var id = elem.Attribute(w + "id")?.Value; + if (id != null) + { + allIds.Add((id, elemName)); + } + } + } + + // Check for duplicates - THE CRITICAL TEST + var duplicates = allIds.GroupBy(x => x.Id) + .Where(g => g.Count() > 1) + .ToList(); + + Assert.True(duplicates.Count == 0, + $"StressTest {testName}: Found {duplicates.Count} duplicate revision IDs. " + + $"First duplicates: {string.Join(", ", duplicates.Take(5).Select(d => $"id={d.Key}:[{string.Join(",", d.Select(x => x.Type))}]"))}. " + + $"Total elements: {allIds.Count}"); + + // Check move name pairing + var moveFromNames = mainXDoc.Descendants(w + "moveFromRangeStart") + .Select(e => e.Attribute(w + "name")?.Value) + .Where(n => !string.IsNullOrEmpty(n)) + .ToList(); + + var moveToNames = mainXDoc.Descendants(w + "moveToRangeStart") + .Select(e => e.Attribute(w + "name")?.Value) + .Where(n => !string.IsNullOrEmpty(n)) + .ToList(); + + // Validate all names are paired + var unpairedFrom = moveFromNames.Except(moveToNames).ToList(); + var unpairedTo = moveToNames.Except(moveFromNames).ToList(); + + Assert.True(!unpairedFrom.Any() && !unpairedTo.Any(), + $"StressTest {testName}: Unpaired move names found. 
" + + $"From without To: [{string.Join(", ", unpairedFrom)}], " + + $"To without From: [{string.Join(", ", unpairedTo)}]"); + + // OpenXML validation + var validator = new OpenXmlValidator(FileFormatVersions.Office2019); + var errors = validator.Validate(wDoc).ToList(); + + // Note: Some validation errors may be acceptable (e.g., missing optional parts) + // We focus on ensuring no critical structural errors + var criticalErrors = errors + .Where(e => e.ErrorType == DocumentFormat.OpenXml.Validation.ValidationErrorType.Schema) + .ToList(); + + Assert.True(criticalErrors.Count == 0, + $"StressTest {testName}: OpenXML schema validation failed with {criticalErrors.Count} errors. " + + $"First errors: {string.Join("; ", criticalErrors.Take(3).Select(e => e.Description))}"); + } + + #endregion } } diff --git a/Docxodus/WmlComparer.cs b/Docxodus/WmlComparer.cs index 2301a6f..c77309d 100644 --- a/Docxodus/WmlComparer.cs +++ b/Docxodus/WmlComparer.cs @@ -63,23 +63,17 @@ public class WmlComparerSettings public DirectoryInfo DebugTempFileDi; /// - /// Whether to detect and mark moved content in GetRevisions(). Default: false. + /// Whether to detect and mark moved content in GetRevisions(). Default: true. /// When enabled, deletion/insertion pairs with similar text are marked as moves /// using native w:moveFrom/w:moveTo markup. - /// - /// WARNING: Move markup can cause Word to display "unreadable content" warnings - /// due to a known ID collision bug (Issue #96). Until this is fixed in Phase II, - /// it is recommended to either keep this false, or set SimplifyMoveMarkup = true - /// when enabling move detection. /// - public bool DetectMoves = false; + public bool DetectMoves = true; /// /// When true, converts native move markup (w:moveFrom/w:moveTo) to simple - /// delete/insert markup (w:del/w:ins) after comparison. This ensures Word - /// compatibility at the cost of losing the visual "moved" distinction. + /// delete/insert markup (w:del/w:ins) after comparison. 
This trades the + /// visual "moved" distinction for simpler markup. /// - /// Use this setting when DetectMoves = true but Word compatibility is required. /// Default: false. /// public bool SimplifyMoveMarkup = false; @@ -1854,7 +1848,8 @@ private static WmlDocument ProduceDocumentWithTrackedRevisions(WmlComparerSettin wDocWithRevisions.MainDocumentPart.PutXDocument(); FixUpFootnotesEndnotesWithCustomMarkers(wDocWithRevisions); - FixUpRevMarkIds(wDocWithRevisions); + // Note: FixUpRevMarkIds was removed here - it was causing ID collisions with move + // elements (Issue #96). FixUpRevisionIds already handles all revision IDs properly. // Convert move markup to simple del/ins if requested (Issue #96 workaround) // This runs after all ID fixups to ensure proper conversion diff --git a/TestFiles/Issue96/.gitignore b/TestFiles/Issue96/.gitignore new file mode 100644 index 0000000..e050548 --- /dev/null +++ b/TestFiles/Issue96/.gitignore @@ -0,0 +1 @@ +*.docx diff --git a/TestFiles/Issue96/Issue96BugReproduction.cs b/TestFiles/Issue96/Issue96BugReproduction.cs new file mode 100644 index 0000000..2a6cba3 --- /dev/null +++ b/TestFiles/Issue96/Issue96BugReproduction.cs @@ -0,0 +1,253 @@ +// Issue #96 Bug Reproduction Test +// This specifically tests the scenario that caused the "unreadable content" warning: +// Move operations combined with regular ins/del that would have caused ID collisions + +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Xml.Linq; +using DocumentFormat.OpenXml; +using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml.Wordprocessing; +using DocumentFormat.OpenXml.Validation; +using Docxodus; + +class Issue96BugReproduction +{ + static XNamespace W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + + static void Main(string[] args) + { + Console.WriteLine("╔══════════════════════════════════════════════════════════════╗"); + Console.WriteLine("║ Issue #96 Bug 
Reproduction Test ║"); + Console.WriteLine("║ Move operations + ins/del that caused ID collisions ║"); + Console.WriteLine("╚══════════════════════════════════════════════════════════════╝\n"); + + var outputDir = Path.GetDirectoryName(typeof(Issue96BugReproduction).Assembly.Location) + ?? Directory.GetCurrentDirectory(); + + // Create documents that will produce BOTH moves AND regular ins/del + // This is the scenario that triggered the bug + // The first paragraph is IDENTICAL in both docs but at different positions (MOVE) + // The last paragraph is completely different (DEL + INS) + var doc1 = CreateDocument(new[] { + "The quick brown fox jumps over the lazy sleeping dog in the park today.", + "Static content that does not change at all in this document test.", + "This paragraph will be deleted and replaced with something new." + }); + + var doc2 = CreateDocument(new[] { + "Static content that does not change at all in this document test.", + "The quick brown fox jumps over the lazy sleeping dog in the park today.", + "This is a completely new paragraph that was inserted here instead." + }); + + Console.WriteLine("Document 1 (Original):"); + Console.WriteLine(" [1] The quick brown fox jumps over the lazy sleeping dog..."); + Console.WriteLine(" [2] Static content that does not change..."); + Console.WriteLine(" [3] This paragraph will be deleted and replaced..."); + Console.WriteLine(); + Console.WriteLine("Document 2 (Modified):"); + Console.WriteLine(" [1] Static content that does not change..."); + Console.WriteLine(" [2] The quick brown fox jumps over the lazy sleeping dog... (MOVED)"); + Console.WriteLine(" [3] This is a completely new paragraph... 
(MODIFIED)"); + Console.WriteLine(); + + var settings = new WmlComparerSettings + { + DetectMoves = true, + SimplifyMoveMarkup = false, + MoveSimilarityThreshold = 0.8, + MoveMinimumWordCount = 3, + AuthorForRevisions = "Issue96Test" + }; + + Console.WriteLine("Comparing with DetectMoves=true, SimplifyMoveMarkup=false...\n"); + + var compared = WmlComparer.Compare(doc1, doc2, settings); + + // Save output + var outputPath = Path.Combine(outputDir, "Issue96_BugRepro_Output.docx"); + File.WriteAllBytes(outputPath, compared.DocumentByteArray); + Console.WriteLine($"📄 Output saved: {outputPath}\n"); + + // Analyze the output + Console.WriteLine("═══════════════════════════════════════════════════════════════"); + Console.WriteLine(" VALIDATION RESULTS"); + Console.WriteLine("═══════════════════════════════════════════════════════════════\n"); + + using var stream = new MemoryStream(compared.DocumentByteArray); + using var wDoc = WordprocessingDocument.Open(stream, false); + var mainXDoc = wDoc.MainDocumentPart.GetXDocument(); + + // Count elements + var moveFromCount = mainXDoc.Descendants(W + "moveFrom").Count(); + var moveToCount = mainXDoc.Descendants(W + "moveTo").Count(); + var moveFromRangeStartCount = mainXDoc.Descendants(W + "moveFromRangeStart").Count(); + var moveFromRangeEndCount = mainXDoc.Descendants(W + "moveFromRangeEnd").Count(); + var moveToRangeStartCount = mainXDoc.Descendants(W + "moveToRangeStart").Count(); + var moveToRangeEndCount = mainXDoc.Descendants(W + "moveToRangeEnd").Count(); + var delCount = mainXDoc.Descendants(W + "del").Count(); + var insCount = mainXDoc.Descendants(W + "ins").Count(); + + Console.WriteLine("Move Elements:"); + Console.WriteLine($" • w:moveFrom: {moveFromCount}"); + Console.WriteLine($" • w:moveTo: {moveToCount}"); + Console.WriteLine($" • w:moveFromRangeStart:{moveFromRangeStartCount}"); + Console.WriteLine($" • w:moveFromRangeEnd: {moveFromRangeEndCount}"); + Console.WriteLine($" • w:moveToRangeStart: 
{moveToRangeStartCount}"); + Console.WriteLine($" • w:moveToRangeEnd: {moveToRangeEndCount}"); + Console.WriteLine(); + Console.WriteLine("Regular Revision Elements:"); + Console.WriteLine($" • w:del: {delCount}"); + Console.WriteLine($" • w:ins: {insCount}"); + Console.WriteLine(); + + // Collect all revision IDs + var allRevisionIds = new Dictionary<string, List<string>>(); + var revisionElements = new[] { "ins", "del", "moveFrom", "moveTo", "rPrChange" }; + + foreach (var elemName in revisionElements) + { + foreach (var elem in mainXDoc.Descendants(W + elemName)) + { + var id = elem.Attribute(W + "id")?.Value; + if (id != null) + { + if (!allRevisionIds.ContainsKey(id)) + allRevisionIds[id] = new List<string>(); + allRevisionIds[id].Add(elemName); + } + } + } + + Console.WriteLine("ID Analysis:"); + Console.WriteLine($" • Total unique IDs: {allRevisionIds.Count}"); + + // Check for the Issue #96 bug: duplicate IDs + var duplicates = allRevisionIds.Where(kvp => kvp.Value.Count > 1).ToList(); + if (duplicates.Count > 0) + { + Console.WriteLine(); + Console.WriteLine(" ❌ DUPLICATE IDs FOUND (Issue #96 BUG!):"); + foreach (var dup in duplicates) + { + Console.WriteLine($" ID={dup.Key} used by: {string.Join(", ", dup.Value)}"); + } + } + else + { + Console.WriteLine(" ✅ No duplicate IDs (Issue #96 is FIXED!)"); + } + + // Show move names + var moveNames = mainXDoc.Descendants(W + "moveFromRangeStart") + .Select(e => e.Attribute(W + "name")?.Value) + .Where(n => n != null) + .Distinct() + .ToList(); + + if (moveNames.Count > 0) + { + Console.WriteLine(); + Console.WriteLine("Move Name Linking:"); + foreach (var name in moveNames) + { + var fromCount = mainXDoc.Descendants(W + "moveFromRangeStart") + .Count(e => e.Attribute(W + "name")?.Value == name); + var toCount = mainXDoc.Descendants(W + "moveToRangeStart") + .Count(e => e.Attribute(W + "name")?.Value == name); + Console.WriteLine($" • '{name}': {fromCount} source(s), {toCount} destination(s)"); + } + } + + // Run OpenXML validation + 
Console.WriteLine(); + Console.WriteLine("OpenXML Validation:"); + var validator = new OpenXmlValidator(FileFormatVersions.Office2019); + var errors = validator.Validate(wDoc).ToList(); + if (errors.Count == 0) + { + Console.WriteLine(" ✅ Document is valid per OpenXML schema"); + } + else + { + Console.WriteLine($" ⚠️ {errors.Count} validation issue(s):"); + foreach (var error in errors.Take(5)) + { + Console.WriteLine($" - {error.Description}"); + } + } + + Console.WriteLine(); + Console.WriteLine("═══════════════════════════════════════════════════════════════"); + Console.WriteLine(" SUMMARY"); + Console.WriteLine("═══════════════════════════════════════════════════════════════"); + + bool hasMoves = moveFromCount > 0 && moveToCount > 0; + bool hasDelIns = delCount > 0 || insCount > 0; + bool noDuplicates = duplicates.Count == 0; + bool validXml = errors.Count == 0; + + Console.WriteLine($" Move detection working: {(hasMoves ? "✅ YES" : "⚠️ NO")}"); + Console.WriteLine($" Has regular del/ins: {(hasDelIns ? "✅ YES" : "❌ NO")}"); + Console.WriteLine($" No duplicate IDs: {(noDuplicates ? "✅ PASS" : "❌ FAIL")}"); + Console.WriteLine($" OpenXML validation: {(validXml ? "✅ PASS" : "⚠️ ISSUES")}"); + Console.WriteLine(); + + if (hasMoves && hasDelIns && noDuplicates) + { + Console.WriteLine("🎉 Issue #96 is FIXED! 
Document has both moves AND del/ins with unique IDs."); + Console.WriteLine($" Open the file in Word to verify: {outputPath}"); + } + else if (!hasMoves) + { + Console.WriteLine("ℹ️ No moves were detected (similarity threshold not met)."); + Console.WriteLine(" See the duplicate-ID result above for the uniqueness check."); + } + } + + static WmlDocument CreateDocument(string[] paragraphs) + { + using var stream = new MemoryStream(); + using (var doc = WordprocessingDocument.Create(stream, WordprocessingDocumentType.Document)) + { + var mainPart = doc.AddMainDocumentPart(); + mainPart.Document = new Document( + new Body( + paragraphs.Select(text => + new Paragraph( + new Run( + new Text(text) + ) + ) + ) + ) + ); + + var stylesPart = mainPart.AddNewPart<StyleDefinitionsPart>(); + stylesPart.Styles = new Styles( + new DocDefaults( + new RunPropertiesDefault( + new RunPropertiesBaseStyle( + new RunFonts { Ascii = "Calibri" }, + new FontSize { Val = "22" } + ) + ), + new ParagraphPropertiesDefault() + ) + ); + + var settingsPart = mainPart.AddNewPart<DocumentSettingsPart>(); + settingsPart.Settings = new Settings(); + + doc.Save(); + } + + stream.Position = 0; + return new WmlDocument("test.docx", stream.ToArray()); + } +} diff --git a/TestFiles/Issue96/Issue96ValidationTest.cs b/TestFiles/Issue96/Issue96ValidationTest.cs new file mode 100644 index 0000000..8fd82ef --- /dev/null +++ b/TestFiles/Issue96/Issue96ValidationTest.cs @@ -0,0 +1,277 @@ +// Issue #96 Validation Test +// This test creates documents with moves and validates that: +// 1. Move detection works correctly +// 2. All revision IDs are unique (the core Issue #96 fix) +// 3. 
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Xml.Linq;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using DocumentFormat.OpenXml.Validation;
using Docxodus;

/// <summary>
/// Console harness for Issue #96. Each scenario builds two small documents,
/// compares them with move detection enabled, writes the result next to the
/// executable, and reports on revision-ID uniqueness, move markup counts and
/// OpenXML schema validation. The process exit code is set to 1 when an ID
/// collision or an exception is observed, so scripts can detect failure.
/// </summary>
class Issue96ValidationTest
{
    static readonly XNamespace W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    /// <summary>Runs the three built-in scenarios and prints where the output files were written.</summary>
    static void Main(string[] args)
    {
        Console.WriteLine("=== Issue #96 Validation Test ===\n");

        // Falls back to the current directory when no directory can be derived
        // from the assembly location.
        var outputDir = Path.GetDirectoryName(typeof(Issue96ValidationTest).Assembly.Location)
            ?? Directory.GetCurrentDirectory();

        // Test 1: Simple paragraph swap (single move)
        Console.WriteLine("Test 1: Simple paragraph swap");
        RunTest("SimpleSwap", outputDir,
            new[] {
                "The quick brown fox jumps over the lazy dog.",
                "Pack my box with five dozen liquor jugs.",
                "How vexingly quick daft zebras jump."
            },
            new[] {
                "Pack my box with five dozen liquor jugs.",
                "The quick brown fox jumps over the lazy dog.",
                "How vexingly quick daft zebras jump."
            });

        // Test 2: Move with additional changes (the Issue #96 scenario)
        Console.WriteLine("\nTest 2: Move with additional ins/del changes");
        RunTest("MoveWithChanges", outputDir,
            new[] {
                "First paragraph that will be moved to the end.",
                "Second paragraph that stays but gets modified.",
                "Third paragraph that will be deleted entirely.",
                "Fourth paragraph that remains unchanged."
            },
            new[] {
                "Second paragraph that was modified here today.",
                "Fourth paragraph that remains unchanged.",
                "First paragraph that will be moved to the end.",
                "Fifth paragraph that is completely new."
            });

        // Test 3: Multiple independent moves
        Console.WriteLine("\nTest 3: Multiple content blocks");
        RunTest("MultipleBlocks", outputDir,
            new[] {
                "Alpha paragraph with enough words for detection.",
                "Beta paragraph with sufficient content here.",
                "Gamma paragraph stays in the same position.",
                "Delta paragraph with more words for testing."
            },
            new[] {
                "Gamma paragraph stays in the same position.",
                "Beta paragraph with sufficient content here.",
                "Alpha paragraph with enough words for detection.",
                "Delta paragraph with more words for testing."
            });

        Console.WriteLine("\n=== All Tests Complete ===");
        Console.WriteLine($"Output files written to: {outputDir}");
    }

    /// <summary>
    /// Compares two generated documents with native move markup enabled, saves the
    /// result as "&lt;testName&gt;_Compared.docx" in <paramref name="outputDir"/>, and
    /// prints the ID-uniqueness, move-count and schema-validation findings.
    /// Exceptions are reported on the console rather than propagated so the
    /// remaining scenarios still run; they do mark the process as failed.
    /// </summary>
    static void RunTest(string testName, string outputDir, string[] doc1Paragraphs, string[] doc2Paragraphs)
    {
        try
        {
            // Create test documents
            var doc1 = CreateDocument(doc1Paragraphs);
            var doc2 = CreateDocument(doc2Paragraphs);

            // Compare with move detection enabled
            var settings = new WmlComparerSettings
            {
                DetectMoves = true,
                SimplifyMoveMarkup = false,  // Keep native move markup
                MoveSimilarityThreshold = 0.8,
                MoveMinimumWordCount = 3,
                AuthorForRevisions = "Issue96Test"
            };

            var compared = WmlComparer.Compare(doc1, doc2, settings);

            // Save the comparison document
            var outputPath = Path.Combine(outputDir, $"{testName}_Compared.docx");
            File.WriteAllBytes(outputPath, compared.DocumentByteArray);
            Console.WriteLine($" ✓ Saved: {testName}_Compared.docx");

            // Validate ID uniqueness
            var (isValid, details) = ValidateRevisionIds(compared);
            if (isValid)
            {
                Console.WriteLine($" ✓ All revision IDs are unique");
            }
            else
            {
                Console.WriteLine($" ✗ ID COLLISION DETECTED: {details}");
                Environment.ExitCode = 1;
            }

            // Check for move elements
            var moveInfo = AnalyzeMoveElements(compared);
            Console.WriteLine($" ✓ Move elements: {moveInfo.MoveFromCount} moveFrom, {moveInfo.MoveToCount} moveTo");
            Console.WriteLine($" ✓ Del/Ins elements: {moveInfo.DelCount} del, {moveInfo.InsCount} ins");
            Console.WriteLine($" ✓ Move names: {string.Join(", ", moveInfo.MoveNames)}");

            // Run OpenXML validation (reported as a warning only; it does not fail the run)
            var validationErrors = ValidateDocument(compared);
            if (validationErrors.Count == 0)
            {
                Console.WriteLine($" ✓ OpenXML validation passed");
            }
            else
            {
                Console.WriteLine($" ⚠ OpenXML validation: {validationErrors.Count} issues");
                foreach (var error in validationErrors.Take(3))
                {
                    Console.WriteLine($" - {error}");
                }
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine($" ✗ ERROR: {ex.Message}");
            Environment.ExitCode = 1;
        }
    }

    /// <summary>
    /// Builds a minimal in-memory .docx (one single-run paragraph per input string,
    /// a styles part with document defaults only, and an empty settings part) and
    /// returns it as a <see cref="WmlDocument"/> named "test.docx".
    /// </summary>
    static WmlDocument CreateDocument(string[] paragraphs)
    {
        using var stream = new MemoryStream();
        using (var doc = WordprocessingDocument.Create(stream, WordprocessingDocumentType.Document))
        {
            var mainPart = doc.AddMainDocumentPart();
            mainPart.Document = new Document(
                new Body(
                    paragraphs.Select(text =>
                        new Paragraph(
                            new Run(
                                new Text(text)
                            )
                        )
                    )
                )
            );

            var stylesPart = mainPart.AddNewPart<StyleDefinitionsPart>();
            stylesPart.Styles = new Styles(
                new DocDefaults(
                    new RunPropertiesDefault(
                        new RunPropertiesBaseStyle(
                            new RunFonts { Ascii = "Calibri" },
                            new FontSize { Val = "22" }
                        )
                    ),
                    new ParagraphPropertiesDefault()
                )
            );

            var settingsPart = mainPart.AddNewPart<DocumentSettingsPart>();
            settingsPart.Settings = new Settings();

            doc.Save();
        }

        stream.Position = 0;
        return new WmlDocument("test.docx", stream.ToArray());
    }

    /// <summary>
    /// Collects the w:id of every ins/del/moveFrom/moveTo/rPrChange element in the
    /// main document, footnotes and endnotes parts and checks that no value occurs
    /// more than once across all of them.
    /// </summary>
    /// <returns>
    /// <c>IsValid</c> is true when every collected ID is distinct; <c>Details</c> is a
    /// summary on success or a "; "-separated list of colliding IDs with the
    /// element type and part of each user on failure.
    /// </returns>
    static (bool IsValid, string Details) ValidateRevisionIds(WmlDocument doc)
    {
        using var stream = new MemoryStream(doc.DocumentByteArray);
        using var wDoc = WordprocessingDocument.Open(stream, false);

        // Range start/end markers are deliberately not collected here, so any
        // repeated ID among these elements is a genuine collision.
        var revisionElements = new[] { "ins", "del", "moveFrom", "moveTo", "rPrChange" };
        var allIds = new List<(string Id, string ElementType, string Location)>();

        // Adds every revision ID found in one part; a missing part contributes nothing.
        void Collect(OpenXmlPart part, string location)
        {
            if (part == null)
            {
                return;
            }

            var xDoc = part.GetXDocument();
            foreach (var elemName in revisionElements)
            {
                foreach (var elem in xDoc.Descendants(W + elemName))
                {
                    var id = elem.Attribute(W + "id")?.Value;
                    if (id != null)
                    {
                        allIds.Add((id, elemName, location));
                    }
                }
            }
        }

        var mainPart = wDoc.MainDocumentPart;
        Collect(mainPart, "MainDocument");
        Collect(mainPart.FootnotesPart, "Footnotes");
        Collect(mainPart.EndnotesPart, "Endnotes");

        var duplicates = allIds.GroupBy(x => x.Id)
            .Where(g => g.Count() > 1)
            .ToList();

        if (duplicates.Count == 0)
        {
            return (true, $"All {allIds.Count} revision IDs are unique");
        }

        var details = string.Join("; ", duplicates.Select(g =>
            $"ID={g.Key} used by: {string.Join(", ", g.Select(x => $"{x.ElementType}@{x.Location}"))}"));
        return (false, details);
    }

    /// <summary>
    /// Counts moveFrom, moveTo, del and ins elements in the main document part and
    /// lists the distinct w:name values found on moveFromRangeStart elements.
    /// </summary>
    static (int MoveFromCount, int MoveToCount, int DelCount, int InsCount, List<string> MoveNames)
        AnalyzeMoveElements(WmlDocument doc)
    {
        using var stream = new MemoryStream(doc.DocumentByteArray);
        using var wDoc = WordprocessingDocument.Open(stream, false);

        var mainXDoc = wDoc.MainDocumentPart.GetXDocument();

        var moveFromCount = mainXDoc.Descendants(W + "moveFrom").Count();
        var moveToCount = mainXDoc.Descendants(W + "moveTo").Count();
        var delCount = mainXDoc.Descendants(W + "del").Count();
        var insCount = mainXDoc.Descendants(W + "ins").Count();

        var moveNames = mainXDoc.Descendants(W + "moveFromRangeStart")
            .Select(e => e.Attribute(W + "name")?.Value)
            .Where(n => n != null)
            .Distinct()
            .ToList();

        return (moveFromCount, moveToCount, delCount, insCount, moveNames);
    }

    /// <summary>
    /// Validates the document against the Office 2019 schema and returns at most
    /// ten findings, each formatted as "ErrorType: Description". Because of the
    /// cap, the returned count is a lower bound once it reaches ten.
    /// </summary>
    static List<string> ValidateDocument(WmlDocument doc)
    {
        using var stream = new MemoryStream(doc.DocumentByteArray);
        using var wDoc = WordprocessingDocument.Open(stream, false);

        var validator = new OpenXmlValidator(FileFormatVersions.Office2019);
        return validator.Validate(wDoc)
            .Select(e => $"{e.ErrorType}: {e.Description}")
            .Take(10)
            .ToList();
    }
}
// Simple test to show move markup is working correctly
using System;
using System.IO;
using System.Linq;
using System.Xml.Linq;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using Docxodus;

/// <summary>
/// Demo program: compares two three-paragraph documents whose first two
/// paragraphs are swapped, dumps the move / del / ins markup found in the
/// result body, checks the revision IDs for duplicates and saves the output
/// as "Issue96_MoveDemo.docx" in the current directory.
/// </summary>
class ShowMoveXml
{
    static void Main()
    {
        Console.WriteLine("Creating test documents...\n");

        // Create doc1: A, B, C
        var doc1 = CreateDoc(
            "This is paragraph A with enough words for move detection.",
            "This is paragraph B with sufficient content here.",
            "This is paragraph C with more words added."
        );

        // Create doc2: B, A, C (A moved after B)
        var doc2 = CreateDoc(
            "This is paragraph B with sufficient content here.",
            "This is paragraph A with enough words for move detection.",
            "This is paragraph C with more words added."
        );

        Console.WriteLine("Doc1: [A] [B] [C]");
        Console.WriteLine("Doc2: [B] [A] [C] (A moved after B)\n");

        var settings = new WmlComparerSettings
        {
            DetectMoves = true,
            SimplifyMoveMarkup = false,
            MoveSimilarityThreshold = 0.8,
            MoveMinimumWordCount = 3
        };

        var compared = WmlComparer.Compare(doc1, doc2, settings);

        // Extract and display the body XML
        using var stream = new MemoryStream(compared.DocumentByteArray);
        using var wDoc = WordprocessingDocument.Open(stream, false);

        var body = wDoc.MainDocumentPart.Document.Body;
        var bodyXml = XElement.Parse(body.OuterXml);

        XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

        // Prints the count of a range-start element followed by each one's id and name.
        void DumpRangeStarts(string prefix, string localName)
        {
            var starts = bodyXml.Descendants(w + localName).ToList();
            Console.WriteLine($"{prefix}{localName}: {starts.Count}");
            foreach (var e in starts)
            {
                Console.WriteLine($" id={e.Attribute(w + "id")?.Value}, name={e.Attribute(w + "name")?.Value}");
            }
        }

        // Prints the count of a move container element followed by each one's id
        // and the (truncated) concatenation of its w:t descendants.
        void DumpMoveContainers(string prefix, string localName)
        {
            var containers = bodyXml.Descendants(w + localName).ToList();
            Console.WriteLine($"{prefix}{localName}: {containers.Count}");
            foreach (var e in containers)
            {
                var text = string.Join("", e.Descendants(w + "t").Select(t => t.Value));
                Console.WriteLine($" id={e.Attribute(w + "id")?.Value}, text=\"{Truncate(text, 50)}\"");
            }
        }

        Console.WriteLine("=== MOVE ELEMENTS FOUND ===\n");

        DumpRangeStarts("", "moveFromRangeStart");
        DumpMoveContainers("\n", "moveFrom");
        DumpRangeStarts("\n", "moveToRangeStart");
        DumpMoveContainers("\n", "moveTo");

        Console.WriteLine("\n=== DEL/INS ELEMENTS ===\n");

        var dels = bodyXml.Descendants(w + "del").ToList();
        var inss = bodyXml.Descendants(w + "ins").ToList();
        Console.WriteLine($"del: {dels.Count}");
        Console.WriteLine($"ins: {inss.Count}");

        Console.WriteLine("\n=== ALL REVISION IDs ===\n");

        // Sort numerically for display. A non-numeric id is sorted last instead of
        // aborting the dump, and parsing does not depend on the current culture.
        var allIds = new[] { "del", "ins", "moveFrom", "moveTo" }
            .SelectMany(name => bodyXml.Descendants(w + name)
                .Select(e => new { Type = name, Id = e.Attribute(w + "id")?.Value }))
            .Where(x => x.Id != null)
            .OrderBy(x => int.TryParse(
                x.Id,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var numericId) ? numericId : int.MaxValue)
            .ToList();

        foreach (var item in allIds)
        {
            Console.WriteLine($" {item.Type,-10} id={item.Id}");
        }

        // Check for duplicates
        var duplicates = allIds.GroupBy(x => x.Id).Where(g => g.Count() > 1).ToList();
        Console.WriteLine();
        if (duplicates.Any())
        {
            Console.WriteLine("❌ DUPLICATE IDs FOUND (BUG!):");
            foreach (var dup in duplicates)
            {
                Console.WriteLine($" ID {dup.Key}: {string.Join(", ", dup.Select(x => x.Type))}");
            }

            // Let calling scripts see the failure.
            Environment.ExitCode = 1;
        }
        else
        {
            Console.WriteLine("✅ All IDs are unique - Issue #96 is FIXED!");
        }

        // Save output file
        var outputPath = "Issue96_MoveDemo.docx";
        File.WriteAllBytes(outputPath, compared.DocumentByteArray);
        Console.WriteLine($"\n📄 Output saved: {Path.GetFullPath(outputPath)}");
    }

    /// <summary>
    /// Builds a minimal in-memory .docx with one single-run paragraph per argument,
    /// a styles part containing only document defaults, and an empty settings part.
    /// </summary>
    static WmlDocument CreateDoc(params string[] paragraphs)
    {
        using var stream = new MemoryStream();
        using (var doc = WordprocessingDocument.Create(stream, WordprocessingDocumentType.Document))
        {
            var mainPart = doc.AddMainDocumentPart();
            mainPart.Document = new Document(new Body(
                paragraphs.Select(t => new Paragraph(new Run(new Text(t))))));

            var stylesPart = mainPart.AddNewPart<StyleDefinitionsPart>();
            stylesPart.Styles = new Styles(new DocDefaults(
                new RunPropertiesDefault(new RunPropertiesBaseStyle(
                    new RunFonts { Ascii = "Calibri" }, new FontSize { Val = "22" })),
                new ParagraphPropertiesDefault()));

            var settingsPart = mainPart.AddNewPart<DocumentSettingsPart>();
            settingsPart.Settings = new Settings();
            doc.Save();
        }
        stream.Position = 0;
        return new WmlDocument("test.docx", stream.ToArray());
    }

    /// <summary>Returns <paramref name="s"/> unchanged if it has at most <paramref name="max"/> characters; otherwise its first <paramref name="max"/> characters followed by "...".</summary>
    static string Truncate(string s, int max) =>
        s.Length <= max ? s : s.Substring(0, max) + "...";
}
// Issue #96 Stress Test
// Creates complex documents with dozens of moves and hundreds of changes
// to thoroughly validate the ID uniqueness fix

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Xml.Linq;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using DocumentFormat.OpenXml.Validation;
using Docxodus;

/// <summary>
/// Generates large pseudo-random document pairs (moves, deletions, insertions,
/// in-place modifications), compares them with move detection enabled and
/// checks that every revision element in the result carries a unique w:id.
/// All randomness, including the reference tokens embedded in paragraph text,
/// is drawn from one fixed-seed generator so that runs are repeatable.
/// The process exit code is set to 1 if any scenario finds duplicate IDs.
/// </summary>
class StressTest
{
    static readonly Random Rng = new Random(42);  // Fixed seed for reproducibility
    static readonly XNamespace W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    static void Main()
    {
        const string Rule = "═══════════════════════════════════════════════════════════════════════";

        Console.WriteLine("╔════════════════════════════════════════════════════════════════════╗");
        Console.WriteLine("║ ISSUE #96 STRESS TEST ║");
        Console.WriteLine("║ Dozens of moves + Hundreds of changes ║");
        Console.WriteLine("╚════════════════════════════════════════════════════════════════════╝\n");

        // Test 1: Large document with many moves
        Console.WriteLine(Rule);
        Console.WriteLine("TEST 1: 50 paragraphs, ~15 moves, ~30 other changes");
        Console.WriteLine(Rule + "\n");
        RunStressTest("StressTest1", 50, 15, 30);

        // Test 2: Even larger with more chaos
        Console.WriteLine("\n" + Rule);
        Console.WriteLine("TEST 2: 100 paragraphs, ~25 moves, ~50 other changes");
        Console.WriteLine(Rule + "\n");
        RunStressTest("StressTest2", 100, 25, 50);

        // Test 3: Maximum chaos
        Console.WriteLine("\n" + Rule);
        Console.WriteLine("TEST 3: 200 paragraphs, ~40 moves, ~100 other changes");
        Console.WriteLine(Rule + "\n");
        RunStressTest("StressTest3", 200, 40, 100);

        Console.WriteLine("\n════════════════════════════════════════════════════════════════════════");
        Console.WriteLine(" ALL STRESS TESTS COMPLETE");
        Console.WriteLine("════════════════════════════════════════════════════════════════════════\n");
    }

    /// <summary>
    /// Runs one stress scenario: generates <paramref name="paragraphCount"/> paragraphs,
    /// derives a modified copy, compares the two documents, writes
    /// "&lt;name&gt;_Output.docx" to the current directory and prints the analysis.
    /// </summary>
    /// <param name="name">Scenario name; used for the output file and the verdict line.</param>
    /// <param name="paragraphCount">Number of paragraphs in the original document.</param>
    /// <param name="moveCount">Upper bound on the number of paragraph moves applied.</param>
    /// <param name="changeCount">Budget split evenly (integer thirds) between deletions, insertions and modifications.</param>
    static void RunStressTest(string name, int paragraphCount, int moveCount, int changeCount)
    {
        Console.WriteLine($"Generating {paragraphCount} paragraphs...");

        // Generate original document with numbered paragraphs containing unique content
        var originalParagraphs = Enumerable.Range(1, paragraphCount)
            .Select(i => GenerateParagraph(i))
            .ToList();

        var modifiedParagraphs = BuildModifiedParagraphs(originalParagraphs, moveCount, changeCount);

        Console.WriteLine($"\nCreating documents...");
        Console.WriteLine($" Original: {originalParagraphs.Count} paragraphs");
        Console.WriteLine($" Modified: {modifiedParagraphs.Count} paragraphs");

        var doc1 = CreateDocument(originalParagraphs);
        var doc2 = CreateDocument(modifiedParagraphs);

        Console.WriteLine($"\nComparing with DetectMoves=true...");

        var settings = new WmlComparerSettings
        {
            DetectMoves = true,
            SimplifyMoveMarkup = false,
            MoveSimilarityThreshold = 0.75,  // Slightly lower threshold to catch more moves
            MoveMinimumWordCount = 5,
            AuthorForRevisions = "StressTest"
        };

        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        var compared = WmlComparer.Compare(doc1, doc2, settings);
        stopwatch.Stop();

        Console.WriteLine($" Comparison completed in {stopwatch.ElapsedMilliseconds}ms");

        // Save output
        var outputPath = $"{name}_Output.docx";
        File.WriteAllBytes(outputPath, compared.DocumentByteArray);
        Console.WriteLine($"\n📄 Output: {Path.GetFullPath(outputPath)}");

        // Analyze results
        Console.WriteLine($"\n--- ANALYSIS ---\n");

        using var stream = new MemoryStream(compared.DocumentByteArray);
        using var wDoc = WordprocessingDocument.Open(stream, false);
        var mainXDoc = wDoc.MainDocumentPart.GetXDocument();

        // Count elements
        var stats = new Dictionary<string, int>
        {
            ["moveFrom"] = mainXDoc.Descendants(W + "moveFrom").Count(),
            ["moveTo"] = mainXDoc.Descendants(W + "moveTo").Count(),
            ["moveFromRangeStart"] = mainXDoc.Descendants(W + "moveFromRangeStart").Count(),
            ["moveFromRangeEnd"] = mainXDoc.Descendants(W + "moveFromRangeEnd").Count(),
            ["moveToRangeStart"] = mainXDoc.Descendants(W + "moveToRangeStart").Count(),
            ["moveToRangeEnd"] = mainXDoc.Descendants(W + "moveToRangeEnd").Count(),
            ["del"] = mainXDoc.Descendants(W + "del").Count(),
            ["ins"] = mainXDoc.Descendants(W + "ins").Count(),
            ["rPrChange"] = mainXDoc.Descendants(W + "rPrChange").Count(),
        };

        Console.WriteLine("Element Counts:");
        Console.WriteLine($" Move elements:");
        Console.WriteLine($" moveFrom: {stats["moveFrom"],4}");
        Console.WriteLine($" moveTo: {stats["moveTo"],4}");
        Console.WriteLine($" moveFromRangeStart:{stats["moveFromRangeStart"],4}");
        Console.WriteLine($" moveToRangeStart: {stats["moveToRangeStart"],4}");
        Console.WriteLine($" Revision elements:");
        Console.WriteLine($" del: {stats["del"],4}");
        Console.WriteLine($" ins: {stats["ins"],4}");
        Console.WriteLine($" rPrChange: {stats["rPrChange"],4}");

        // Collect all revision IDs
        var revisionElements = new[] { "ins", "del", "moveFrom", "moveTo", "rPrChange" };
        var allIds = new List<(string Id, string Type)>();

        foreach (var elemName in revisionElements)
        {
            foreach (var elem in mainXDoc.Descendants(W + elemName))
            {
                var id = elem.Attribute(W + "id")?.Value;
                if (id != null)
                {
                    allIds.Add((id, elemName));
                }
            }
        }

        Console.WriteLine($"\n Total revision elements with IDs: {allIds.Count}");

        // Check for duplicates - THE CRITICAL TEST
        var duplicates = allIds.GroupBy(x => x.Id)
            .Where(g => g.Count() > 1)
            .ToList();

        if (duplicates.Count > 0)
        {
            Console.WriteLine($"\n ❌ DUPLICATE IDs FOUND - ISSUE #96 BUG!");
            foreach (var dup in duplicates.Take(10))
            {
                Console.WriteLine($" ID={dup.Key}: {string.Join(", ", dup.Select(x => x.Type))}");
            }
            if (duplicates.Count > 10)
            {
                Console.WriteLine($" ... and {duplicates.Count - 10} more duplicates");
            }
        }
        else
        {
            Console.WriteLine($"\n ✅ All {allIds.Count} revision IDs are UNIQUE!");
        }

        // Check move name pairing
        var moveFromNames = mainXDoc.Descendants(W + "moveFromRangeStart")
            .Select(e => e.Attribute(W + "name")?.Value)
            .Where(n => !string.IsNullOrEmpty(n))
            .ToList();

        var moveToNames = mainXDoc.Descendants(W + "moveToRangeStart")
            .Select(e => e.Attribute(W + "name")?.Value)
            .Where(n => !string.IsNullOrEmpty(n))
            .ToList();

        var uniqueMoveNames = moveFromNames.Union(moveToNames).Distinct().ToList();
        Console.WriteLine($"\n Move names: {uniqueMoveNames.Count} unique ({string.Join(", ", uniqueMoveNames.Take(10))}{(uniqueMoveNames.Count > 10 ? "..." : "")})");

        // Validate all names are paired
        var unpairedFrom = moveFromNames.Except(moveToNames).ToList();
        var unpairedTo = moveToNames.Except(moveFromNames).ToList();

        if (unpairedFrom.Any() || unpairedTo.Any())
        {
            Console.WriteLine($" ⚠️ Unpaired move names found:");
            if (unpairedFrom.Any())
            {
                Console.WriteLine($" From without To: {string.Join(", ", unpairedFrom)}");
            }
            if (unpairedTo.Any())
            {
                Console.WriteLine($" To without From: {string.Join(", ", unpairedTo)}");
            }
        }
        else if (uniqueMoveNames.Count > 0)
        {
            Console.WriteLine($" ✅ All move names properly paired!");
        }

        // OpenXML validation
        var validator = new OpenXmlValidator(FileFormatVersions.Office2019);
        var errors = validator.Validate(wDoc).ToList();

        if (errors.Count == 0)
        {
            Console.WriteLine($"\n ✅ OpenXML validation PASSED");
        }
        else
        {
            Console.WriteLine($"\n ⚠️ OpenXML validation: {errors.Count} issues");
            foreach (var error in errors.Take(5))
            {
                // Only mark the description as elided when it really was cut.
                var description = error.Description.Length > 80
                    ? error.Description.Substring(0, 80) + "..."
                    : error.Description;
                Console.WriteLine($" {error.ErrorType}: {description}");
            }
        }

        // Final verdict: only duplicate IDs fail the scenario.
        Console.WriteLine($"\n--- VERDICT ---");
        bool passed = duplicates.Count == 0;
        if (passed)
        {
            Console.WriteLine($" 🎉 {name} PASSED - No duplicate IDs with {stats["moveFrom"]} moves and {stats["del"] + stats["ins"]} del/ins");
        }
        else
        {
            Console.WriteLine($" 💥 {name} FAILED - Found {duplicates.Count} duplicate IDs");
            Environment.ExitCode = 1;
        }
    }

    /// <summary>
    /// Returns a copy of <paramref name="originalParagraphs"/> with random moves,
    /// deletions, insertions and word-level modifications applied, in that order,
    /// and prints how many of each were actually performed.
    /// </summary>
    static List<string> BuildModifiedParagraphs(List<string> originalParagraphs, int moveCount, int changeCount)
    {
        int paragraphCount = originalParagraphs.Count;
        var modifiedParagraphs = new List<string>(originalParagraphs);

        // Apply moves: pick random paragraphs and move them to new positions.
        // availableForMove holds the current indices of paragraphs not yet moved.
        Console.WriteLine($"Applying ~{moveCount} moves...");
        int movesApplied = 0;
        var availableForMove = Enumerable.Range(0, modifiedParagraphs.Count).ToList();
        for (int i = 0; i < moveCount && availableForMove.Count > 2; i++)
        {
            int fromIdx = availableForMove[Rng.Next(availableForMove.Count)];
            availableForMove.Remove(fromIdx);

            var para = modifiedParagraphs[fromIdx];
            modifiedParagraphs.RemoveAt(fromIdx);

            // Adjust available indices after removal
            availableForMove = availableForMove.Select(x => x > fromIdx ? x - 1 : x).ToList();

            int toIdx = Rng.Next(modifiedParagraphs.Count + 1);
            modifiedParagraphs.Insert(toIdx, para);

            // Adjust available indices after insertion
            availableForMove = availableForMove.Select(x => x >= toIdx ? x + 1 : x).ToList();

            movesApplied++;
        }

        // Apply deletions; never shrink below half of the original paragraph count.
        int deleteCount = changeCount / 3;
        Console.WriteLine($"Applying ~{deleteCount} deletions...");
        int deletionsApplied = 0;
        for (int i = 0; i < deleteCount && modifiedParagraphs.Count > paragraphCount / 2; i++)
        {
            modifiedParagraphs.RemoveAt(Rng.Next(modifiedParagraphs.Count));
            deletionsApplied++;
        }

        // Apply insertions
        int insertCount = changeCount / 3;
        Console.WriteLine($"Applying ~{insertCount} insertions...");
        for (int i = 0; i < insertCount; i++)
        {
            int idx = Rng.Next(modifiedParagraphs.Count + 1);
            modifiedParagraphs.Insert(idx, $"[NEW-{i + 1}] This is a newly inserted paragraph with enough words to be meaningful. " +
                $"It contains various content including technical terms, legal jargon, and general prose. " +
                $"The purpose is to test the comparison engine with substantial insertions. Reference: INS-{NextHex(16).ToLowerInvariant()}");
        }

        // Apply modifications (change words in existing paragraphs)
        int modifyCount = changeCount / 3;
        Console.WriteLine($"Applying ~{modifyCount} modifications...");
        int modificationsApplied = 0;
        for (int i = 0; i < modifyCount && modifiedParagraphs.Count > 0; i++)
        {
            int idx = Rng.Next(modifiedParagraphs.Count);
            var para = modifiedParagraphs[idx]
                .Replace("paragraph", "section")
                .Replace("content", "material")
                .Replace("document", "file");

            // The same paragraph may be picked more than once; tag it only once.
            if (!para.Contains("[MODIFIED]"))
            {
                para = "[MODIFIED] " + para;
            }
            modifiedParagraphs[idx] = para;
            modificationsApplied++;
        }

        Console.WriteLine($" Applied: {movesApplied} moves, {deletionsApplied} deletions, {insertCount} insertions, {modificationsApplied} modifications");
        return modifiedParagraphs;
    }

    /// <summary>
    /// Returns <paramref name="byteCount"/> bytes drawn from the shared fixed-seed
    /// generator as an upper-case hex string (two characters per byte).
    /// </summary>
    static string NextHex(int byteCount)
    {
        var bytes = new byte[byteCount];
        Rng.NextBytes(bytes);
        return Convert.ToHexString(bytes);
    }

    /// <summary>
    /// Produces the text of paragraph <paramref name="index"/>: one of fifteen
    /// templates (selected by index modulo the template count) filled with the
    /// index and a "DOC-nnnn-XXXXXXXX" reference whose suffix comes from the
    /// fixed-seed generator.
    /// </summary>
    static string GenerateParagraph(int index)
    {
        var templates = new[]
        {
            "Paragraph {0}: This document section contains important information about the project requirements and specifications. Reference ID: {1}",
            "Section {0}: The following content describes the technical implementation details for the proposed system architecture. Doc: {1}",
            "Item {0}: According to the agreement dated herein, the parties shall comply with all terms and conditions specified. Contract: {1}",
            "Clause {0}: The licensee agrees to use the software only for purposes permitted under this license agreement. License: {1}",
            "Article {0}: This paragraph establishes the fundamental principles governing the relationship between the entities. Ref: {1}",
            "Point {0}: The data processing activities shall be conducted in accordance with applicable privacy regulations. GDPR: {1}",
            "Note {0}: All modifications to this document must be tracked and approved by the designated review committee. Rev: {1}",
            "Entry {0}: The financial statements have been prepared in accordance with generally accepted accounting principles. GAAP: {1}",
            "Record {0}: This memorandum summarizes the key decisions made during the executive committee meeting. Minutes: {1}",
            "Statement {0}: The undersigned hereby certifies that all information provided is true and accurate. Cert: {1}",
            "Provision {0}: Notwithstanding the foregoing, the obligations set forth herein shall survive termination. Legal: {1}",
            "Stipulation {0}: The contractor shall deliver all work products by the specified deadline. Deadline: {1}",
            "Requirement {0}: The system shall support concurrent users and maintain response times under load. Perf: {1}",
            "Specification {0}: All API endpoints must implement proper authentication and authorization. Security: {1}",
            "Definition {0}: For purposes of this agreement, the following terms shall have the meanings ascribed. Terms: {1}",
        };

        var template = templates[index % templates.Length];
        return string.Format(template, index, $"DOC-{index:D4}-{NextHex(4)}");
    }

    /// <summary>
    /// Builds a minimal in-memory .docx (one single-run paragraph per entry, a
    /// styles part with document defaults only, and an empty settings part) and
    /// returns it as a <see cref="WmlDocument"/> named "test.docx".
    /// </summary>
    static WmlDocument CreateDocument(List<string> paragraphs)
    {
        using var stream = new MemoryStream();
        using (var doc = WordprocessingDocument.Create(stream, WordprocessingDocumentType.Document))
        {
            var mainPart = doc.AddMainDocumentPart();
            mainPart.Document = new Document(
                new Body(
                    paragraphs.Select(text =>
                        new Paragraph(
                            new Run(
                                new Text(text)
                            )
                        )
                    )
                )
            );

            var stylesPart = mainPart.AddNewPart<StyleDefinitionsPart>();
            stylesPart.Styles = new Styles(
                new DocDefaults(
                    new RunPropertiesDefault(
                        new RunPropertiesBaseStyle(
                            new RunFonts { Ascii = "Calibri" },
                            new FontSize { Val = "22" }
                        )
                    ),
                    new ParagraphPropertiesDefault()
                )
            );

            var settingsPart = mainPart.AddNewPart<DocumentSettingsPart>();
            settingsPart.Settings = new Settings();

            doc.Save();
        }

        stream.Position = 0;
        return new WmlDocument("test.docx", stream.ToArray());
    }
}