diff --git a/src/Cli/test/LangChain.Cli.IntegrationTests/LangChain.Cli.IntegrationTests.csproj b/src/Cli/test/LangChain.Cli.IntegrationTests/LangChain.Cli.IntegrationTests.csproj index dc237dbd..6f1943f4 100644 --- a/src/Cli/test/LangChain.Cli.IntegrationTests/LangChain.Cli.IntegrationTests.csproj +++ b/src/Cli/test/LangChain.Cli.IntegrationTests/LangChain.Cli.IntegrationTests.csproj @@ -7,6 +7,10 @@ enable + + + + diff --git a/src/Core/test/UnitTests/LangChain.Core.UnitTests.csproj b/src/Core/test/UnitTests/LangChain.Core.UnitTests.csproj index 35422a2e..2942cba7 100644 --- a/src/Core/test/UnitTests/LangChain.Core.UnitTests.csproj +++ b/src/Core/test/UnitTests/LangChain.Core.UnitTests.csproj @@ -4,6 +4,10 @@ net9.0 + + + + diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props index 9358d150..1ab22ceb 100644 --- a/src/Directory.Packages.props +++ b/src/Directory.Packages.props @@ -9,6 +9,7 @@ + all diff --git a/src/DocumentLoaders/IntegrationTests/LangChain.DocumentLoaders.IntegrationTests.csproj b/src/DocumentLoaders/IntegrationTests/LangChain.DocumentLoaders.IntegrationTests.csproj index d5e6c8c7..cacde49e 100644 --- a/src/DocumentLoaders/IntegrationTests/LangChain.DocumentLoaders.IntegrationTests.csproj +++ b/src/DocumentLoaders/IntegrationTests/LangChain.DocumentLoaders.IntegrationTests.csproj @@ -6,6 +6,10 @@ $(NoWarn) + + + + diff --git a/src/Meta/test/LangChain.IntegrationTests.csproj b/src/Meta/test/LangChain.IntegrationTests.csproj index 09d189ce..838985e9 100644 --- a/src/Meta/test/LangChain.IntegrationTests.csproj +++ b/src/Meta/test/LangChain.IntegrationTests.csproj @@ -6,6 +6,7 @@ + diff --git a/src/Splitters/Abstractions/src/Text/CharacterTextSplitter.cs b/src/Splitters/Abstractions/src/Text/CharacterTextSplitter.cs index b53d958d..ec9e5b7d 100644 --- a/src/Splitters/Abstractions/src/Text/CharacterTextSplitter.cs +++ b/src/Splitters/Abstractions/src/Text/CharacterTextSplitter.cs @@ -15,6 +15,8 @@ public override IReadOnlyList SplitText(string text) { text = text ?? throw new ArgumentNullException(nameof(text)); + text = text.Replace("\r", ""); // some people are using windows + List splits; if (separator != null) { diff --git a/src/Splitters/Abstractions/src/Text/MarkdownHeaderTextSplitter.cs b/src/Splitters/Abstractions/src/Text/MarkdownHeaderTextSplitter.cs index 18a4b706..67390e2d 100644 --- a/src/Splitters/Abstractions/src/Text/MarkdownHeaderTextSplitter.cs +++ b/src/Splitters/Abstractions/src/Text/MarkdownHeaderTextSplitter.cs @@ -76,7 +76,7 @@ public override IReadOnlyList SplitText(string text) { var existingHeader = currentHeader.Split('|'); - string prevHeader = string.Join("|", existingHeader.Take(existingHeader.Length - 1)); + string prevHeader = string.Join("|", existingHeader.Take(existingHeader.Length - (1 - hLen + currentHeaderLen))); currentHeader = prevHeader + "|" + strippedLine.TrimStart('#').Trim(); currentHeaderLen = hLen; continue; diff --git a/src/Splitters/Abstractions/src/Text/RecursiveCharacterTextSplitter.cs b/src/Splitters/Abstractions/src/Text/RecursiveCharacterTextSplitter.cs index 4e96e87f..85fcb75f 100644 --- a/src/Splitters/Abstractions/src/Text/RecursiveCharacterTextSplitter.cs +++ b/src/Splitters/Abstractions/src/Text/RecursiveCharacterTextSplitter.cs @@ -19,6 +19,8 @@ public override IReadOnlyList SplitText(string text) { text = text ?? throw new ArgumentNullException(nameof(text)); + text = text.Replace("\r", ""); // some people are using windows + List finalChunks = new List(); string separator = _separators[_separators.Count - 1]; diff --git a/src/Splitters/Abstractions/test/LangChain.Splitters.Abstractions.Tests.csproj b/src/Splitters/Abstractions/test/LangChain.Splitters.Abstractions.Tests.csproj index 208c95a9..b351aa60 100644 --- a/src/Splitters/Abstractions/test/LangChain.Splitters.Abstractions.Tests.csproj +++ b/src/Splitters/Abstractions/test/LangChain.Splitters.Abstractions.Tests.csproj @@ -4,6 +4,10 @@ net9.0 + + + + diff --git a/src/Splitters/Abstractions/test/Resources/markdown_test_material.md b/src/Splitters/Abstractions/test/Resources/markdown_test_material.md new file mode 100644 index 00000000..a5ef2b3c --- /dev/null +++ b/src/Splitters/Abstractions/test/Resources/markdown_test_material.md @@ -0,0 +1,71 @@ +# Header A + +Text A + +## Header A.A + +Text A.A + +## Header A.B + +Text A.B + +### Header A.B.A + +Text A.B.A + +### Header A.B.B + +Text A.B.B + +### Header A.B.C + +Text A.B.C + +## Header A.C + +Text A.C + +### Header A.C.A + +Text A.C.A + +### Header A.C.B + +Text A.C.B + +# Header B + +Text B + +## Header B.A + +Text B.A + +## Header B.B + +Text B.B + +### Header B.B.A + +Text B.B.A + +### Header B.B.B + +Text B.B.B + +## Header B.C + +Text B.C + +### Header B.C.A + +Text B.C.A + +### Header B.C.B + +Text B.C.B + +### Header B.C.C + +Text B.C.C \ No newline at end of file diff --git a/src/Splitters/Abstractions/test/Tests.MarkdownHeader.cs b/src/Splitters/Abstractions/test/Tests.MarkdownHeader.cs index 42f74eda..603fd88d 100644 --- a/src/Splitters/Abstractions/test/Tests.MarkdownHeader.cs +++ b/src/Splitters/Abstractions/test/Tests.MarkdownHeader.cs @@ -61,4 +61,69 @@ Hi this is Joe res[0].Should().Be("Hi this is Jim\nHi this is Joe"); res[1].Should().Be("Hi this is Molly"); } + + [Test] + public void TestMarkdown4() + { + var md = H.Resources.markdown_test_material_md.AsString(); + + var splitter = new MarkdownHeaderTextSplitter(); + var res = splitter.SplitText(md); + + res.Count.Should().Be(18); + + res[0].Split("\n")[0].Should().Be("Header A"); + res[0].Split("\n")[1].Should().Be("Text A"); + + res[1].Split("\n")[0].Should().Be("Header A: Header A.A"); + res[1].Split("\n")[1].Should().Be("Text A.A"); + + res[2].Split("\n")[0].Should().Be("Header A: Header A.B"); + res[2].Split("\n")[1].Should().Be("Text A.B"); + + res[3].Split("\n")[0].Should().Be("Header A: Header A.B: Header A.B.A"); + res[3].Split("\n")[1].Should().Be("Text A.B.A"); + + res[4].Split("\n")[0].Should().Be("Header A: Header A.B: Header A.B.B"); + res[4].Split("\n")[1].Should().Be("Text A.B.B"); + + res[5].Split("\n")[0].Should().Be("Header A: Header A.B: Header A.B.C"); + res[5].Split("\n")[1].Should().Be("Text A.B.C"); + + res[6].Split("\n")[0].Should().Be("Header A: Header A.C"); + res[6].Split("\n")[1].Should().Be("Text A.C"); + + res[7].Split("\n")[0].Should().Be("Header A: Header A.C: Header A.C.A"); + res[7].Split("\n")[1].Should().Be("Text A.C.A"); + + res[8].Split("\n")[0].Should().Be("Header A: Header A.C: Header A.C.B"); + res[8].Split("\n")[1].Should().Be("Text A.C.B"); + + res[9].Split("\n")[0].Should().Be("Header B"); + res[9].Split("\n")[1].Should().Be("Text B"); + + res[10].Split("\n")[0].Should().Be("Header B: Header B.A"); + res[10].Split("\n")[1].Should().Be("Text B.A"); + + res[11].Split("\n")[0].Should().Be("Header B: Header B.B"); + res[11].Split("\n")[1].Should().Be("Text B.B"); + + res[12].Split("\n")[0].Should().Be("Header B: Header B.B: Header B.B.A"); + res[12].Split("\n")[1].Should().Be("Text B.B.A"); + + res[13].Split("\n")[0].Should().Be("Header B: Header B.B: Header B.B.B"); + res[13].Split("\n")[1].Should().Be("Text B.B.B"); + + res[14].Split("\n")[0].Should().Be("Header B: Header B.C"); + res[14].Split("\n")[1].Should().Be("Text B.C"); + + res[15].Split("\n")[0].Should().Be("Header B: Header B.C: Header B.C.A"); + res[15].Split("\n")[1].Should().Be("Text B.C.A"); + + res[16].Split("\n")[0].Should().Be("Header B: Header B.C: Header B.C.B"); + res[16].Split("\n")[1].Should().Be("Text B.C.B"); + + res[17].Split("\n")[0].Should().Be("Header B: Header B.C: Header B.C.C"); + res[17].Split("\n")[1].Should().Be("Text B.C.C"); + } } \ No newline at end of file diff --git a/src/Splitters/CSharp/test/LangChain.Splitters.CSharp.Tests.csproj b/src/Splitters/CSharp/test/LangChain.Splitters.CSharp.Tests.csproj index 9a8ca786..a544c7af 100644 --- a/src/Splitters/CSharp/test/LangChain.Splitters.CSharp.Tests.csproj +++ b/src/Splitters/CSharp/test/LangChain.Splitters.CSharp.Tests.csproj @@ -4,6 +4,10 @@ net9.0 + + + + diff --git a/src/Utilities/Postgres/test/LangChain.Utilities.Postgres.IntegrationTests.csproj b/src/Utilities/Postgres/test/LangChain.Utilities.Postgres.IntegrationTests.csproj index 5ab5ea0b..6e9dcfed 100644 --- a/src/Utilities/Postgres/test/LangChain.Utilities.Postgres.IntegrationTests.csproj +++ b/src/Utilities/Postgres/test/LangChain.Utilities.Postgres.IntegrationTests.csproj @@ -6,6 +6,7 @@ +