From 8b724a26544dca77babe552ccee7527977ca95cc Mon Sep 17 00:00:00 2001 From: Rohit Date: Fri, 11 Oct 2019 11:42:17 +0530 Subject: [PATCH 1/2] Filtering the empty strings from substring array --- src/sentences/sentence_splitting.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sentences/sentence_splitting.jl b/src/sentences/sentence_splitting.jl index 0f58147..7e10bcb 100644 --- a/src/sentences/sentence_splitting.jl +++ b/src/sentences/sentence_splitting.jl @@ -1,8 +1,8 @@ function rulebased_split_sentences(sentences) sentences = replace(sentences, r"([?!.])\s" => Base.SubstitutionString("\\1\n")) - sentences = postproc_splits(sentences) - split(sentences, "\n") + sentences = split(sentences, "\n") + filter!(e-> e ≠ "", sentences) end @@ -38,7 +38,6 @@ function postproc_splits(sentences::AbstractString) # Before we do anything remove windows line-ends sentences = replace(sentences, "\r" => "") - # breaks sometimes missing after "?", "safe" cases sentences = replace(sentences, r"\b([a-z]+\?) ([A-Z][a-z]+)\b" => Base.SubstitutionString("\\1\n\\2")) # breaks sometimes missing after "." separated with extra space, "safe" cases From cc6f76f93c93346b975f3773e68b046ba131fbf5 Mon Sep 17 00:00:00 2001 From: Rohit Date: Fri, 11 Oct 2019 16:14:08 +0530 Subject: [PATCH 2/2] Removing the Empty string from array --- src/sentences/sentence_splitting.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sentences/sentence_splitting.jl b/src/sentences/sentence_splitting.jl index 7e10bcb..8a89a5d 100644 --- a/src/sentences/sentence_splitting.jl +++ b/src/sentences/sentence_splitting.jl @@ -1,12 +1,11 @@ function rulebased_split_sentences(sentences) sentences = replace(sentences, r"([?!.])\s" => Base.SubstitutionString("\\1\n")) + sentences = postproc_splits(sentences) - sentences = split(sentences, "\n") - filter!(e-> e ≠ "", sentences) + split(sentences, "\n"; keepempty=false) end - function replace_til_no_change(input, pattern, replacement) while(occursin(pattern, input)) input = replace(input, pattern => replacement)