diff --git a/src/sentences/sentence_splitting.jl b/src/sentences/sentence_splitting.jl index 0f58147..8a89a5d 100644 --- a/src/sentences/sentence_splitting.jl +++ b/src/sentences/sentence_splitting.jl @@ -2,11 +2,10 @@ function rulebased_split_sentences(sentences) sentences = replace(sentences, r"([?!.])\s" => Base.SubstitutionString("\\1\n")) sentences = postproc_splits(sentences) - split(sentences, "\n") + split(sentences, "\n"; keepempty=false) end - function replace_til_no_change(input, pattern, replacement) while(occursin(pattern, input)) input = replace(input, pattern => replacement) @@ -38,7 +37,6 @@ function postproc_splits(sentences::AbstractString) # Before we do anything remove windows line-ends sentences = replace(sentences, "\r" => "") - # breaks sometimes missing after "?", "safe" cases sentences = replace(sentences, r"\b([a-z]+\?) ([A-Z][a-z]+)\b" => Base.SubstitutionString("\\1\n\\2")) # breaks sometimes missing after "." separated with extra space, "safe" cases