From c6c30d1dcf4f8bda73e9ee60a3944c6ded12aec1 Mon Sep 17 00:00:00 2001 From: jason9693 Date: Tue, 18 Jul 2023 17:38:03 +0000 Subject: [PATCH] fix carriage return removed --- dps/spark/prep/korean_prep.py | 28 +++++++++- dps/spark/prep/lang_agnostic_prep.py | 84 ++++++++++++++++++---------- 2 files changed, 80 insertions(+), 32 deletions(-) diff --git a/dps/spark/prep/korean_prep.py b/dps/spark/prep/korean_prep.py index 1e5f557..ee8c01a 100644 --- a/dps/spark/prep/korean_prep.py +++ b/dps/spark/prep/korean_prep.py @@ -160,17 +160,18 @@ def pattern(idx): def remove_html_tags(text: str): def clean_space(text): - text = re.sub("[\r\n\f\v\t]", " ", text) + # text = re.sub("[\r\n\f\v\t]", " ", text) while " " in text: text = text.replace(" ", " ") return text.strip() if bool(BeautifulSoup(text, "html.parser").find()): try: - processed_html = html2text.html2text(text) + pre_process = text.replace("\n", "
") + processed_html = html2text.html2text(pre_process) except AssertionError: processed_html = text - + text = processed_html text = clean_space(text) @@ -252,3 +253,24 @@ def make_compat(text): text = unicodedata.normalize("NFC", text) text = re.sub("[\u1100-\u11FF]", "", text) return text + + + +def __test__(): + test_text = "안녕하세요.\n제 이름은 양기창 입니다.
반가워요" + print(test_text) + + function_list = [ + reduce_emoticon, + replace_korean_pii, + spam_words_filter, + remove_html_tags, + ] + + for func in function_list: + test_text = func(test_text) + print(f"Function name: {func.__name__} \n{test_text}\n\n") + + +if __name__ == "__main__": + __test__() \ No newline at end of file diff --git a/dps/spark/prep/lang_agnostic_prep.py b/dps/spark/prep/lang_agnostic_prep.py index f2b0351..ccb51dc 100644 --- a/dps/spark/prep/lang_agnostic_prep.py +++ b/dps/spark/prep/lang_agnostic_prep.py @@ -111,36 +111,40 @@ def replace_email_and_url(text: str): def remove_repeated_text(input_text, ngram_range=(3, 13), trial=3): + # TODO: Algorithm is wrong. Need to fix. def _remove_repeated_phrase(input_text, ngram_range): - words = input_text.split() - repeated_part_spans = [] - - for i, word in enumerate(words): - prev_ngrams = { - j: " ".join(words[i - j : i]) - for j in range(ngram_range[0], ngram_range[1] + 1) - } - next_ngrams = { - j: " ".join(words[i + 1 : i + j + 1]) - for j in range(ngram_range[0], ngram_range[1] + 1) - } - - for j, (prev_ngram, next_ngram) in enumerate( - zip(prev_ngrams.values(), next_ngrams.values()) - ): - if prev_ngram == next_ngram: - repeated_part_spans.append(((i - j, i), (i + 1, i + j + 1))) - - for word_pos, word in enumerate(words): - for span in repeated_part_spans: - if word_pos in range(span[0][0], span[0][1]) or word_pos in range( - span[1][0], span[1][1] - ): - words[word_pos] = "" - - input_text = " ".join(words) - input_text = re.sub(r"\s+", " ", input_text) - return input_text.strip(), repeated_part_spans + # words = input_text.replace('\n', '
').split() + # repeated_part_spans = [] + + # for i, word in enumerate(words): + # prev_ngrams = { + # j: " ".join(words[i - j : i]) + # for j in range(ngram_range[0], ngram_range[1] + 1) + # } + # next_ngrams = { + # j: " ".join(words[i + 1 : i + j + 1]) + # for j in range(ngram_range[0], ngram_range[1] + 1) + # } + + # for j, (prev_ngram, next_ngram) in enumerate( + # zip(prev_ngrams.values(), next_ngrams.values()) + # ): + # if prev_ngram == next_ngram: + # repeated_part_spans.append(((i - j, i), (i + 1, i + j + 1))) + + # for word_pos, word in enumerate(words): + # for span in repeated_part_spans: + # if word_pos in range(span[0][0], span[0][1]) or word_pos in range( + # span[1][0], span[1][1] + # ): + # print(word_pos, word, span) + # words[word_pos] = "" + + # print(words) + # input_text = " ".join(words) + # input_text = re.sub(r"\s+", " ", input_text) + # return input_text.strip(), repeated_part_spans + return input_text, [] def _remove_repeated_word_over_n_times(input_text, n=3): words = input_text.split() @@ -163,10 +167,32 @@ def _remove_repeated_word_over_n_times(input_text, n=3): input_text = re.sub(r"\s+", " ", input_text) return input_text + input_text = input_text.replace('\n', '
') total_len_spans = 0 for _ in range(trial): input_text, spans = _remove_repeated_phrase(input_text, ngram_range) total_len_spans += len(spans) input_text = _remove_repeated_word_over_n_times(input_text) + input_text = input_text.replace('
', '\n') return input_text + + +def __test__(): + func_list = [ + remove_whitespace, + process_html_and_uri_text, + replace_email_and_url, + remove_repeated_text, + ] + + text = "Hello\nMy name is Kevin.\nMy personal Ifno\nemail: ygdsag@gmail.com\n\nPhone:849-5432-1235\nBank Account:\n1234-1234-1234-1234\n\n" + + for func in func_list: + print(func.__name__) + print(func(text)) + print() + + +if __name__ == "__main__": + __test__() \ No newline at end of file