-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocess.py
More file actions
103 lines (88 loc) · 2.76 KB
/
preprocess.py
File metadata and controls
103 lines (88 loc) · 2.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# This code is taken from https://github.com/kaggarwal/ClinicalNotesICU
from nltk import sent_tokenize, word_tokenize
import re
import pandas as pd
# Heading names that introduce a section when immediately followed by a colon.
# The order is kept as-is because it is the order in which the regex
# alternation tries the names.
_SECTION_HEADINGS = (
    'ABDOMEN AND PELVIS',
    'CLINICAL HISTORY',
    'CLINICAL INDICATION',
    'COMPARISON',
    'COMPARISON STUDY DATE',
    'EXAM',
    'EXAMINATION',
    'FINDINGS',
    'HISTORY',
    'IMPRESSION',
    'INDICATION',
    'MEDICAL CONDITION',
    'PROCEDURE',
    'REASON FOR EXAM',
    'REASON FOR STUDY',
    'REASON FOR THIS EXAMINATION',
    'TECHNIQUE',
)

# Matches "<heading>:" for any heading above, or the bare text "FINAL REPORT"
# (no colon required), case-insensitively.
SECTION_TITLES = re.compile(
    '(' + '|'.join(_SECTION_HEADINGS) + '):|FINAL REPORT',
    flags=re.IGNORECASE | re.MULTILINE,
)
def getSentences(t):
    """
    Return the sentences produced by ``preprocess_mimic`` for report ``t``
    as a list (the generator is fully consumed).
    """
    return [*preprocess_mimic(t)]
def pattern_repl(matchobj):
    """
    Return a replacement string to be used for match object.

    The replacement is a run of spaces exactly as long as the matched text,
    so substituting it keeps every other character at its original offset.

    :param matchobj: a ``re`` match object; only ``group(0)`` is read.
    :return: ``len(matchobj.group(0))`` space characters.
    """
    # `' ' * n` rather than `' '.rjust(n)`: rjust never returns fewer
    # characters than its receiver, so a zero-length match would yield one
    # space and change the overall length of the text.
    return ' ' * len(matchobj.group(0))
def clean_text(text):
    """
    Clean text.

    Blanks out, with spaces, three kinds of content:
    1. ``[**...**]`` placeholders (each replaced via ``pattern_repl``),
    2. underscores,
    3. everything from the end-of-report offset given by ``find_end``
       up to the end of the string.

    Characters that are kept stay at their original offsets.

    :param text: raw report text.
    :return: the cleaned text.
    """
    # Replace [**Patterns**] with spaces.
    text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    # Replace `_` with spaces.
    text = text.replace('_', ' ')
    end = find_end(text)
    # Keep the report body and pad the removed tail with spaces so the new
    # text has the same length as the old text.
    return text[:end] + ' ' * (len(text) - end)
def preprocess_mimic(text):
    """
    Preprocess reports in MIMIC-III, yielding one string per sentence.

    Steps:
    1. blank out [**Patterns**] and the signature (``clean_text``)
    2. split the report into sections (``split_heading``)
    3. tokenize each section into sentences, and each sentence into words
    4. join the words with single spaces and lowercase the result
    """
    cleaned = clean_text(text)
    for section in split_heading(cleaned):
        for sentence in sent_tokenize(section):
            tokens = word_tokenize(sentence)
            yield ' '.join(tokens).lower()
def split_heading(text):
    """
    Split the report into sections.

    Walks the ``SECTION_TITLES`` matches in order and yields, alternately,
    the text preceding each title and the title itself, followed by whatever
    remains after the last title. Every piece is stripped of surrounding
    whitespace, and pieces that are empty after stripping are skipped.
    """
    cursor = 0
    for title in SECTION_TITLES.finditer(text):
        # First the body that precedes this title, then the title itself.
        spans = ((cursor, title.start()), (title.start(), title.end()))
        for lo, hi in spans:
            piece = text[lo:hi].strip()
            if piece:
                yield piece
        cursor = title.end()
    # Trailing text after the final title (or the whole text if no title).
    remainder = text[cursor:].strip()
    if remainder:
        yield remainder
def find_end(text):
    """
    Find the end of the report.

    Searches ``text`` for each end-of-report marker and returns the smallest
    start offset among the markers that occur, or ``len(text)`` when none of
    them is found.
    """
    # (regex, flags) pairs; all but the last are matched case-insensitively.
    # NOTE(review): the `.` after `DR` is an unescaped wildcard, so any
    # character following "DR" matches; kept as-is to preserve behavior.
    markers = (
        (r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
        (r'\n {3,}DR.', re.I),
        (r'[ ]{1,}RADLINE ', re.I),
        (r'.*electronically signed on', re.I),
        (r'M\[0KM\[0KM', 0),
    )
    cutoff = len(text)
    for regex, flags in markers:
        hit = re.search(regex, text, flags)
        if hit is not None and hit.start() < cutoff:
            cutoff = hit.start()
    return cutoff