-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIMSDB_Parser.py
More file actions
191 lines (129 loc) · 5.76 KB
/
IMSDB_Parser.py
File metadata and controls
191 lines (129 loc) · 5.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
from bs4 import BeautifulSoup
import requests
from collections import defaultdict, Counter
from io import StringIO
import datetime
import re
import logging
# Route log records of level INFO and above to the file "parsing.log".
# Per the logging docs, basicConfig() leaves the root logger untouched when
# it already has handlers configured.
logging.basicConfig(filename="parsing.log", level=logging.INFO)

# Module-wide handle on the root logger (getLogger() called without a name).
logger = logging.getLogger()
class IMSDB_Parser:
    '''
    A parser for the IMSDB website. Provides helper functions to extract
    scenes and characters from a movie script in plain text.

    Parameters
    ----------
    movie_title : string
        movie title used to build the script URL (spaces become "-") and
        the script details URL (spaces become "%20")
    timeout : float, optional
        seconds to wait for each HTTP request before giving up (default 30)

    Attributes
    ----------
    scrtext : bs4 element or None
        element with class "scrtext" found on the script page; None when
        the request or parsing failed, or when no such element exists
    release_date : datetime.datetime or None
        release date parsed with the format "%B %Y" from the details page;
        None when it could not be fetched or parsed
    lines : list of strings or None
        non-blank lines of the script text
    characters : set of strings or None
        cleaned names of all characters found in the script
    scenes : list of lists or None
        list of scenes, each scene is a list of lines
    characters_sequence : list of lists or None
        characters appearing in each scene, aligned index by index with scenes
    '''

    # Compared against the first four characters of a stripped line to
    # recognise a scene heading.
    _SCENE_PREFIXES = ('INT.', 'EXT.', 'ESP.', 'EST.', 'SFX ', 'SFX:', 'VFX:', 'LATE')

    def __init__(self, movie_title, *, timeout=30):
        script_url = "https://www.imsdb.com" + '/scripts/' + movie_title.replace(" ", "-") + ".html"
        script_details_url = "https://www.imsdb.com" + "/Movie%20Scripts/" + movie_title.replace(" ", "%20") + "%20Script.html"
        self.characters = None
        self.characters_sequence = None
        self.scenes = None
        self.lines = None

        # Best effort: any failure while fetching/parsing is logged and leaves
        # the attribute as None. `except Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt and SystemExit propagate.
        try:
            soup = self._fetch_soup(script_url, timeout)
            self.scrtext = soup.find(class_='scrtext')
        except Exception:
            self.scrtext = None
            logger.warning('failed to parse %s movie', movie_title)

        try:
            soup = self._fetch_soup(script_details_url, timeout)
            movie_info = soup.find(class_='script-details').get_text()
            # Keep the text from the first occurrence of "Date", then take
            # what follows the first ":" on that line.
            date_lines = movie_info[movie_info.find('Date'):]
            date_text = date_lines.split("\n")[0].split(":")[1].strip()
            self.release_date = datetime.datetime.strptime(date_text, '%B %Y')
        except Exception:
            logger.warning('failed to fetch release date for %s movie', movie_title)
            self.release_date = None

        if self.scrtext is not None:
            self.lines = [line for line in self.scrtext.get_text().split('\n') if line.strip()]
        if self.lines:
            # Characters must be parsed first: parse_scenes() looks names up
            # in self.characters.
            self.parse_characters()
            self.parse_scenes()

    @staticmethod
    def _fetch_soup(url, timeout):
        '''
        Download url and return its body parsed with BeautifulSoup
        '''
        response = requests.get(url, timeout=timeout)
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _indentation(line):
        '''
        Number of leading whitespace characters in line
        '''
        return len(line) - len(line.lstrip())

    def _is_scene_separator(self, line):
        '''
        True when line starts a new scene: it begins with one of the scene
        heading prefixes or contains the text CONTINUE
        '''
        return line.strip()[:4] in self._SCENE_PREFIXES or 'CONTINUE' in line

    def parse_scenes(self):
        '''
        Split self.lines into scenes and record who appears in each scene

        Assign to self
        -------
        scenes: list of lists
            list of scenes, each scene is a list of lines; the first scene
            holds whatever precedes the first separator (possibly nothing)
        characters_sequence: list of lists
            contains characters appearing in each scene, one entry per scene
        '''
        scenes = [[]]
        for line in self.lines:
            if self._is_scene_separator(line):
                scenes.append([line])
            else:
                scenes[-1].append(line)
        self.scenes = scenes
        # Built after splitting so that every scene, including the last one,
        # gets exactly one entry.
        self.characters_sequence = [self.who_is_in_the_scene(scene) for scene in scenes]

    def clean_character_string(self, x):
        '''
        Remove anything between parentheses, a leading or trailing word
        VOICE, and any non alphabetic character

        Parameters
        ----------
        x: string
            raw script line holding a character name

        Returns
        -------
        string: the cleaned name, possibly empty
        '''
        character = re.sub(r'\([^)]*\)', '', x).strip()
        # str.strip('VOICE') would strip the *characters* V, O, I, C and E
        # (turning "ALICE" into "AL"), so the whole word is removed instead.
        character = re.sub(r'^VOICE\b|\bVOICE$', '', character)
        return re.sub(r'[^a-zA-Z]+', '', character)

    def parse_characters(self):
        '''
        Finds all characters in a movie

        A line is taken as a character name when it is upper case and is not
        a scene separator; names that are empty after cleaning are skipped.

        Assign to self
        -------
        characters: set of all characters in the movie
        '''
        characters = set()
        for line in self.lines:
            if not line.strip().isupper() or self._is_scene_separator(line):
                continue
            name = self.clean_character_string(line)
            if name:
                characters.add(name)
        self.characters = characters

    def who_is_in_the_scene(self, scene):
        '''
        Finds all characters that appear in a specific scene

        Parameters
        ----------
        scene: list
            list of lines in the scene

        Returns
        -------
        characters_in_the_scene: list
            characters appearing in this scene, without duplicates, in order
            of first appearance
        '''
        characters_in_the_scene = []
        for line in scene:
            if not line.strip().isupper():
                continue
            name = self.clean_character_string(line)
            if name in self.characters and name not in characters_in_the_scene:
                characters_in_the_scene.append(name)
        return characters_in_the_scene

    def dialog_from_scene(self, scene):
        '''
        Extract dialog text from scene plain text

        Non-blank lines whose first two non-space characters are not upper
        case are grouped by indentation; the lines at the second smallest
        indentation (or at the only indentation when there is just one) are
        returned.

        Parameters:
        ----------
        scene: list
            list of lines in the scene

        Returns:
        -------
        string: extracted dialog as a single string, empty when no line qualifies
        '''
        # NOTE(review): a line starting with e.g. "I " or "A " also counts as
        # upper case here and is dropped - confirm whether that is intended.
        non_titles = [
            line for line in scene
            if line.strip() and not line.lstrip()[:2].isupper()
        ]
        if not non_titles:
            return ""
        indentations = sorted({self._indentation(line) for line in non_titles})
        dialog_indentation = indentations[1] if len(indentations) > 1 else indentations[0]
        return " ".join(
            line.strip() for line in non_titles
            if self._indentation(line) == dialog_indentation
        )