BechdalTest/IMSDB_Parser.py at master · amrakm/BechdalTest · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
from bs4 import BeautifulSoup
import requests
from collections import defaultdict, Counter
from io import StringIO
import datetime
import re

import logging
# Module-wide logger: getLogger() without a name returns the root logger.
# basicConfig routes records of level INFO and above to the "parsing.log" file.
logger = logging.getLogger()
logging.basicConfig(filename="parsing.log", level=logging.INFO)


class IMSDB_Parser:

    '''
    A parser for the IMSDB website. Provides helper functions to extract scenes and characters from a movie script in plain text.

    Parameters
    ----------
    movie_title : string
        title of the movie, passed to the constructor

    '''


    def __init__(self, movie_title, timeout=30):
        '''
        Download a movie script from IMSDB and parse it into lines, characters and scenes.

        Both downloads are best effort: a failure is logged as a warning and the
        affected attributes are left as None instead of raising.

        Parameters
        ----------
        movie_title : string
            title used to build the URLs; spaces become "-" in the script URL
            and "%20" in the script details URL
        timeout : number, optional
            seconds passed to requests.get for each download (default 30), so an
            unresponsive server cannot block the constructor forever

        Assign to self
        -------
        scrtext: element with class 'scrtext' found in the script page, or None
        release_date: datetime.datetime parsed with the '%B %Y' format, or None
        lines: list of the non blank lines of the script text, or None
        characters, characters_sequence, scenes:
                filled by parse_characters() and parse_scenes() when at least
                one line was extracted, otherwise None
        '''

        script_url = "https://www.imsdb.com" + '/scripts/' + movie_title.replace(" ", "-") + ".html"
        script_details_url = "https://www.imsdb.com" + "/Movie%20Scripts/" + movie_title.replace(" ", "%20") + "%20Script.html"

        # every attribute exists from the start, whatever happens below
        self.characters = None
        self.characters_sequence = None
        self.scenes = None
        self.lines = None
        self.scrtext = None
        self.release_date = None

        try:
            r = requests.get(script_url, timeout=timeout)
            soup = BeautifulSoup(r.text, 'html.parser')
            self.scrtext = soup.find(class_='scrtext')

        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not swallowed
            self.scrtext = None
            logger.warning('failed to parse {} movie'.format(movie_title))

        try:
            r = requests.get(script_details_url, timeout=timeout)
            soup = BeautifulSoup(r.text, 'html.parser')
            movie_info = soup.find(class_='script-details').get_text()
            # keep the text from the first occurrence of 'Date', then take what
            # sits between the first and second ':' on that line
            # NOTE(review): this uses the first 'Date' label on the page - confirm
            # that it is the release date and not another date field
            movie_release_date_lines = movie_info[movie_info.find('Date'):]
            movie_release_date = movie_release_date_lines.split("\n")[0].split(":")[1].strip()
            self.release_date = datetime.datetime.strptime(movie_release_date, '%B %Y')

        except Exception:
            logger.warning('failed to fetch release date for {} movie'.format(movie_title))
            self.release_date = None

        if self.scrtext:

            # drop blank lines, keep the original indentation of the others
            self.lines = [line for line in self.scrtext.get_text().split('\n') if len(line.strip()) > 0]

            if self.lines:

                # characters first: parse_scenes() relies on self.characters
                self.parse_characters()
                self.parse_scenes()


    def parse_scenes(self):
        '''
        Assign to self
        -------
        scene: list of lists
                list of scenes, each scene is a list of lines
        characters_sequence: list of lists
                contains actors appearing in each scene

        '''

        scenes = [[]]
        characters_sequence = []

        for line in self.lines:
            # append a new scene when any of these scene separators is found
            if line.strip()[:4] in ('INT.', 'EXT.', 'ESP.', 'EST.', 'SFX ', 'SFX:', 'VFX:', 'LATE') or 'CONTINUE' in line:

                scenes.append([line])
                if len(scenes) > 1:
                    # append to the character sequence from the last extracted scene
                    characters_sequence.append(self.who_is_in_the_scene(scenes[-2]))

            else:
                scenes[-1].append(line)

        if len(scenes) > 1:
            characters_sequence.append(self.who_is_in_the_scene(scenes[-2]))

        self.characters_sequence = characters_sequence
        self.scenes = scenes

        return


    def clean_character_string(self, x):
        '''
        Remove anything between parentheses and any non alphabetic character
        '''
        character = re.sub(r'''\([^)]*\)''', '',  x).strip('VOICE').strip('\r')
        return re.sub("[^a-zA-Z]+", '', character.strip())

    def parse_characters(self):

        '''
        Finds all characters in a movie


        Returns
        -------
        characters: set of all characters in the movie

        '''
        characters = []

        for line in self.lines:
            if line.strip().isupper() and \
                    line.strip()[:4] not in ('INT.', 'EXT.', 'ESP.', 'EST.', 'SFX ', 'SFX:', 'VFX:') and 'CONTINUED' not in line:

                characters.append(self.clean_character_string(line))

        self.characters = set(characters)

        return


    def who_is_in_the_scene(self, scene):
        '''
        Finds all characters that appears in a specifc scene

        Parameters
        ----------
        scene: list
            list of lines in the scene


        Returns
        -------
        characters_in_the_scene: list
            list of characters appear in this scene

        '''
        characters_in_the_scene = []

        for line in scene:
            if line.strip().isupper():
                line_cleaned = self.clean_character_string(line)

                if line_cleaned in self.characters and line_cleaned not in characters_in_the_scene:
                    characters_in_the_scene.append(line_cleaned)

        return characters_in_the_scene


    def dialog_from_scene(self, scene):
        '''
        Extract dialog text from scene plain text
        This is done by finding the two most frequent indentations, then select only the second largest indenation

        Parameters:
        ----------
        scene: list
            list of lines in the scene

        Returns:
        -------
        string: extracted dialog as a single string

        '''

        non_titles = [line for line in  scene if not line.lstrip()[:2].isupper() and len(line.strip()) != 0]

        if non_titles:

            most_freq_indentations = sorted(Counter([len(line) - len(line.lstrip()) for line in non_titles]).items(), key=lambda x:-x[0])
            dialog_indentation = most_freq_indentations[-2:][0][0]
            return " ".join([line.strip() for line in non_titles if len(line) - len(line.lstrip()) == dialog_indentation])
        else:
            return ""