VideoChatBot/main.py at main · avaish1409/VideoChatBot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python
# coding: utf-8

# audio to text
import speech_recognition as sr
import pyttsx3

# text to emotion
from text2emotion import get_emotion

# Chatbot
from chatterbot import ChatBot
from chatterbot.trainers import ListTrainer
from chatterbot.trainers import ChatterBotCorpusTrainer

# display reply
import threading
# import pyglet

# user emotion
from fer import FER
import matplotlib.pyplot as plt
import cv2
import os

# display gif
import imageio

# logging setup
import logging
# getLogger() without a name returns the root logger
logger = logging.getLogger()
# NOTE(review): DEBUG is the most verbose standard level, so this does not
# "hide" info-level records on the logger itself - confirm the intended level
logger.setLevel(logging.DEBUG)


# simple video-chat-bot
class VCBot():
    def __init__(self):
        """Create the speech recognizer, train the chatbot and start the TTS engine."""
        # flag - False while vcbot is displaying a reply (speaking something);
        # it is a module-level global shared with vid() and ensemble()
        global flag
        flag = True

        # audio to text
        self.r = sr.Recognizer()

        # chatbot named 'Charlie', trained on every corpus path that
        # get_samples() returns
        self.chatbot = ChatBot('Charlie')
        self.trainer = ChatterBotCorpusTrainer(self.chatbot)
        corpus_paths = self.get_samples()
        self.trainer.train(*corpus_paths)

        # text to speech, with the engine's 'rate' property set to 145
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 145)

        logger.info("VCbot initialized")

    def get_samples(self):
        __dir__ = os.path.dirname(os.path.realpath('__dir__'))
        sample_dir = os.path.join(__dir__, 'resources', 'samples')
        sample_list = [os.path.join(sample_dir, file)
                       for file in os.listdir(sample_dir)]
        return sample_list

    def getUserEmotion(self):
        """Grab one webcam frame and return the dominant facial emotion.

        Returns:
            The key with the highest score in the ``'emotions'`` dict of the
            first face reported by the FER detector, or ``'neutral'`` when no
            frame could be captured or no face was detected.
        """
        # Building the detector is expensive, so create it on first use and
        # keep it on the instance for later calls.
        detector = getattr(self, '_emotion_detector', None)
        if detector is None:
            detector = FER(mtcnn=True)
            self._emotion_detector = detector

        cap = cv2.VideoCapture(0)
        try:
            ret, frame = cap.read()
        finally:
            # always free the camera; this method runs once per conversation
            # turn and would otherwise leak a capture handle each time
            cap.release()

        if not ret or frame is None:
            logger.info("No frame captured from camera")
            return 'neutral'

        # Convert OpenCV's BGR frame to RGB in memory instead of writing a
        # temporary JPEG to the working directory and reading it back.
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        res = detector.detect_emotions(img)
        if not res:
            logger.info("No face detected")
            return 'neutral'
        res_emotion = res[0]['emotions']
        return max(res_emotion, key=res_emotion.get)

    def audioToText(self):
        """Record one utterance from the microphone and transcribe it.

        Returns:
            The lower-cased transcript from ``recognize_google``, or the
            literal string ``'error'`` when the recognizer raises
            ``RequestError`` or ``UnknownValueError``.
        """
        try:
            with sr.Microphone() as mic:
                # let the recognizer adjust its energy threshold to the
                # surrounding noise level, sampling for 0.2 seconds
                self.r.adjust_for_ambient_noise(mic, duration=0.2)

                # capture the user's input
                captured = self.r.listen(mic)

                # transcribe with the Google recognizer and normalise case
                transcript = self.r.recognize_google(captured)
                return transcript.lower()

        except sr.RequestError as e:
            print("Could not request results; {0}".format(e))
            logger.warning(
                "Aud2text: Could not request results; {0}".format(e))
            return 'error'

        except sr.UnknownValueError:
            print("unknown error occured")
            logger.warning("Aud2text: unknown error occured")
            return 'error'

    def getTextEmotion(self, t):
        """Return the highest-scoring emotion label for the text ``t``.

        The label is lower-cased; ``'neutral'`` is returned when even the
        top score reported by ``get_emotion`` is 0.
        """
        scores = get_emotion(t)
        top_label = max(scores, key=scores.get)
        return top_label.lower() if scores[top_label] != 0 else 'neutral'

    def getChatReply(self, q):
        # chatterbot reply for given text input
        return str(self.chatbot.get_response(q))

    def vid(self, lock):
        """Show the avatar GIF in an OpenCV window until 'q' is pressed.

        While the module-level ``flag`` is True only the first frame is
        shown; while it is False the frames are cycled. Pressing 'r' toggles
        ``flag`` by hand (debug aid).

        Args:
            lock: lock guarding the module-level ``flag`` that is also
                written by ``ensemble``.
        """
        global flag
        gif = imageio.mimread('./resources/boy-talk.gif')
        nums = len(gif)
        # convert every frame from RGB to BGR for display through OpenCV
        imgs = [cv2.cvtColor(img, cv2.COLOR_RGB2BGR) for img in gif]
        i = 0
        try:
            while True:
                # `with` releases the lock even if imshow raises, so the
                # other thread can never be left blocked on it
                with lock:
                    # no movement if not speaking, animated gif while speaking
                    cv2.imshow("gif", imgs[0] if flag else imgs[i])
                pressed = cv2.waitKey(25) & 0xFF
                if pressed == ord('q'):
                    # quit
                    logger.info("user requested to quit!")
                    break
                if pressed == ord('r'):
                    # just to check if gif is used in dynamic sense
                    with lock:
                        flag = not flag
                i = (i + 1) % nums
        finally:
            # close the window on normal exit and on errors alike
            cv2.destroyAllWindows()

    def SpeakText(self, command):
        # Initialize the engine to speak
        self.engine.say(command)
        self.engine.runAndWait()

    def ensemble(self, lock):
        """Run the conversation loop until the recognised text is 'exit'.

        Each turn combines: video-emotion, audio-to-text, text-emotion,
        emotion validation, chat-reply and speak-reply. When the facial
        emotion is not 'neutral' and differs from the text emotion, the
        sentence 'I am <emotion>' is appended to what the user said before
        it is sent to the chatbot.

        Args:
            lock: lock guarding the module-level ``flag`` that ``vid`` reads
                to decide whether to animate the avatar.
        """
        global flag
        txt = ''
        while txt != 'exit':
            video_emotion = self.getUserEmotion()
            logger.info("Video Emotion: " + video_emotion)
            self.SpeakText('Your Turn')
            txt = self.audioToText()
            logger.info("Audio To Text: " + txt)
            txt_emotion = self.getTextEmotion(txt)
            logger.info("Text Emotion: " + txt_emotion)
            txt_inference = ''
            if txt_emotion != video_emotion and video_emotion != 'neutral':
                txt_inference = 'I am ' + video_emotion
                logger.info("Added Text: " + txt_inference)
            # join with a space so the added sentence does not fuse with the
            # last spoken word (e.g. "how are youI am sad")
            query = ' '.join(part for part in (txt, txt_inference) if part)
            cbot_reply = self.getChatReply(query)
            logger.info("Chatbot Reply: " + cbot_reply)
            with lock:
                flag = False
            logger.info("Speaking something")
            try:
                self.SpeakText(cbot_reply)
            finally:
                # always restore the idle state, even if speaking fails,
                # so flag cannot stay False forever
                with lock:
                    flag = True

    def run(self):
        """Run the GIF display and the conversation loop in two threads.

        Both threads receive the same lock, are started in order (display
        first, conversation second) and are joined in that order before the
        completion message is printed and logged.
        """
        # thread lock for the critical section shared by both workers
        lock = threading.Lock()

        workers = [
            # display gif
            threading.Thread(target=self.vid, args=(lock,)),
            # ensemble (conversation loop)
            threading.Thread(target=self.ensemble, args=(lock,)),
        ]

        for worker in workers:
            worker.start()

        # wait until each thread is completely executed
        for worker in workers:
            worker.join()

        print("Done!")
        logger.info("Successfully completed execution, terminating vcbot!")


# VCBot().run()