sentiment_dl_app.py
import streamlit as st
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
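
# This Streamlit app classifies the sentiment of user text with a BiLSTM over
# frozen pretrained word embeddings, then extends the input by picking tokens
# that are close (cosine similarity) to the last few words in embedding space.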


class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, embedding_matrix, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.embedding.weight.data.copy_(torch.tensor(embedding_matrix))
        self.embedding.weight.requires_grad = False  # set False to freeze
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        emb = self.embedding(x)
        output, (h, c) = self.lstm(emb)
        out = self.dropout(torch.cat((h[-2], h[-1]), dim=1))  # concat last hidden states
        return self.fc(out)
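
# Note on the forward pass: with a single-layer bidirectional LSTM, h has shape
# (2, batch, hidden_dim); h[-2] is the final hidden state of the forward direction
# and h[-1] of the backward direction, so the classifier sees a 2*hidden_dim vector.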


def tokenize(text):
    return text.split()  # already cleaned & lowercased


def text_to_indices(text, vocab):
    return [vocab.get(tok, vocab["<unk>"]) for tok in tokenize(text)]


def generate_tokens_from_last(last_word, vocab, embedding_matrix, num_tokens=10):
    id_to_word = {idx: word for word, idx in vocab.items()}
    emb_matrix_torch = torch.tensor(embedding_matrix)
    emb_norm = emb_matrix_torch / emb_matrix_torch.norm(dim=1, keepdim=True)
    current_word = last_word
    generated_tokens = []
    for _ in range(num_tokens):
        word_idx = vocab.get(current_word, vocab["<unk>"])
        word_vec = emb_norm[word_idx].unsqueeze(0)
        similarity = torch.mm(word_vec, emb_norm.T).squeeze(0)
        similarity[word_idx] = -1  # avoid picking itself
        next_word_idx = torch.argmax(similarity).item()
        next_word = id_to_word[next_word_idx]
        generated_tokens.append(next_word)
        current_word = next_word
    return generated_tokens
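
# Note: generate_tokens_from_last is a simple greedy walk in embedding space
# (each step jumps to the most cosine-similar word). It is not called by the
# Streamlit flow below, which uses a context-window variant inline instead.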


# load model, vocab, embedding matrix
with open('embedding_matrix.pkl', 'rb') as f:
    embedding_matrix = pickle.load(f)
with open('vocab_50d_freq10.pkl', 'rb') as v:
    vocab = pickle.load(v)
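
# Assumed (not checked here): embedding_matrix is array-like and accepted by
# torch.tensor, and vocab maps word -> index and contains the special tokens
# "<pad>" and "<unk>" used elsewhere in this file.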

EMBED_DIM = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BiLSTMClassifier(len(vocab), EMBED_DIM, hidden_dim=128, num_classes=1,
                         embedding_matrix=embedding_matrix, pad_idx=vocab["<pad>"]).to(device)
state_dict = torch.load("dl_sentiment_model_3.pth", map_location=device)
model.load_state_dict(state_dict)
model.to(device)

# Streamlit UI
st.title("Deep Sentiment Analysis App")
user_input = st.text_area("Enter text here:")

if st.button("Predict"):
    if user_input.strip():
        # Preprocess
        cleaned_text = user_input.lower().strip()
        # Convert to indices
        indices = text_to_indices(cleaned_text, vocab)
        if len(indices) == 0:
            st.write("Please enter valid text.")
        else:
            input_tensor = torch.tensor([indices], dtype=torch.long).to(device)

            # Prediction
            model.eval()
            with torch.no_grad():
                output = model(input_tensor)
                prob = torch.sigmoid(output).item()
                pred = 1 if prob >= 0.5 else 0

            if pred == 1:
                pred_label = "Positive"
                st.write("Wow, someone is in a good mood today!")
                st.write(f"**I can say this with:** {prob:.2f} confidence")
            else:
                pred_label = "Negative"
                st.write("Uh oh, someone is in a bad mood today. Don't worry, things will get better.")
                st.write(f"**I can say this with:** {1 - prob:.2f} confidence")
            # Generate tokens based on the last few words
            embedding_tensor = torch.tensor(embedding_matrix)
            id_to_word = {idx: word for word, idx in vocab.items()}

            # Parameters
            context_window = 3  # use last N tokens to define context
            top_k = 5           # candidates for sampling
            gen_len = 10        # tokens to generate

            # Track already-used words, seeded with all words from the original input
            used_words = set(cleaned_text.split())

            # Start with the input sequence
            current_sequence = indices.copy()
            generated_tokens = []

            for _ in range(gen_len):
                # Get indices of the last N tokens in the sequence
                context_indices = current_sequence[-context_window:]
                # Average their embeddings
                context_emb = embedding_tensor[context_indices].mean(dim=0, keepdim=True)
                # Find the top-k+1 most similar words (the context words themselves may be included)
                similarities = F.cosine_similarity(context_emb, embedding_tensor)
                top_indices = similarities.topk(top_k + 1).indices.tolist()
                # Filter out already-used words
                candidates = [idx for idx in top_indices if id_to_word[idx] not in used_words]
                # If no unused candidate is found, pick a random word from the vocab
                if not candidates:
                    chosen_idx = random.randint(0, len(vocab) - 1)
                else:
                    chosen_idx = random.choice(candidates[:top_k])  # random pick from the top-k
                # Save the chosen token
                chosen_word = id_to_word[chosen_idx]
                generated_tokens.append(chosen_word)
                used_words.add(chosen_word)
                # Append to the sequence
                current_sequence.append(chosen_idx)

            # Final text
            generated_text = cleaned_text + " " + " ".join(generated_tokens)
            st.write(f"Okay, I understand the vibe. I can extend your {pred_label} sentence:")
            st.write(generated_text)
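
# To run the app locally (assuming Streamlit and PyTorch are installed and the
# pickled artifacts plus dl_sentiment_model_3.pth sit next to this file):
#   streamlit run sentiment_dl_app.py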