Skip to content

Commit 8b7114d

Browse files
committed
adding algos
1 parent 017e586 commit 8b7114d

12 files changed

Lines changed: 631 additions & 0 deletions

src/my_project/interviews/nlp_coding_questions/round_1/1_word_frequency_counter.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/01_word_frequency_counter.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/2_text_cleaning_and_tokenization.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/02_text_cleaning_and_tokenization.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/3_stopword_removal.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/03_stopword_removal.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/4_sentiment_analysis.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/04_sentiment_analysis.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/5_named_entity_recognition_ner.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/05_named_entity_recognition_ner.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/6_text_similarity.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/06_text_similarity.py

File renamed without changes.
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
from sklearn.decomposition import LatentDirichletAllocation
2+
from sklearn.feature_extraction.text import CountVectorizer
3+
from typing import List
4+
5+
6+
class TopicModeling:
    """Topic modeling using scikit-learn's Latent Dirichlet Allocation.

    Wraps a ``CountVectorizer`` + ``LatentDirichletAllocation`` pipeline:
    call :meth:`fit` on raw documents, then :meth:`get_topics` or
    :meth:`print_topics` to inspect the discovered topics.
    """

    def __init__(self, n_topics: int = 5, max_features: int = 1000, random_state: int = 42):
        """
        Initialize the TopicModeling class.

        Args:
            n_topics: Number of topics to extract.
            max_features: Maximum number of features for vectorization.
            random_state: Random seed for reproducibility.
        """
        self.n_topics = n_topics
        self.max_features = max_features
        self.random_state = random_state
        # English stop words are dropped so topics are not dominated by
        # high-frequency function words.
        self.vectorizer = CountVectorizer(stop_words='english', max_features=self.max_features)
        self.lda_model = LatentDirichletAllocation(n_components=self.n_topics, random_state=self.random_state)

    def preprocess_and_vectorize(self, documents: List[str]):
        """
        Preprocess and vectorize the input documents.

        Args:
            documents: List of text documents.

        Returns:
            Sparse document-term count matrix.
        """
        doc_term_matrix = self.vectorizer.fit_transform(documents)
        return doc_term_matrix

    def fit(self, documents: List[str]):
        """
        Fit the LDA model to the input documents.

        Args:
            documents: List of text documents.
        """
        doc_term_matrix = self.preprocess_and_vectorize(documents)
        self.lda_model.fit(doc_term_matrix)

    def get_topics(self, n_words: int = 10) -> List[List[str]]:
        """
        Extract topics and their top words.

        Fix: removed leftover debug ``print`` calls that dumped the entire
        vocabulary and raw topic-weight arrays to stdout on every call.

        Args:
            n_words: Number of top words to return per topic.

        Returns:
            One list of top words per topic, highest-weight first.
        """
        feature_names = self.vectorizer.get_feature_names_out()
        topics = []
        for topic in self.lda_model.components_:
            # argsort is ascending; the reversed slice takes the n_words
            # highest-weight vocabulary indices in descending order.
            top_indices = topic.argsort()[:-n_words - 1:-1]
            topics.append([feature_names[i] for i in top_indices])
        return topics

    def print_topics(self, n_words: int = 10):
        """
        Print the topics and their top words.

        Args:
            n_words: Number of top words to display per topic.
        """
        topics = self.get_topics(n_words)
        for idx, topic in enumerate(topics):
            print(f"Topic {idx + 1}: {', '.join(topic)}")
79+
80+
81+
# Example usage
if __name__ == "__main__":
    # Small ML-themed sample corpus for the demo run.
    sample_docs = [
        "Machine learning algorithms are used for data analysis and prediction",
        "Deep learning neural networks process large amounts of data",
        "Python programming language is popular for machine learning",
        "Natural language processing helps computers understand human language",
        "Computer vision enables machines to interpret visual information",
        "Reinforcement learning agents learn through trial and error",
        "Data science combines statistics and programming for insights",
        "Artificial intelligence systems can solve complex problems",
        "Big data analytics requires distributed computing frameworks",
        "Neural networks mimic biological brain structure and function"
    ]

    # Fit a 3-topic LDA model on the sample corpus.
    modeler = TopicModeling(n_topics=3)
    modeler.fit(sample_docs)

    # Show the two strongest words per topic.
    print("\nExtracted Topics:")
    modeler.print_topics(n_words=2)
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import torch
2+
import torch.nn as nn
3+
import torch.optim as optim
4+
import random
5+
6+
class CharRNN(nn.Module):
    """Character-level language model: embedding -> vanilla RNN -> linear projection to vocab logits."""

    def __init__(self, vocab_size, hidden_size=128, num_layers=1):
        super().__init__()
        # Embedding width matches the RNN hidden width so the layers chain directly.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, x, hidden):
        """Map (batch, seq) index tensor to (batch, seq, vocab) logits plus new hidden state."""
        embedded = self.embedding(x)
        rnn_out, hidden = self.rnn(embedded, hidden)
        logits = self.fc(rnn_out)
        return logits, hidden

    def init_hidden(self, batch_size):
        """Return a fresh all-zero hidden state of shape (num_layers, batch, hidden)."""
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)
23+
24+
def build_vocab(text):
    """Build char->index and index->char maps over the distinct characters of *text*, in sorted order."""
    alphabet = sorted(set(text))
    char2idx = {}
    idx2char = {}
    for index, symbol in enumerate(alphabet):
        char2idx[symbol] = index
        idx2char[index] = symbol
    return char2idx, idx2char
29+
30+
def prepare_data(text, char2idx, seq_length):
    """Slide a window over *text*, yielding (input indices, next-char target indices) pairs.

    Targets are the input window shifted one character to the right.
    Returns an empty list when the text is not longer than seq_length.
    """
    samples = []
    for start in range(len(text) - seq_length):
        input_chunk = text[start:start + seq_length]
        target_chunk = text[start + 1:start + seq_length + 1]
        samples.append((
            [char2idx[c] for c in input_chunk],
            [char2idx[c] for c in target_chunk],
        ))
    return samples
39+
40+
def train(text, seq_length=40, epochs=50, lr=0.005):
    """Train a CharRNN language model on *text* with per-sequence SGD (batch size 1).

    Args:
        text: Training corpus as a single string.
        seq_length: Length of each input window in characters.
        epochs: Number of full passes over the windowed dataset.
        lr: Adam learning rate.

    Returns:
        Tuple of (trained model, char->index map, index->char map, device).
    """
    char2idx, idx2char = build_vocab(text)
    vocab_size = len(char2idx)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CharRNN(vocab_size).to(device)
    data = prepare_data(text, char2idx, seq_length)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        total_loss = 0
        # Reshuffle the sample order each epoch.
        random.shuffle(data)
        for seq_idx, target_idx in data:
            # Wrap each sample in a batch dimension of 1.
            inputs = torch.tensor([seq_idx], dtype=torch.long).to(device)
            targets = torch.tensor([target_idx], dtype=torch.long).to(device)
            # Fresh zero hidden state per sequence; state is not carried across windows.
            hidden = model.init_hidden(1).to(device)
            outputs, _ = model(inputs, hidden)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for CrossEntropyLoss.
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Report mean per-sample loss every 10 epochs.
        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}, Loss: {total_loss/len(data):.4f}")
    return model, char2idx, idx2char, device
64+
65+
def generate(model, char2idx, idx2char, seed, length=200, temperature=1.0, device='cpu'):
    """Sample *length* characters from *model*, continuing from *seed*.

    Fix vs. original: the original started from a zero hidden state and fed
    only the LAST seed character, so the rest of the seed never conditioned
    generation. Here the full seed is first run through the RNN to prime the
    hidden state. Sampling also runs under ``torch.no_grad()``.

    Args:
        model: Trained CharRNN (or any module with the same interface).
        char2idx: Character -> index map; unknown characters fall back to index 0.
        idx2char: Index -> character map.
        seed: Priming text; must be non-empty.
        length: Number of characters to generate.
        temperature: Softmax temperature (<1 sharper, >1 more random).
        device: Device the model lives on.

    Returns:
        seed plus the generated continuation, as a single string.
    """
    model.eval()
    chars = list(seed)
    hidden = model.init_hidden(1).to(device)
    with torch.no_grad():
        # Prime the hidden state on every seed character except the last;
        # the last one is consumed by the first sampling step below.
        for ch in chars[:-1]:
            primer = torch.tensor([[char2idx.get(ch, 0)]], dtype=torch.long).to(device)
            _, hidden = model(primer, hidden)
        for _ in range(length):
            input_idx = torch.tensor([[char2idx.get(chars[-1], 0)]], dtype=torch.long).to(device)
            output, hidden = model(input_idx, hidden)
            # Temperature-scaled logits of the final time step.
            output = output[0, -1] / temperature
            probs = torch.softmax(output, dim=0).cpu().numpy()
            next_idx = random.choices(range(len(probs)), weights=probs)[0]
            chars.append(idx2char[next_idx])
    return ''.join(chars)
77+
78+
if __name__ == "__main__":
79+
corpus = (
80+
"The quick brown fox jumps over the lazy dog. "
81+
"Machine learning is fun. "
82+
"PyTorch makes building neural networks easy. "
83+
"Text generation with RNNs is simple."
84+
) * 5
85+
model, char2idx, idx2char, device = train(corpus, seq_length=40, epochs=50)
86+
print("\nGenerated text:\n")
87+
print(generate(model, char2idx, idx2char, seed="The quick", length=200, temperature=0.8, device=device))

0 commit comments

Comments
 (0)