Skip to content

Commit 8b7114d

Browse files
committed
adding algos
1 parent 017e586 commit 8b7114d

12 files changed

Lines changed: 631 additions & 0 deletions

src/my_project/interviews/nlp_coding_questions/round_1/1_word_frequency_counter.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/01_word_frequency_counter.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/2_text_cleaning_and_tokenization.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/02_text_cleaning_and_tokenization.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/3_stopword_removal.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/03_stopword_removal.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/4_sentiment_analysis.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/04_sentiment_analysis.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/5_named_entity_recognition_ner.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/05_named_entity_recognition_ner.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/6_text_similarity.py renamed to src/my_project/interviews/nlp_coding_questions/round_1/06_text_similarity.py

File renamed without changes.
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
from sklearn.decomposition import LatentDirichletAllocation
2+
from sklearn.feature_extraction.text import CountVectorizer
3+
from typing import List
4+
5+
6+
class TopicModeling:
    """Topic modeling using scikit-learn's Latent Dirichlet Allocation.

    Wraps a ``CountVectorizer`` + ``LatentDirichletAllocation`` pipeline:
    call :meth:`fit` on raw documents, then :meth:`get_topics` or
    :meth:`print_topics` to inspect the discovered topics.
    """

    def __init__(self, n_topics: int = 5, max_features: int = 1000, random_state: int = 42):
        """
        Initialize the TopicModeling class.

        Args:
            n_topics: Number of topics to extract.
            max_features: Maximum number of features for vectorization.
            random_state: Random seed for reproducibility.
        """
        self.n_topics = n_topics
        self.max_features = max_features
        self.random_state = random_state
        # English stop words are dropped so topics are not dominated by
        # high-frequency function words.
        self.vectorizer = CountVectorizer(stop_words='english', max_features=self.max_features)
        self.lda_model = LatentDirichletAllocation(n_components=self.n_topics, random_state=self.random_state)

    def preprocess_and_vectorize(self, documents: List[str]):
        """
        Preprocess and vectorize the input documents.

        Args:
            documents: List of text documents.

        Returns:
            Sparse document-term count matrix.
        """
        doc_term_matrix = self.vectorizer.fit_transform(documents)
        return doc_term_matrix

    def fit(self, documents: List[str]):
        """
        Fit the LDA model to the input documents.

        Args:
            documents: List of text documents.
        """
        doc_term_matrix = self.preprocess_and_vectorize(documents)
        self.lda_model.fit(doc_term_matrix)

    def get_topics(self, n_words: int = 10) -> List[List[str]]:
        """
        Extract topics and their top words.

        Fix: removed leftover debug ``print`` calls that dumped the entire
        vocabulary and raw topic-weight arrays to stdout on every call.

        Args:
            n_words: Number of top words to return per topic.

        Returns:
            One list of top words per topic, highest-weight first.
        """
        feature_names = self.vectorizer.get_feature_names_out()
        topics = []
        for topic in self.lda_model.components_:
            # argsort is ascending; the reversed slice takes the n_words
            # highest-weight vocabulary indices in descending order.
            top_indices = topic.argsort()[:-n_words - 1:-1]
            topics.append([feature_names[i] for i in top_indices])
        return topics

    def print_topics(self, n_words: int = 10):
        """
        Print the topics and their top words.

        Args:
            n_words: Number of top words to display per topic.
        """
        topics = self.get_topics(n_words)
        for idx, topic in enumerate(topics):
            print(f"Topic {idx + 1}: {', '.join(topic)}")
79+
80+
81+
# Example usage
if __name__ == "__main__":
    # Small ML-themed sample corpus for the demo run.
    sample_docs = [
        "Machine learning algorithms are used for data analysis and prediction",
        "Deep learning neural networks process large amounts of data",
        "Python programming language is popular for machine learning",
        "Natural language processing helps computers understand human language",
        "Computer vision enables machines to interpret visual information",
        "Reinforcement learning agents learn through trial and error",
        "Data science combines statistics and programming for insights",
        "Artificial intelligence systems can solve complex problems",
        "Big data analytics requires distributed computing frameworks",
        "Neural networks mimic biological brain structure and function"
    ]

    # Fit a 3-topic LDA model on the sample corpus.
    modeler = TopicModeling(n_topics=3)
    modeler.fit(sample_docs)

    # Show the two strongest words per topic.
    print("\nExtracted Topics:")
    modeler.print_topics(n_words=2)
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import torch
2+
import torch.nn as nn
3+
import torch.optim as optim
4+
import random
5+
6+
class CharRNN(nn.Module):
    """Character-level language model: embedding -> vanilla RNN -> linear projection to vocab logits."""

    def __init__(self, vocab_size, hidden_size=128, num_layers=1):
        super().__init__()
        # Embedding width matches the RNN hidden width so the layers chain directly.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, x, hidden):
        """Map (batch, seq) index tensor to (batch, seq, vocab) logits plus new hidden state."""
        embedded = self.embedding(x)
        rnn_out, hidden = self.rnn(embedded, hidden)
        logits = self.fc(rnn_out)
        return logits, hidden

    def init_hidden(self, batch_size):
        """Return a fresh all-zero hidden state of shape (num_layers, batch, hidden)."""
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)
23+
24+
def build_vocab(text):
    """Build char->index and index->char maps over the distinct characters of *text*, in sorted order."""
    alphabet = sorted(set(text))
    char2idx = {}
    idx2char = {}
    for index, symbol in enumerate(alphabet):
        char2idx[symbol] = index
        idx2char[index] = symbol
    return char2idx, idx2char
29+
30+
def prepare_data(text, char2idx, seq_length):
    """Slide a window over *text*, yielding (input indices, next-char target indices) pairs.

    Targets are the input window shifted one character to the right.
    Returns an empty list when the text is not longer than seq_length.
    """
    samples = []
    for start in range(len(text) - seq_length):
        input_chunk = text[start:start + seq_length]
        target_chunk = text[start + 1:start + seq_length + 1]
        samples.append((
            [char2idx[c] for c in input_chunk],
            [char2idx[c] for c in target_chunk],
        ))
    return samples
39+
40+
def train(text, seq_length=40, epochs=50, lr=0.005):
    """Train a CharRNN language model on *text* with per-sequence SGD (batch size 1).

    Args:
        text: Training corpus as a single string.
        seq_length: Length of each input window in characters.
        epochs: Number of full passes over the windowed dataset.
        lr: Adam learning rate.

    Returns:
        Tuple of (trained model, char->index map, index->char map, device).
    """
    char2idx, idx2char = build_vocab(text)
    vocab_size = len(char2idx)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CharRNN(vocab_size).to(device)
    data = prepare_data(text, char2idx, seq_length)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        total_loss = 0
        # Reshuffle the sample order each epoch.
        random.shuffle(data)
        for seq_idx, target_idx in data:
            # Wrap each sample in a batch dimension of 1.
            inputs = torch.tensor([seq_idx], dtype=torch.long).to(device)
            targets = torch.tensor([target_idx], dtype=torch.long).to(device)
            # Fresh zero hidden state per sequence; state is not carried across windows.
            hidden = model.init_hidden(1).to(device)
            outputs, _ = model(inputs, hidden)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for CrossEntropyLoss.
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Report mean per-sample loss every 10 epochs.
        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}, Loss: {total_loss/len(data):.4f}")
    return model, char2idx, idx2char, device
64+
65+
def generate(model, char2idx, idx2char, seed, length=200, temperature=1.0, device='cpu'):
    """Sample *length* characters from *model*, continuing from *seed*.

    Fix vs. original: the original started from a zero hidden state and fed
    only the LAST seed character, so the rest of the seed never conditioned
    generation. Here the full seed is first run through the RNN to prime the
    hidden state. Sampling also runs under ``torch.no_grad()``.

    Args:
        model: Trained CharRNN (or any module with the same interface).
        char2idx: Character -> index map; unknown characters fall back to index 0.
        idx2char: Index -> character map.
        seed: Priming text; must be non-empty.
        length: Number of characters to generate.
        temperature: Softmax temperature (<1 sharper, >1 more random).
        device: Device the model lives on.

    Returns:
        seed plus the generated continuation, as a single string.
    """
    model.eval()
    chars = list(seed)
    hidden = model.init_hidden(1).to(device)
    with torch.no_grad():
        # Prime the hidden state on every seed character except the last;
        # the last one is consumed by the first sampling step below.
        for ch in chars[:-1]:
            primer = torch.tensor([[char2idx.get(ch, 0)]], dtype=torch.long).to(device)
            _, hidden = model(primer, hidden)
        for _ in range(length):
            input_idx = torch.tensor([[char2idx.get(chars[-1], 0)]], dtype=torch.long).to(device)
            output, hidden = model(input_idx, hidden)
            # Temperature-scaled logits of the final time step.
            output = output[0, -1] / temperature
            probs = torch.softmax(output, dim=0).cpu().numpy()
            next_idx = random.choices(range(len(probs)), weights=probs)[0]
            chars.append(idx2char[next_idx])
    return ''.join(chars)
77+
78+
if __name__ == "__main__":
79+
corpus = (
80+
"The quick brown fox jumps over the lazy dog. "
81+
"Machine learning is fun. "
82+
"PyTorch makes building neural networks easy. "
83+
"Text generation with RNNs is simple."
84+
) * 5
85+
model, char2idx, idx2char, device = train(corpus, seq_length=40, epochs=50)
86+
print("\nGenerated text:\n")
87+
print(generate(model, char2idx, idx2char, seed="The quick", length=200, temperature=0.8, device=device))

0 commit comments

Comments
 (0)