jvachier · jvachier · May 15, 2025 · May 15, 2025 · May 15, 2025 · May 15, 2025
@@ -1,4 +1,4 @@
 ruff:
-	ruff check ./modules sentiment_analysis.py voice_to_text_app.py
-	ruff check --fix ./modules sentiment_analysis.py voice_to_text_app.py
-	ruff format ./modules sentiment_analysis.py voice_to_text_app.py
+	ruff check ./src ./app ./tests
+	ruff check --fix ./src ./app ./tests
+	ruff format ./src ./app ./tests
@@ -7,7 +7,7 @@
 from translation_french_english import test_translation, transformer_model
 from modules.data_processor import DatasetProcessor, TextPreprocessor
 from modules.utils import ModelPaths
-import tensorflow as tf
+from typing import Tuple, Any
 
 # Configure logging
 logging.basicConfig(
@@ -108,7 +108,9 @@
     [Input("start-record-button", "n_clicks"), Input("stop-record-button", "n_clicks")],
     [State("recording-state", "data")],
 )
-def update_output(start_n_clicks, stop_n_clicks, recording_state):
+def update_output(
+    start_n_clicks: int, stop_n_clicks: int, recording_state: bool
+) -> Tuple[Any, Any, Any, Any, bool]:
     """
     Update the app's output based on user interactions.
 

@@ -1,5 +1,6 @@
 from collections import Counter
-import nltk
+
+# import nltk
 import tensorflow as tf
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer

@@ -1,8 +1,8 @@
 import polars as pl
 import tensorflow as tf
-import random
 import string
 import re
+from typing import Tuple, Dict
 
 
 class DatasetProcessor:
@@ -62,7 +62,7 @@ def process_data(self) -> None:
     def shuffle_and_split(
         self,
         val_split: float = 0.15,
-    ) -> dict:
+    ) -> Dict[str, pl.DataFrame]:
         """
         Shuffle and split the dataset into training, validation, and test sets.
 
@@ -131,7 +131,7 @@ def custom_standardization(input_string):
             standardize=custom_standardization,
         )
 
-    def adapt(self, train_df) -> None:
+    def adapt(self, train_df: pl.DataFrame) -> None:
         """
         Adapt the vectorization layers to the training data.
 
@@ -141,7 +141,9 @@ def adapt(self, train_df) -> None:
         self.source_vectorization.adapt(train_df["en"].to_list())
         self.target_vectorization.adapt(train_df["fr"].to_list())
 
-    def format_dataset(self, eng, fr) -> tuple:
+    def format_dataset(
+        self, eng: tf.Tensor, fr: tf.Tensor
+    ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
         """
         Format the dataset for training.
 

@@ -1,8 +1,5 @@
 import tensorflow as tf
 from transformers import TFBertModel, BertTokenizer
-import optuna
-import json
-import numpy as np
 
 
 class SentimentModelBert:
@@ -169,258 +166,3 @@ def train_and_evaluate(self, model, train_data, valid_data, test_data):
             model.fit(train_data, validation_data=valid_data, epochs=self.epochs)
         test_results = model.evaluate(test_data)
         print("Test Acc.: {:.2f}%".format(test_results[1] * 100))
-
-
-class SentimentModel:
-    """
-    A class to define, train, and evaluate a sentiment analysis model using LSTM layers.
-
-    Attributes:
-        embedding_dim (int): Dimension of the embedding layer.
-        lstm_units (int): Number of units in the LSTM layers.
-        dropout_rate (float): Dropout rate for regularization.
-        learning_rate (float): Learning rate for the optimizer.
-        epochs (int): Number of training epochs.
-    """
-
-    def __init__(
-        self,
-        embedding_dim=50,
-        lstm_units=128,
-        dropout_rate=0.5,
-        learning_rate=0.0008659430202504234,
-        epochs=10,
-    ):
-        """
-        Initialize the SentimentModel class with hyperparameters.
-
-        Args:
-            embedding_dim (int): Dimension of the embedding layer.
-            lstm_units (int): Number of units in the LSTM layers.
-            dropout_rate (float): Dropout rate for regularization.
-            learning_rate (float): Learning rate for the optimizer.
-            epochs (int): Number of training epochs.
-        """
-        self.embedding_dim = embedding_dim
-        self.lstm_units = lstm_units
-        self.dropout_rate = dropout_rate
-        self.learning_rate = learning_rate
-        self.epochs = epochs
-
-    def build_model(self, vocab_size, num_classes):
-        """
-        Build and compile the LSTM-based sentiment analysis model.
-
-        Args:
-            vocab_size (int): Size of the vocabulary.
-            num_classes (int): Number of output classes.
-
-        Returns:
-            tf.keras.Model: The compiled LSTM model.
-        """
-        model = tf.keras.Sequential()
-        model.add(tf.keras.Input(shape=(None,), dtype="int32"))
-        model.add(
-            tf.keras.layers.Embedding(
-                input_dim=vocab_size, output_dim=self.embedding_dim, name="embed-layer"
-            )
-        )
-        model.add(
-            tf.keras.layers.Bidirectional(
-                tf.keras.layers.LSTM(80, return_sequences=True, name="lstm-layer"),
-                name="bidir-lstm1",
-            )
-        )
-        model.add(
-            tf.keras.layers.Bidirectional(
-                tf.keras.layers.LSTM(121, return_sequences=False, name="lstm-layer"),
-                name="bidir-lstm2",
-            )
-        )
-        model.add(tf.keras.layers.Dropout(self.dropout_rate))
-        model.add(tf.keras.layers.Dense(67, activation="gelu"))
-        model.add(tf.keras.layers.Dense(75, activation="gelu"))
-        model.add(tf.keras.layers.Dropout(self.dropout_rate))
-        model.add(tf.keras.layers.Dense(32, activation="gelu"))
-        model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
-        model.compile(
-            optimizer=tf.keras.optimizers.legacy.RMSprop(
-                learning_rate=self.learning_rate
-            ),
-            loss=tf.keras.losses.BinaryCrossentropy(),
-            metrics=["accuracy"],
-        )
-        return model
-
-    def train_and_evaluate(self, model, train_data, valid_data, test_data):
-        """
-        Train and evaluate the LSTM model.
-
-        Args:
-            model (tf.keras.Model): The LSTM model to train.
-            train_data (tf.data.Dataset): The training dataset.
-            valid_data (tf.data.Dataset): The validation dataset.
-            test_data (tf.data.Dataset): The test dataset.
-        """
-        model.summary()
-        early_stopping_callback = tf.keras.callbacks.EarlyStopping(
-            monitor="val_loss",
-            patience=2,
-            mode="min",
-            verbose=0,
-            restore_best_weights=True,
-        )
-        with tf.device("/device:GPU:0"):
-            model.fit(
-                train_data,
-                validation_data=valid_data,
-                epochs=self.epochs,
-                callbacks=[early_stopping_callback],
-            )
-            model.save("./models/sentiment_binary.keras")
-            test_results = model.evaluate(test_data)
-        print("Test Acc.: {:.2f}%".format(test_results[1] * 100))
-
-    def evaluate(self, test_data):
-        """
-        Evaluate the saved LSTM model on the test dataset.
-
-        Args:
-            test_data (tf.data.Dataset): The test dataset.
-        """
-        model = tf.keras.models.load_model("./models/sentiment_binary.keras")
-        test_results = model.evaluate(test_data)
-        print("Test Acc.: {:.2f}%".format(test_results[1] * 100))
-
-    def evaluate_text(self, test_data):
-        """
-        Evaluate the saved LSTM model and return the accuracy.
-
-        Args:
-            test_data (tf.data.Dataset): The test dataset.
-
-        Returns:
-            float: The accuracy of the model on the test dataset.
-        """
-        model = tf.keras.models.load_model("./models/sentiment_binary.keras")
-        test_results = model.evaluate(test_data)
-        return test_results[1]
-
-    def predict_text(self, predict_data):
-        """
-        Predict sentiment for the given data using the saved LSTM model.
-
-        Args:
-            predict_data (tf.data.Dataset): The dataset for prediction.
-
-        Returns:
-            tuple: Predicted classes and probabilities.
-        """
-        model = tf.keras.models.load_model("./models/sentiment_binary.keras")
-        predictions = model.predict(predict_data)
-        y_classes = predictions.argmax(axis=-1)
-        return y_classes, predictions
-
-    def Optuna(self, vocab_size, num_classes, train_data, valid_data, test_data):
-        """
-        Perform hyperparameter optimization using Optuna.
-
-        Args:
-            vocab_size (int): Size of the vocabulary.
-            num_classes (int): Number of output classes.
-            train_data (tf.data.Dataset): The training dataset.
-            valid_data (tf.data.Dataset): The validation dataset.
-            test_data (tf.data.Dataset): The test dataset.
-        """
-
-        def _objective(trial):
-            """
-            Objective function for Optuna to optimize the model's hyperparameters.
-
-            Args:
-                trial (optuna.trial.Trial): An Optuna trial object.
-
-            Returns:
-                float: Validation accuracy of the model.
-            """
-            tf.keras.backend.clear_session()
-            model = tf.keras.Sequential()
-            model.add(tf.keras.Input(shape=(None,), dtype="int32"))
-            model.add(
-                tf.keras.layers.Embedding(
-                    input_dim=vocab_size,
-                    output_dim=self.embedding_dim,
-                )
-            )
-            n_layers_bidirectional = trial.suggest_int("n_units_bidirectional", 1, 3)
-            for i in range(n_layers_bidirectional):
-                num_hidden_bidirectional = trial.suggest_int(
-                    "n_units_bidirectional_l{}".format(i), 64, 128, log=True
-                )
-                model.add(
-                    tf.keras.layers.Bidirectional(
-                        tf.keras.layers.LSTM(
-                            num_hidden_bidirectional,
-                            return_sequences=True,
-                        ),
-                    )
-                )
-            num_hidden_lstm = trial.suggest_int(
-                "n_units_lstm_l{}".format(i), 64, 128, log=True
-            )
-            model.add(
-                tf.keras.layers.Bidirectional(
-                    tf.keras.layers.LSTM(
-                        num_hidden_lstm,
-                        return_sequences=False,
-                    ),
-                )
-            )
-
-            model.add(tf.keras.layers.Dropout(self.dropout_rate))
-            n_layers_nn = trial.suggest_int("n_layers_nn", 1, 2)
-            for i in range(n_layers_nn):
-                num_hidden_nn = trial.suggest_int(
-                    "n_units_nn_l{}".format(i), 64, 128, log=True
-                )
-                model.add(tf.keras.layers.Dense(num_hidden_nn, activation="gelu"))
-
-            model.add(tf.keras.layers.Dropout(self.dropout_rate))
-            model.add(tf.keras.layers.Dense(32, activation="gelu"))
-            model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
-
-            learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
-            model.compile(
-                optimizer=tf.keras.optimizers.legacy.RMSprop(
-                    learning_rate=learning_rate
-                ),
-                loss=tf.keras.losses.BinaryCrossentropy(),
-                metrics=["accuracy"],
-            )
-
-            early_stopping_callback = tf.keras.callbacks.EarlyStopping(
-                monitor="val_loss",
-                patience=2,
-                mode="min",
-                verbose=0,
-                restore_best_weights=True,
-            )
-            with tf.device("/device:GPU:0"):
-                model.fit(
-                    train_data,
-                    validation_data=valid_data,
-                    epochs=int(self.epochs / 2),
-                    callbacks=[early_stopping_callback],
-                    verbose=1,
-                )
-            # Evaluate the model accuracy on the validation set.
-            score = model.evaluate(test_data, verbose=1)
-            return score[1]
-
-        study = optuna.create_study(direction="maximize")
-        study.optimize(
-            _objective,
-            n_trials=5,
-        )
-        with open("./models/optuna_model_binary.json", "w") as outfile:
-            json.dump(study.best_params, outfile)
@@ -18,7 +18,9 @@
 )
 
 
-def build_transformer_model(trial, preprocessor):
+def build_transformer_model(
+    trial: optuna.trial.Trial, preprocessor: TextPreprocessor
+) -> tf.keras.Model:
     """
     Build a Transformer model with hyperparameters suggested by Optuna.
 
@@ -71,7 +73,7 @@ def build_transformer_model(trial, preprocessor):
     return transformer
 
 
-def objective(trial):
+def objective(trial: optuna.trial.Trial) -> float:
     """
     Objective function for Optuna to optimize the Transformer model using BLEU score.
 
@@ -131,7 +133,7 @@ def objective(trial):
     return bleu_score
 
 
-def main():
+def main() -> None:
     """
     Main function to run the Optuna optimization.
     """