From 2234a71864214c44e039eff7763bf71f4f977de7 Mon Sep 17 00:00:00 2001
From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com>
Date: Thu, 15 May 2025 19:52:18 +0200
Subject: [PATCH 1/6] Fixing Makefile, types, and docstrings, and updating
 tests for the updated architecture.

---
 Makefile                              |  6 +++---
 app/voice_to_text_app.py              |  6 ++++--
 src/modules/data_preprocess_nltk.py   |  3 ++-
 src/modules/data_processor.py         |  1 -
 src/modules/model_bert_other.py       |  1 -
 src/modules/speech_to_text.py         | 12 ++++++------
 src/modules/transformer_components.py |  5 ++++-
 tests/test_data_processor.py          |  1 -
 tests/test_transformer_model.py       |  4 +---
 9 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/Makefile b/Makefile
index 336b82f..5346de4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
 ruff:
-	ruff check ./modules sentiment_analysis.py voice_to_text_app.py
-	ruff check --fix ./modules sentiment_analysis.py voice_to_text_app.py
-	ruff format ./modules sentiment_analysis.py voice_to_text_app.py
\ No newline at end of file
+	ruff check ./src ./app ./tests
+	ruff check --fix ./src ./app ./tests
+	ruff format ./src ./app ./tests
\ No newline at end of file
diff --git a/app/voice_to_text_app.py b/app/voice_to_text_app.py
index db44ae4..866af92 100644
--- a/app/voice_to_text_app.py
+++ b/app/voice_to_text_app.py
@@ -7,7 +7,7 @@
 from translation_french_english import test_translation, transformer_model
 from modules.data_processor import DatasetProcessor, TextPreprocessor
 from modules.utils import ModelPaths
-import tensorflow as tf
+from typing import Tuple, Any

 # Configure logging
 logging.basicConfig(
@@ -108,7 +108,9 @@
     [Input("start-record-button", "n_clicks"), Input("stop-record-button", "n_clicks")],
     [State("recording-state", "data")],
 )
-def update_output(start_n_clicks, stop_n_clicks, recording_state):
+def update_output(
+    start_n_clicks: int, stop_n_clicks: int, recording_state: bool
+) -> Tuple[Any, Any, Any, Any, bool]:
     """
     Update the app's output based on user interactions.

diff --git a/src/modules/data_preprocess_nltk.py b/src/modules/data_preprocess_nltk.py
index 36b6809..f322826 100644
--- a/src/modules/data_preprocess_nltk.py
+++ b/src/modules/data_preprocess_nltk.py
@@ -1,5 +1,6 @@
 from collections import Counter
-import nltk
+
+# import nltk
 import tensorflow as tf
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
diff --git a/src/modules/data_processor.py b/src/modules/data_processor.py
index 03b8b22..7ed99bc 100644
--- a/src/modules/data_processor.py
+++ b/src/modules/data_processor.py
@@ -1,6 +1,5 @@
 import polars as pl
 import tensorflow as tf
-import random
 import string
 import re

diff --git a/src/modules/model_bert_other.py b/src/modules/model_bert_other.py
index 39fd55b..516f426 100644
--- a/src/modules/model_bert_other.py
+++ b/src/modules/model_bert_other.py
@@ -2,7 +2,6 @@
 from transformers import TFBertModel, BertTokenizer
 import optuna
 import json
-import numpy as np


 class SentimentModelBert:
diff --git a/src/modules/speech_to_text.py b/src/modules/speech_to_text.py
index 3e9c617..39c0724 100644
--- a/src/modules/speech_to_text.py
+++ b/src/modules/speech_to_text.py
@@ -21,7 +21,7 @@ class SpeechToText:
         recording (bool): Flag to indicate if recording is active.
     """

-    def __init__(self, model_path):
+    def __init__(self, model_path: str):
        """
        Initialize the SpeechToText class.
@@ -48,7 +48,7 @@ def __init__(self, model_path):
         self.recognized_text = []
         self.recording = False

-    def start_recording(self):
+    def start_recording(self) -> None:
         """
         Start recording audio and process it for speech-to-text conversion.
         """
@@ -57,14 +57,14 @@ def start_recording(self):
         threading.Thread(target=self.record_audio).start()
         logging.info("Recording started.")

-    def stop_recording(self):
+    def stop_recording(self) -> None:
         """
         Stop recording audio.
         """
         self.recording = False
         logging.info("Recording stopped.")

-    def record_audio(self):
+    def record_audio(self) -> None:
         """
         Record audio from the microphone and convert it to text using Vosk.
         """
@@ -89,7 +89,7 @@ def record_audio(self):
                 text_file.write("\n".join(self.recognized_text))
         logging.info("Text written to recognized_text.txt")

-    def get_recognized_text(self):
+    def get_recognized_text(self) -> str:
         """
         Get the full recognized text.

@@ -98,7 +98,7 @@ def get_recognized_text(self):
         """
         return " ".join(self.recognized_text)

-    def predict_sentiment(self, text):
+    def predict_sentiment(self, text: str) -> str:
         """
         Predict the sentiment of the given text using a pre-trained model.

diff --git a/src/modules/transformer_components.py b/src/modules/transformer_components.py
index 81a4d30..2ea1867 100644
--- a/src/modules/transformer_components.py
+++ b/src/modules/transformer_components.py
@@ -1,6 +1,7 @@
 import tensorflow as tf
 import logging
 from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
+from typing import Any


 @tf.keras.utils.register_keras_serializable(package="Custom")
@@ -163,7 +164,9 @@ def get_config(self):
         return config


-def evaluate_bleu(model, dataset, preprocessor):
+def evaluate_bleu(
+    model: tf.keras.Model, dataset: tf.data.Dataset, preprocessor: Any
+) -> float:
     """
     Evaluate the BLEU score for the model on the given dataset.

diff --git a/tests/test_data_processor.py b/tests/test_data_processor.py
index 2038c51..6125ab9 100644
--- a/tests/test_data_processor.py
+++ b/tests/test_data_processor.py
@@ -1,6 +1,5 @@
 import pytest
 import polars as pl
-import tensorflow as tf
 from modules.data_processor import DatasetProcessor, TextPreprocessor


 @pytest.fixture
diff --git a/tests/test_transformer_model.py b/tests/test_transformer_model.py
index 363f42d..d173e13 100644
--- a/tests/test_transformer_model.py
+++ b/tests/test_transformer_model.py
@@ -1,6 +1,6 @@
 import pytest
 import tensorflow as tf
-from modules.data_processor import DatasetProcessor, TextPreprocessor
+from modules.data_processor import TextPreprocessor
 from modules.transformer_components import (
     PositionalEmbedding,
     TransformerEncoder,
@@ -8,8 +8,6 @@
     evaluate_bleu,
 )
 from translation_french_english import transformer_model
-from modules.utils import ModelPaths
-import os


 @pytest.fixture

From 8ffe5ea8771629d83c1921d481de1e363c88038b Mon Sep 17 00:00:00 2001
From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com>
Date: Thu, 15 May 2025 20:01:56 +0200
Subject: [PATCH 2/6] Matching docstrings and types.
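
This patch aligns the annotations in optuna_transformer.py,
translation_french_english.py, and the transformer tests with what their
docstrings already claim. As a hedged illustration of how the newly annotated
objective() is consumed, a minimal sketch follows; the import path mirrors the
`modules.` imports used in the tests, and the n_trials value is an arbitrary
illustrative choice, not part of this series:

    # Illustrative driver for the annotated objective(); not part of the patch.
    import optuna

    from modules.optuna_transformer import objective  # (optuna.trial.Trial) -> float

    study = optuna.create_study(direction="maximize")  # objective returns a BLEU score
    study.optimize(objective, n_trials=20)
    print("Best BLEU:", study.best_value, "params:", study.best_params)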
---
 src/modules/optuna_transformer.py |   8 ++-
 src/translation_french_english.py |   8 ++-
 tests/test_transformer_model.py   | 105 +++++++++++++++++++++++-------
 3 files changed, 92 insertions(+), 29 deletions(-)

diff --git a/src/modules/optuna_transformer.py b/src/modules/optuna_transformer.py
index 8a68f61..97ae668 100644
--- a/src/modules/optuna_transformer.py
+++ b/src/modules/optuna_transformer.py
@@ -18,7 +18,9 @@
 )


-def build_transformer_model(trial, preprocessor):
+def build_transformer_model(
+    trial: optuna.trial.Trial, preprocessor: TextPreprocessor
+) -> tf.keras.Model:
     """
     Build a Transformer model with hyperparameters suggested by Optuna.

@@ -71,7 +73,7 @@ def build_transformer_model(trial, preprocessor):
     return transformer


-def objective(trial):
+def objective(trial: optuna.trial.Trial) -> float:
     """
     Objective function for Optuna to optimize the Transformer model using BLEU score.

@@ -131,7 +133,7 @@ def objective(trial):
     return bleu_score


-def main():
+def main() -> None:
     """
     Main function to run the Optuna optimization.
     """
diff --git a/src/translation_french_english.py b/src/translation_french_english.py
index 9c65b3b..1441bd5 100644
--- a/src/translation_french_english.py
+++ b/src/translation_french_english.py
@@ -21,7 +21,7 @@


 def transformer_model(
-    transformer_model_path,
+    transformer_model_path: str,
     preprocessor: TextPreprocessor,
     train_ds: tf.data.Dataset,
     val_ds: tf.data.Dataset,
@@ -134,7 +134,11 @@ def transformer_model(
     return transformer


-def test_translation(transformer, preprocessor, input_sentence="Hello") -> str:
+def test_translation(
+    transformer: tf.keras.Model,
+    preprocessor: TextPreprocessor,
+    input_sentence: str = "Hello",
+) -> str:
     """
     Test the Transformer model by translating an input sentence.

diff --git a/tests/test_transformer_model.py b/tests/test_transformer_model.py
index d173e13..9ccb5e2 100644
--- a/tests/test_transformer_model.py
+++ b/tests/test_transformer_model.py
@@ -1,5 +1,7 @@
 import pytest
 import tensorflow as tf
+import pandas as pd
+from typing import Tuple
 from modules.data_processor import TextPreprocessor
 from modules.transformer_components import (
     PositionalEmbedding,
@@ -11,45 +13,64 @@


 @pytest.fixture
-def setup_data():
+def setup_data() -> (
+    Tuple[TextPreprocessor, tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]
+):
     """
     Fixture to set up a mocked dataset and preprocessor for testing.
+
+    Returns:
+        Tuple containing:
+        - TextPreprocessor: The initialized text preprocessor
+        - tf.data.Dataset: Training dataset
+        - tf.data.Dataset: Validation dataset
+        - tf.data.Dataset: Test dataset
     """
-    import pandas as pd

     # Create a small mock dataset
-    mock_data = {
+    mock_data: dict[str, list[str]] = {
         "en": ["hello", "how are you", "good morning", "thank you", "goodbye"],
         "fr": ["bonjour", "comment ça va", "bon matin", "merci", "au revoir"],
     }
     mock_df = pd.DataFrame(mock_data)

     # Split the mock dataset
-    train_df = mock_df.sample(frac=0.6, random_state=42)
-    val_df = mock_df.drop(train_df.index).sample(frac=0.5, random_state=42)
-    test_df = mock_df.drop(train_df.index).drop(val_df.index)
+    train_df: pd.DataFrame = mock_df.sample(frac=0.6, random_state=42)
+    val_df: pd.DataFrame = mock_df.drop(train_df.index).sample(
+        frac=0.5, random_state=42
+    )
+    test_df: pd.DataFrame = mock_df.drop(train_df.index).drop(val_df.index)

     # Initialize the preprocessor
-    preprocessor = TextPreprocessor()
+    preprocessor: TextPreprocessor = TextPreprocessor()
     preprocessor.adapt(train_df)

     # Create TensorFlow datasets
-    train_ds = preprocessor.make_dataset(train_df)
-    val_ds = preprocessor.make_dataset(val_df)
-    test_ds = preprocessor.make_dataset(test_df)
+    train_ds: tf.data.Dataset = preprocessor.make_dataset(train_df)
+    val_ds: tf.data.Dataset = preprocessor.make_dataset(val_df)
+    test_ds: tf.data.Dataset = preprocessor.make_dataset(test_df)

     return preprocessor, train_ds, val_ds, test_ds


-def test_transformer_model_build(setup_data):
+def test_transformer_model_build(
+    setup_data: Tuple[
+        TextPreprocessor, tf.data.Dataset, tf.data.Dataset, tf.data.Dataset
+    ],
+) -> None:
     """
     Test if the Transformer model is built correctly.
+
+    Args:
+        setup_data: Fixture providing preprocessor and datasets
     """
     preprocessor, train_ds, val_ds, _ = setup_data
     transformer_model_path = "src/models/test_transformer_model.keras"

     # Build the model
-    model = transformer_model(transformer_model_path, preprocessor, train_ds, val_ds)
+    model: tf.keras.Model = transformer_model(
+        transformer_model_path, preprocessor, train_ds, val_ds
+    )

     # Check if the model is compiled
     assert model.optimizer is not None, "Model is not compiled."
@@ -57,18 +78,27 @@
     assert model.metrics is not None, "Metrics are not defined."


-def test_transformer_model_training(setup_data):
+def test_transformer_model_training(
+    setup_data: Tuple[
+        TextPreprocessor, tf.data.Dataset, tf.data.Dataset, tf.data.Dataset
+    ],
+) -> None:
     """
     Test if the Transformer model can be trained without errors.
+
+    Args:
+        setup_data: Fixture providing preprocessor and datasets
     """
     preprocessor, train_ds, val_ds, _ = setup_data
     transformer_model_path = "src/models/test_transformer_model.keras"

     # Build the model
-    model = transformer_model(transformer_model_path, preprocessor, train_ds, val_ds)
+    model: tf.keras.Model = transformer_model(
+        transformer_model_path, preprocessor, train_ds, val_ds
+    )

     # Train the model for 1 epoch
-    history = model.fit(
+    history: tf.keras.callbacks.History = model.fit(
         train_ds,
         validation_data=val_ds,
         epochs=1,
@@ -80,18 +110,27 @@
     assert "val_loss" in history.history, "Validation loss is not recorded."

-def test_transformer_model_evaluation(setup_data):
+def test_transformer_model_evaluation(
+    setup_data: Tuple[
+        TextPreprocessor, tf.data.Dataset, tf.data.Dataset, tf.data.Dataset
+    ],
+) -> None:
     """
     Test if the Transformer model can be evaluated without errors.
+
+    Args:
+        setup_data: Fixture providing preprocessor and datasets
     """
     preprocessor, train_ds, val_ds, test_ds = setup_data
     transformer_model_path = "src/models/test_transformer_model.keras"

     # Build the model
-    model = transformer_model(transformer_model_path, preprocessor, train_ds, val_ds)
+    model: tf.keras.Model = transformer_model(
+        transformer_model_path, preprocessor, train_ds, val_ds
+    )

     # Evaluate the model
-    results = model.evaluate(test_ds, verbose=0)
+    results: list[float] = model.evaluate(test_ds, verbose=0)

     # Check if evaluation results are returned
     assert len(results) == 2, "Evaluation did not return loss and accuracy."
@@ -99,36 +138,54 @@
     assert 0 <= results[1] <= 1, "Test accuracy is invalid."


-def test_transformer_model_bleu_score(setup_data):
+def test_transformer_model_bleu_score(
+    setup_data: Tuple[
+        TextPreprocessor, tf.data.Dataset, tf.data.Dataset, tf.data.Dataset
+    ],
+) -> None:
     """
     Test if the BLEU score can be calculated for the Transformer model.
+
+    Args:
+        setup_data: Fixture providing preprocessor and datasets
     """
     preprocessor, train_ds, val_ds, test_ds = setup_data
     transformer_model_path = "src/models/test_transformer_model.keras"

     # Build the model
-    model = transformer_model(transformer_model_path, preprocessor, train_ds, val_ds)
+    model: tf.keras.Model = transformer_model(
+        transformer_model_path, preprocessor, train_ds, val_ds
+    )

     # Calculate BLEU score
-    bleu_score = evaluate_bleu(model, test_ds, preprocessor)
+    bleu_score: float = evaluate_bleu(model, test_ds, preprocessor)

     # Check if BLEU score is valid
     assert 0 <= bleu_score <= 1, "BLEU score is invalid."


-def test_transformer_model_loading(setup_data):
+def test_transformer_model_loading(
+    setup_data: Tuple[
+        TextPreprocessor, tf.data.Dataset, tf.data.Dataset, tf.data.Dataset
+    ],
+) -> None:
     """
     Test if the Transformer model can be loaded from a saved file.
+
+    Args:
+        setup_data: Fixture providing preprocessor and datasets
     """
     preprocessor, train_ds, val_ds, _ = setup_data
     transformer_model_path = "src/models/test_transformer_model.keras"

     # Build and save the model
-    model = transformer_model(transformer_model_path, preprocessor, train_ds, val_ds)
+    model: tf.keras.Model = transformer_model(
+        transformer_model_path, preprocessor, train_ds, val_ds
+    )
     model.save(transformer_model_path)

     # Load the model
-    loaded_model = tf.keras.models.load_model(
+    loaded_model: tf.keras.Model = tf.keras.models.load_model(
         transformer_model_path,
         custom_objects={
             "PositionalEmbedding": PositionalEmbedding,

From 18204051c8b946f349e4103f9943bf13b90617eb Mon Sep 17 00:00:00 2001
From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com>
Date: Thu, 15 May 2025 20:03:34 +0200
Subject: [PATCH 3/6] Removing unnecessary code.
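
This drops the unused LSTM-based SentimentModel class (and its Optuna helper)
from model_bert_other.py; only SentimentModelBert remains. A quick sanity
check, illustrative rather than part of the patch (the module path follows
the `modules.` imports used in the tests):

    # Verify the removal: the BERT class stays, the LSTM class is gone.
    import importlib

    mod = importlib.import_module("modules.model_bert_other")
    assert hasattr(mod, "SentimentModelBert")    # retained
    assert not hasattr(mod, "SentimentModel")    # removed by this patch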
---
 src/modules/model_bert_other.py | 255 --------------------------------
 1 file changed, 255 deletions(-)

diff --git a/src/modules/model_bert_other.py b/src/modules/model_bert_other.py
index 516f426..45141e1 100644
--- a/src/modules/model_bert_other.py
+++ b/src/modules/model_bert_other.py
@@ -168,258 +168,3 @@ def train_and_evaluate(self, model, train_data, valid_data, test_data):
         model.fit(train_data, validation_data=valid_data, epochs=self.epochs)
         test_results = model.evaluate(test_data)
         print("Test Acc.: {:.2f}%".format(test_results[1] * 100))
-
-
-class SentimentModel:
-    """
-    A class to define, train, and evaluate a sentiment analysis model using LSTM layers.
-
-    Attributes:
-        embedding_dim (int): Dimension of the embedding layer.
-        lstm_units (int): Number of units in the LSTM layers.
-        dropout_rate (float): Dropout rate for regularization.
-        learning_rate (float): Learning rate for the optimizer.
-        epochs (int): Number of training epochs.
-    """
-
-    def __init__(
-        self,
-        embedding_dim=50,
-        lstm_units=128,
-        dropout_rate=0.5,
-        learning_rate=0.0008659430202504234,
-        epochs=10,
-    ):
-        """
-        Initialize the SentimentModel class with hyperparameters.
-
-        Args:
-            embedding_dim (int): Dimension of the embedding layer.
-            lstm_units (int): Number of units in the LSTM layers.
-            dropout_rate (float): Dropout rate for regularization.
-            learning_rate (float): Learning rate for the optimizer.
-            epochs (int): Number of training epochs.
-        """
-        self.embedding_dim = embedding_dim
-        self.lstm_units = lstm_units
-        self.dropout_rate = dropout_rate
-        self.learning_rate = learning_rate
-        self.epochs = epochs
-
-    def build_model(self, vocab_size, num_classes):
-        """
-        Build and compile the LSTM-based sentiment analysis model.
-
-        Args:
-            vocab_size (int): Size of the vocabulary.
-            num_classes (int): Number of output classes.
-
-        Returns:
-            tf.keras.Model: The compiled LSTM model.
-        """
-        model = tf.keras.Sequential()
-        model.add(tf.keras.Input(shape=(None,), dtype="int32"))
-        model.add(
-            tf.keras.layers.Embedding(
-                input_dim=vocab_size, output_dim=self.embedding_dim, name="embed-layer"
-            )
-        )
-        model.add(
-            tf.keras.layers.Bidirectional(
-                tf.keras.layers.LSTM(80, return_sequences=True, name="lstm-layer"),
-                name="bidir-lstm1",
-            )
-        )
-        model.add(
-            tf.keras.layers.Bidirectional(
-                tf.keras.layers.LSTM(121, return_sequences=False, name="lstm-layer"),
-                name="bidir-lstm2",
-            )
-        )
-        model.add(tf.keras.layers.Dropout(self.dropout_rate))
-        model.add(tf.keras.layers.Dense(67, activation="gelu"))
-        model.add(tf.keras.layers.Dense(75, activation="gelu"))
-        model.add(tf.keras.layers.Dropout(self.dropout_rate))
-        model.add(tf.keras.layers.Dense(32, activation="gelu"))
-        model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
-        model.compile(
-            optimizer=tf.keras.optimizers.legacy.RMSprop(
-                learning_rate=self.learning_rate
-            ),
-            loss=tf.keras.losses.BinaryCrossentropy(),
-            metrics=["accuracy"],
-        )
-        return model
-
-    def train_and_evaluate(self, model, train_data, valid_data, test_data):
-        """
-        Train and evaluate the LSTM model.
-
-        Args:
-            model (tf.keras.Model): The LSTM model to train.
-            train_data (tf.data.Dataset): The training dataset.
-            valid_data (tf.data.Dataset): The validation dataset.
-            test_data (tf.data.Dataset): The test dataset.
- """ - model.summary() - early_stopping_callback = tf.keras.callbacks.EarlyStopping( - monitor="val_loss", - patience=2, - mode="min", - verbose=0, - restore_best_weights=True, - ) - with tf.device("/device:GPU:0"): - model.fit( - train_data, - validation_data=valid_data, - epochs=self.epochs, - callbacks=[early_stopping_callback], - ) - model.save("./models/sentiment_binary.keras") - test_results = model.evaluate(test_data) - print("Test Acc.: {:.2f}%".format(test_results[1] * 100)) - - def evaluate(self, test_data): - """ - Evaluate the saved LSTM model on the test dataset. - - Args: - test_data (tf.data.Dataset): The test dataset. - """ - model = tf.keras.models.load_model("./models/sentiment_binary.keras") - test_results = model.evaluate(test_data) - print("Test Acc.: {:.2f}%".format(test_results[1] * 100)) - - def evaluate_text(self, test_data): - """ - Evaluate the saved LSTM model and return the accuracy. - - Args: - test_data (tf.data.Dataset): The test dataset. - - Returns: - float: The accuracy of the model on the test dataset. - """ - model = tf.keras.models.load_model("./models/sentiment_binary.keras") - test_results = model.evaluate(test_data) - return test_results[1] - - def predict_text(self, predict_data): - """ - Predict sentiment for the given data using the saved LSTM model. - - Args: - predict_data (tf.data.Dataset): The dataset for prediction. - - Returns: - tuple: Predicted classes and probabilities. - """ - model = tf.keras.models.load_model("./models/sentiment_binary.keras") - predictions = model.predict(predict_data) - y_classes = predictions.argmax(axis=-1) - return y_classes, predictions - - def Optuna(self, vocab_size, num_classes, train_data, valid_data, test_data): - """ - Perform hyperparameter optimization using Optuna. - - Args: - vocab_size (int): Size of the vocabulary. - num_classes (int): Number of output classes. - train_data (tf.data.Dataset): The training dataset. - valid_data (tf.data.Dataset): The validation dataset. - test_data (tf.data.Dataset): The test dataset. - """ - - def _objective(trial): - """ - Objective function for Optuna to optimize the model's hyperparameters. - - Args: - trial (optuna.trial.Trial): An Optuna trial object. - - Returns: - float: Validation accuracy of the model. 
- """ - tf.keras.backend.clear_session() - model = tf.keras.Sequential() - model.add(tf.keras.Input(shape=(None,), dtype="int32")) - model.add( - tf.keras.layers.Embedding( - input_dim=vocab_size, - output_dim=self.embedding_dim, - ) - ) - n_layers_bidirectional = trial.suggest_int("n_units_bidirectional", 1, 3) - for i in range(n_layers_bidirectional): - num_hidden_bidirectional = trial.suggest_int( - "n_units_bidirectional_l{}".format(i), 64, 128, log=True - ) - model.add( - tf.keras.layers.Bidirectional( - tf.keras.layers.LSTM( - num_hidden_bidirectional, - return_sequences=True, - ), - ) - ) - num_hidden_lstm = trial.suggest_int( - "n_units_lstm_l{}".format(i), 64, 128, log=True - ) - model.add( - tf.keras.layers.Bidirectional( - tf.keras.layers.LSTM( - num_hidden_lstm, - return_sequences=False, - ), - ) - ) - - model.add(tf.keras.layers.Dropout(self.dropout_rate)) - n_layers_nn = trial.suggest_int("n_layers_nn", 1, 2) - for i in range(n_layers_nn): - num_hidden_nn = trial.suggest_int( - "n_units_nn_l{}".format(i), 64, 128, log=True - ) - model.add(tf.keras.layers.Dense(num_hidden_nn, activation="gelu")) - - model.add(tf.keras.layers.Dropout(self.dropout_rate)) - model.add(tf.keras.layers.Dense(32, activation="gelu")) - model.add(tf.keras.layers.Dense(1, activation="sigmoid")) - - learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True) - model.compile( - optimizer=tf.keras.optimizers.legacy.RMSprop( - learning_rate=learning_rate - ), - loss=tf.keras.losses.BinaryCrossentropy(), - metrics=["accuracy"], - ) - - early_stopping_callback = tf.keras.callbacks.EarlyStopping( - monitor="val_loss", - patience=2, - mode="min", - verbose=0, - restore_best_weights=True, - ) - with tf.device("/device:GPU:0"): - model.fit( - train_data, - validation_data=valid_data, - epochs=int(self.epochs / 2), - callbacks=[early_stopping_callback], - verbose=1, - ) - # Evaluate the model accuracy on the validation set. - score = model.evaluate(test_data, verbose=1) - return score[1] - - study = optuna.create_study(direction="maximize") - study.optimize( - _objective, - n_trials=5, - ) - with open("./models/optuna_model_binary.json", "w") as outfile: - json.dump(study.best_params, outfile) From a9e45960b72c11e59d8567979029fffb75aafd5b Mon Sep 17 00:00:00 2001 From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com> Date: Thu, 15 May 2025 20:13:13 +0200 Subject: [PATCH 4/6] Making sure the type are matching the docstrings. --- src/modules/data_processor.py | 9 ++++--- src/modules/model_bert_other.py | 2 -- src/sentiment_analysis.py | 2 +- tests/test_data_processor.py | 46 ++++++++++++++++++++++----------- 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/modules/data_processor.py b/src/modules/data_processor.py index 7ed99bc..63dcd1a 100644 --- a/src/modules/data_processor.py +++ b/src/modules/data_processor.py @@ -2,6 +2,7 @@ import tensorflow as tf import string import re +from typing import Tuple, Dict class DatasetProcessor: @@ -61,7 +62,7 @@ def process_data(self) -> None: def shuffle_and_split( self, val_split: float = 0.15, - ) -> dict: + ) -> Dict[str, pl.DataFrame]: """ Shuffle and split the dataset into training, validation, and test sets. @@ -130,7 +131,7 @@ def custom_standardization(input_string): standardize=custom_standardization, ) - def adapt(self, train_df) -> None: + def adapt(self, train_df: pl.DataFrame) -> None: """ Adapt the vectorization layers to the training data. 
@@ -140,7 +141,9 @@ def adapt(self, train_df) -> None:
         self.source_vectorization.adapt(train_df["en"].to_list())
         self.target_vectorization.adapt(train_df["fr"].to_list())

-    def format_dataset(self, eng, fr) -> tuple:
+    def format_dataset(
+        self, eng: tf.Tensor, fr: tf.Tensor
+    ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
         """
         Format the dataset for training.

diff --git a/src/modules/model_bert_other.py b/src/modules/model_bert_other.py
index 45141e1..f72c9b3 100644
--- a/src/modules/model_bert_other.py
+++ b/src/modules/model_bert_other.py
@@ -1,7 +1,5 @@
 import tensorflow as tf
 from transformers import TFBertModel, BertTokenizer
-import optuna
-import json


 class SentimentModelBert:
diff --git a/src/sentiment_analysis.py b/src/sentiment_analysis.py
index b86d1d3..16695de 100644
--- a/src/sentiment_analysis.py
+++ b/src/sentiment_analysis.py
@@ -21,7 +21,7 @@
 )


-def main():
+def main() -> None:
     """
     Main function to execute the sentiment analysis pipeline.

diff --git a/tests/test_data_processor.py b/tests/test_data_processor.py
index 6125ab9..fbdc997 100644
--- a/tests/test_data_processor.py
+++ b/tests/test_data_processor.py
@@ -1,28 +1,39 @@
 import pytest
 import polars as pl
+from typing import Dict, Any
 from modules.data_processor import DatasetProcessor, TextPreprocessor


 @pytest.fixture
-def sample_data():
-    """Fixture to create a sample dataset."""
-    data = {
+def sample_data() -> pl.DataFrame:
+    """
+    Fixture to create a sample dataset.
+
+    Returns:
+        pl.DataFrame: A sample dataset with English and French text.
+    """
+    data: Dict[str, list[str]] = {
         "en": ["Hello|Hi", "Goodbye|Bye"],
         "fr": ["Bonjour|Salut", "Au revoir|Adieu"],
     }
     return pl.DataFrame(data)


-def test_dataset_processor(sample_data):
-    """Test the DatasetProcessor class."""
+def test_dataset_processor(sample_data: pl.DataFrame) -> None:
+    """
+    Test the DatasetProcessor class.
+
+    Args:
+        sample_data (pl.DataFrame): A sample dataset fixture.
+    """
     # Save sample data to a temporary Parquet file
     sample_data.write_parquet("test_data.parquet")

     # Initialize and process the data
-    processor = DatasetProcessor("test_data.parquet")
+    processor: DatasetProcessor = DatasetProcessor("test_data.parquet")
     processor.load_data()
     processor.process_data()
-    data_splits = processor.shuffle_and_split()
+    data_splits: Dict[str, pl.DataFrame] = processor.shuffle_and_split()

     # Check if the data is processed correctly
     assert len(processor.split_df) > 0, "Processed dataset is empty!"
     assert processor.split_df["fr"][0].startswith("[start]"), "Start token missing!"

     # Check if the splits are correct
-    total_rows = len(processor.split_df)
+    total_rows: int = len(processor.split_df)
     assert (
         len(data_splits["train"])
         + len(data_splits["validation"])
@@ -49,24 +60,29 @@
     os.remove("test_data.parquet")


-def test_text_preprocessor(sample_data):
-    """Test the TextPreprocessor class."""
+def test_text_preprocessor(sample_data: pl.DataFrame) -> None:
+    """
+    Test the TextPreprocessor class.
+
+    Args:
+        sample_data (pl.DataFrame): A sample dataset fixture.
+ """ # Save sample data to a temporary Parquet file sample_data.write_parquet("test_data.parquet") # Initialize and process the data - processor = DatasetProcessor("test_data.parquet") + processor: DatasetProcessor = DatasetProcessor("test_data.parquet") processor.load_data() processor.process_data() - data_splits = processor.shuffle_and_split() - train_df = data_splits["train"] + data_splits: Dict[str, pl.DataFrame] = processor.shuffle_and_split() + train_df: pl.DataFrame = data_splits["train"] # Initialize the TextPreprocessor - preprocessor = TextPreprocessor() + preprocessor: TextPreprocessor = TextPreprocessor() preprocessor.adapt(train_df) # Check if vectorization works - train_ds = preprocessor.make_dataset(train_df) + train_ds: Any = preprocessor.make_dataset(train_df) for inputs, targets in train_ds.take(1): assert inputs["english"].shape[0] > 0, "English input is empty!" assert inputs["french"].shape[0] > 0, "French input is empty!" From 6136c20a4e3656168abb9b55d3db219e9ecee68e Mon Sep 17 00:00:00 2001 From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com> Date: Thu, 15 May 2025 20:18:38 +0200 Subject: [PATCH 5/6] changing name of function to avoid confusion with pytest functions. --- src/translation_french_english.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/translation_french_english.py b/src/translation_french_english.py index 1441bd5..441c967 100644 --- a/src/translation_french_english.py +++ b/src/translation_french_english.py @@ -134,7 +134,7 @@ def transformer_model( return transformer -def test_translation( +def translation_test( transformer: tf.keras.Model, preprocessor: TextPreprocessor, input_sentence: str = "Hello", @@ -201,7 +201,7 @@ def main() -> None: ) # Test the translation - test_translation( + translation_test( transformer, preprocessor, input_sentence="How are you?", From 24ff1d481f51611f0714a3346d266e8faff5260e Mon Sep 17 00:00:00 2001 From: Jeremy Vachier <89128100+jvachier@users.noreply.github.com> Date: Thu, 15 May 2025 20:21:08 +0200 Subject: [PATCH 6/6] Addressing comment. --- tests/test_data_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_data_processor.py b/tests/test_data_processor.py index fbdc997..39f4a12 100644 --- a/tests/test_data_processor.py +++ b/tests/test_data_processor.py @@ -1,5 +1,6 @@ import pytest import polars as pl +import tensorflow as tf from typing import Dict, Any from modules.data_processor import DatasetProcessor, TextPreprocessor @@ -82,7 +83,7 @@ def test_text_preprocessor(sample_data: pl.DataFrame) -> None: preprocessor.adapt(train_df) # Check if vectorization works - train_ds: Any = preprocessor.make_dataset(train_df) + train_ds: tf.data.Dataset = preprocessor.make_dataset(train_df) for inputs, targets in train_ds.take(1): assert inputs["english"].shape[0] > 0, "English input is empty!" assert inputs["french"].shape[0] > 0, "French input is empty!"