Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ruff:
ruff check ./modules sentiment_analysis.py voice_to_text_app.py
ruff check --fix ./modules sentiment_analysis.py voice_to_text_app.py
ruff format ./modules sentiment_analysis.py voice_to_text_app.py
ruff check ./src ./app ./tests
ruff check --fix ./src ./app ./tests
ruff format ./src ./app ./tests
6 changes: 4 additions & 2 deletions app/voice_to_text_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from translation_french_english import test_translation, transformer_model
from modules.data_processor import DatasetProcessor, TextPreprocessor
from modules.utils import ModelPaths
import tensorflow as tf
from typing import Tuple, Any

# Configure logging
logging.basicConfig(
Expand Down Expand Up @@ -108,7 +108,9 @@
[Input("start-record-button", "n_clicks"), Input("stop-record-button", "n_clicks")],
[State("recording-state", "data")],
)
def update_output(start_n_clicks, stop_n_clicks, recording_state):
def update_output(
start_n_clicks: int, stop_n_clicks: int, recording_state: bool
) -> Tuple[Any, Any, Any, Any, bool]:
"""
Update the app's output based on user interactions.

Expand Down
3 changes: 2 additions & 1 deletion src/modules/data_preprocess_nltk.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections import Counter
import nltk

# import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
Expand Down
10 changes: 6 additions & 4 deletions src/modules/data_processor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import polars as pl
import tensorflow as tf
import random
import string
import re
from typing import Tuple, Dict


class DatasetProcessor:
Expand Down Expand Up @@ -62,7 +62,7 @@ def process_data(self) -> None:
def shuffle_and_split(
self,
val_split: float = 0.15,
) -> dict:
) -> Dict[str, pl.DataFrame]:
"""
Shuffle and split the dataset into training, validation, and test sets.

Expand Down Expand Up @@ -131,7 +131,7 @@ def custom_standardization(input_string):
standardize=custom_standardization,
)

def adapt(self, train_df) -> None:
def adapt(self, train_df: pl.DataFrame) -> None:
"""
Adapt the vectorization layers to the training data.

Expand All @@ -141,7 +141,9 @@ def adapt(self, train_df) -> None:
self.source_vectorization.adapt(train_df["en"].to_list())
self.target_vectorization.adapt(train_df["fr"].to_list())

def format_dataset(self, eng, fr) -> tuple:
def format_dataset(
self, eng: tf.Tensor, fr: tf.Tensor
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""
Format the dataset for training.

Expand Down
258 changes: 0 additions & 258 deletions src/modules/model_bert_other.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
import optuna
import json
import numpy as np


class SentimentModelBert:
Expand Down Expand Up @@ -169,258 +166,3 @@ def train_and_evaluate(self, model, train_data, valid_data, test_data):
model.fit(train_data, validation_data=valid_data, epochs=self.epochs)
test_results = model.evaluate(test_data)
print("Test Acc.: {:.2f}%".format(test_results[1] * 100))


class SentimentModel:
    """
    Define, train, evaluate and tune a binary sentiment model built on
    bidirectional LSTM layers.

    The network ends in a single sigmoid unit, so predictions are
    probabilities of the positive class and are turned into class labels
    by thresholding at 0.5.

    Attributes:
        embedding_dim (int): Dimension of the embedding layer.
        lstm_units (int): Stored for callers; ``build_model`` currently uses
            fixed LSTM sizes (80 and 121) rather than this value.
        dropout_rate (float): Dropout rate for regularization.
        learning_rate (float): Learning rate for the optimizer.
        epochs (int): Number of training epochs.
    """

    # Single source of truth for the artefacts written/read by this class.
    MODEL_PATH = "./models/sentiment_binary.keras"
    OPTUNA_PARAMS_PATH = "./models/optuna_model_binary.json"

    def __init__(
        self,
        embedding_dim=50,
        lstm_units=128,
        dropout_rate=0.5,
        learning_rate=0.0008659430202504234,
        epochs=10,
    ):
        """
        Initialize the SentimentModel class with hyperparameters.

        Args:
            embedding_dim (int): Dimension of the embedding layer.
            lstm_units (int): Number of units in the LSTM layers.
            dropout_rate (float): Dropout rate for regularization.
            learning_rate (float): Learning rate for the optimizer.
            epochs (int): Number of training epochs.
        """
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.epochs = epochs

    @staticmethod
    def _early_stopping():
        """Return the early-stopping callback shared by training and tuning."""
        return tf.keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=2,
            mode="min",
            verbose=0,
            restore_best_weights=True,
        )

    @staticmethod
    def _classes_from_probabilities(predictions):
        """
        Convert sigmoid outputs of shape (N, 1) into integer labels of shape (N,).

        A strict ``> 0.5`` comparison is used, matching the threshold Keras
        applies for binary accuracy.
        """
        return (predictions[..., 0] > 0.5).astype("int64")

    def build_model(self, vocab_size, num_classes):
        """
        Build and compile the LSTM-based sentiment analysis model.

        Args:
            vocab_size (int): Size of the vocabulary.
            num_classes (int): Number of output classes. Not used: the model
                always ends in one sigmoid unit (binary classification).

        Returns:
            tf.keras.Model: The compiled LSTM model.
        """
        model = tf.keras.Sequential()
        model.add(tf.keras.Input(shape=(None,), dtype="int32"))
        model.add(
            tf.keras.layers.Embedding(
                input_dim=vocab_size, output_dim=self.embedding_dim, name="embed-layer"
            )
        )
        model.add(
            tf.keras.layers.Bidirectional(
                tf.keras.layers.LSTM(80, return_sequences=True, name="lstm-layer"),
                name="bidir-lstm1",
            )
        )
        model.add(
            tf.keras.layers.Bidirectional(
                tf.keras.layers.LSTM(121, return_sequences=False, name="lstm-layer"),
                name="bidir-lstm2",
            )
        )
        model.add(tf.keras.layers.Dropout(self.dropout_rate))
        model.add(tf.keras.layers.Dense(67, activation="gelu"))
        model.add(tf.keras.layers.Dense(75, activation="gelu"))
        model.add(tf.keras.layers.Dropout(self.dropout_rate))
        model.add(tf.keras.layers.Dense(32, activation="gelu"))
        model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
        model.compile(
            optimizer=tf.keras.optimizers.legacy.RMSprop(
                learning_rate=self.learning_rate
            ),
            loss=tf.keras.losses.BinaryCrossentropy(),
            metrics=["accuracy"],
        )
        return model

    def train_and_evaluate(self, model, train_data, valid_data, test_data):
        """
        Train the model, save it to ``MODEL_PATH`` and print the test accuracy.

        Args:
            model (tf.keras.Model): The LSTM model to train.
            train_data (tf.data.Dataset): The training dataset.
            valid_data (tf.data.Dataset): The validation dataset.
            test_data (tf.data.Dataset): The test dataset.
        """
        model.summary()
        with tf.device("/device:GPU:0"):
            model.fit(
                train_data,
                validation_data=valid_data,
                epochs=self.epochs,
                callbacks=[self._early_stopping()],
            )
        model.save(self.MODEL_PATH)
        test_results = model.evaluate(test_data)
        print(f"Test Acc.: {test_results[1] * 100:.2f}%")

    def evaluate(self, test_data):
        """
        Evaluate the saved LSTM model on the test dataset and print the accuracy.

        Args:
            test_data (tf.data.Dataset): The test dataset.
        """
        print(f"Test Acc.: {self.evaluate_text(test_data) * 100:.2f}%")

    def evaluate_text(self, test_data):
        """
        Evaluate the saved LSTM model and return the accuracy.

        Args:
            test_data (tf.data.Dataset): The test dataset.

        Returns:
            float: The accuracy of the model on the test dataset.
        """
        model = tf.keras.models.load_model(self.MODEL_PATH)
        test_results = model.evaluate(test_data)
        return test_results[1]

    def predict_text(self, predict_data):
        """
        Predict sentiment for the given data using the saved LSTM model.

        Args:
            predict_data (tf.data.Dataset): The dataset for prediction.

        Returns:
            tuple: ``(y_classes, predictions)`` where ``predictions`` are the
            raw sigmoid probabilities and ``y_classes`` the 0/1 labels
            obtained by thresholding them at 0.5.
        """
        model = tf.keras.models.load_model(self.MODEL_PATH)
        predictions = model.predict(predict_data)
        # argmax over a single sigmoid column is always 0; threshold instead.
        y_classes = self._classes_from_probabilities(predictions)
        return y_classes, predictions

    def Optuna(self, vocab_size, num_classes, train_data, valid_data, test_data):
        """
        Perform hyperparameter optimization using Optuna and write the best
        parameters to ``OPTUNA_PARAMS_PATH`` as JSON.

        Args:
            vocab_size (int): Size of the vocabulary.
            num_classes (int): Number of output classes (unused; the tuned
                model is binary).
            train_data (tf.data.Dataset): The training dataset.
            valid_data (tf.data.Dataset): The validation dataset, used both
                for early stopping and for scoring each trial.
            test_data (tf.data.Dataset): Accepted for interface compatibility.
                It is deliberately not used for scoring so the test set stays
                unseen during tuning.
        """
        # At least one epoch, even when self.epochs == 1.
        tuning_epochs = max(1, self.epochs // 2)

        def _objective(trial):
            """
            Objective function for Optuna to optimize the model's hyperparameters.

            Args:
                trial (optuna.trial.Trial): An Optuna trial object.

            Returns:
                float: Validation accuracy of the model.
            """
            tf.keras.backend.clear_session()
            model = tf.keras.Sequential()
            model.add(tf.keras.Input(shape=(None,), dtype="int32"))
            model.add(
                tf.keras.layers.Embedding(
                    input_dim=vocab_size,
                    output_dim=self.embedding_dim,
                )
            )

            # NOTE: the parameter is named "n_units_bidirectional" although it
            # holds the number of stacked layers; the name is kept because it
            # is a key of the persisted best-params JSON.
            n_layers_bidirectional = trial.suggest_int("n_units_bidirectional", 1, 3)
            for layer_idx in range(n_layers_bidirectional):
                num_hidden_bidirectional = trial.suggest_int(
                    f"n_units_bidirectional_l{layer_idx}", 64, 128, log=True
                )
                model.add(
                    tf.keras.layers.Bidirectional(
                        tf.keras.layers.LSTM(
                            num_hidden_bidirectional,
                            return_sequences=True,
                        ),
                    )
                )

            # Final recurrent layer collapses the sequence; its parameter name
            # carries the index of the last stacked layer.
            last_idx = n_layers_bidirectional - 1
            num_hidden_lstm = trial.suggest_int(
                f"n_units_lstm_l{last_idx}", 64, 128, log=True
            )
            model.add(
                tf.keras.layers.Bidirectional(
                    tf.keras.layers.LSTM(
                        num_hidden_lstm,
                        return_sequences=False,
                    ),
                )
            )

            model.add(tf.keras.layers.Dropout(self.dropout_rate))
            n_layers_nn = trial.suggest_int("n_layers_nn", 1, 2)
            for layer_idx in range(n_layers_nn):
                num_hidden_nn = trial.suggest_int(
                    f"n_units_nn_l{layer_idx}", 64, 128, log=True
                )
                model.add(tf.keras.layers.Dense(num_hidden_nn, activation="gelu"))

            model.add(tf.keras.layers.Dropout(self.dropout_rate))
            model.add(tf.keras.layers.Dense(32, activation="gelu"))
            model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

            learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
            model.compile(
                optimizer=tf.keras.optimizers.legacy.RMSprop(
                    learning_rate=learning_rate
                ),
                loss=tf.keras.losses.BinaryCrossentropy(),
                metrics=["accuracy"],
            )

            with tf.device("/device:GPU:0"):
                model.fit(
                    train_data,
                    validation_data=valid_data,
                    epochs=tuning_epochs,
                    callbacks=[self._early_stopping()],
                    verbose=1,
                )
            # Evaluate the model accuracy on the validation set.
            score = model.evaluate(valid_data, verbose=1)
            return score[1]

        study = optuna.create_study(direction="maximize")
        study.optimize(
            _objective,
            n_trials=5,
        )
        with open(self.OPTUNA_PARAMS_PATH, "w", encoding="utf-8") as outfile:
            json.dump(study.best_params, outfile)
8 changes: 5 additions & 3 deletions src/modules/optuna_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
)


def build_transformer_model(trial, preprocessor):
def build_transformer_model(
trial: optuna.trial.Trial, preprocessor: TextPreprocessor
) -> tf.keras.Model:
"""
Build a Transformer model with hyperparameters suggested by Optuna.

Expand Down Expand Up @@ -71,7 +73,7 @@ def build_transformer_model(trial, preprocessor):
return transformer


def objective(trial):
def objective(trial: optuna.trial.Trial) -> float:
"""
Objective function for Optuna to optimize the Transformer model using BLEU score.

Expand Down Expand Up @@ -131,7 +133,7 @@ def objective(trial):
return bleu_score


def main():
def main() -> None:
"""
Main function to run the Optuna optimization.
"""
Expand Down
Loading