diff --git a/Roshan_senti.ipynb b/Roshan_senti.ipynb
new file mode 100644
index 0000000..c45993d
--- /dev/null
+++ b/Roshan_senti.ipynb
@@ -0,0 +1,1554 @@
+
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "tglWjtAwVwJD"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import nltk\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
+ "from sklearn.naive_bayes import MultinomialNB\n",
+ "from sklearn.metrics import classification_report, confusion_matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Load the Yelp tip data (Colab-local path; adjust for other environments)\n",
+ "df = pd.read_csv('/content/yelp_tip.csv')\n",
+ "df.head()"
+ ],
+ "metadata": {
+ "id": "U2Wd1VVfV6SC"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# prompt: print schema of df\n",
+ "\n",
+ "df.info()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "_Pj6djUXeiOk",
+ "outputId": "26220c08-1781-40e0-9708-1a3bed3356d6"
+ },
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "RangeIndex: 908915 entries, 0 to 908914\n",
+ "Data columns (total 6 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 _id 908915 non-null object\n",
+ " 1 user_id 908915 non-null object\n",
+ " 2 business_id 908915 non-null object\n",
+ " 3 text 908901 non-null object\n",
+ " 4 date 908915 non-null object\n",
+ " 5 compliment_count 908915 non-null int64 \n",
+ "dtypes: int64(1), object(5)\n",
+ "memory usage: 41.6+ MB\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import nltk\n",
+ "\n",
+ "# Download 'stopwords' and 'wordnet' from NLTK\n",
+ "nltk.download('stopwords')\n",
+ "nltk.download('wordnet')\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "xx2FfzDGZWiv",
+ "outputId": "b80973d4-8821-4b8f-9565-fe4defaa6718"
+ },
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+ "[nltk_data] Unzipping corpora/stopwords.zip.\n",
+ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from nltk.corpus import stopwords\n",
+ "from nltk.corpus import wordnet"
+ ],
+ "metadata": {
+ "id": "KCEbdZbdaaUf"
+ },
+ "execution_count": 6,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import nltk\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "import string\n",
+ "\n",
+ "# Download resources if not already downloaded\n",
+ "nltk.download('stopwords')\n",
+ "nltk.download('wordnet')\n",
+ "\n",
+ "# Initialize lemmatizer and stop words once so they are reused per row\n",
+ "lemmatizer = WordNetLemmatizer()\n",
+ "stop_words = set(stopwords.words('english'))\n",
+ "\n",
+ "def preprocess_text(text):\n",
+ "    \"\"\"Lowercase, strip punctuation, drop stop words, and lemmatize.\"\"\"\n",
+ "    # Non-string values (NaN, numbers) are treated as empty text\n",
+ "    if not isinstance(text, str):\n",
+ "        text = ''\n",
+ "    # Remove punctuation\n",
+ "    text = text.translate(str.maketrans('', '', string.punctuation))\n",
+ "    # Tokenize, filter stop words, lemmatize\n",
+ "    words = [lemmatizer.lemmatize(word.lower()) for word in text.split() if word.lower() not in stop_words]\n",
+ "    return ' '.join(words)\n",
+ "\n",
+ "# BUG FIX: the original cell overwrote df (the 908k-row yelp_tip data\n",
+ "# loaded above) with a 4-row demo DataFrame, so every downstream step\n",
+ "# (train/test split, model fit, evaluation) silently ran on 4 rows.\n",
+ "# Preprocess the real data instead.\n",
+ "df['text'] = df['text'].fillna('')\n",
+ "df['cleaned_text'] = df['text'].apply(preprocess_text)\n",
+ "\n",
+ "df.head()"
+ ],
+ "metadata": {
+ "id": "5yXQoR--XpAu"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# # Define sentiment labels\n",
+ "# df['sentiment'] = df['compliment_count'].apply(lambda x: 'pos' if x > 0 else 'neg')"
+ ],
+ "metadata": {
+ "id": "T_HW0B8JXo4M"
+ },
+ "execution_count": 8,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(df.columns)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VidGzeH0XoxH",
+ "outputId": "d3ba5d0d-673b-4b30-bb4c-be81cb4bc45a"
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Index(['text', 'cleaned_text'], dtype='object')\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "if 'compliment_count' not in df.columns:\n",
+ " df['compliment_count'] = 0 # or any other default value\n"
+ ],
+ "metadata": {
+ "id": "qLwvF41aXotk"
+ },
+ "execution_count": 10,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Ensure 'compliment_count' exists in the DataFrame or add it if necessary\n",
+ "if 'compliment_count' not in df.columns:\n",
+ " df['compliment_count'] = 0 # Add with a default value, e.g., 0\n",
+ "\n",
+ "# Define sentiment labels based on 'compliment_count' values\n",
+ "df['sentiment'] = df['compliment_count'].apply(lambda x: 'pos' if x > 0 else 'neg')\n"
+ ],
+ "metadata": {
+ "id": "5THjo_VSXoq7"
+ },
+ "execution_count": 11,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#Splitting the Dataset\n",
+ "X = df['cleaned_text']\n",
+ "y = df['sentiment']\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)"
+ ],
+ "metadata": {
+ "id": "yA1BKoVjXooT"
+ },
+ "execution_count": 12,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Count Vectorization\n",
+ "count_vect = CountVectorizer()\n",
+ "X_train_counts = count_vect.fit_transform(X_train)\n",
+ "\n",
+ "# TF-IDF Transformation\n",
+ "tfidf_transformer = TfidfTransformer()\n",
+ "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)"
+ ],
+ "metadata": {
+ "id": "X7D3RUJ6XolV"
+ },
+ "execution_count": 13,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Train a Multinomial Naive Bayes classifier on the TF-IDF features\n",
+ "clf = MultinomialNB()\n",
+ "clf.fit(X_train_tfidf, y_train)"
+ ],
+ "metadata": {
+ "id": "kINv1yVFXoim"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Transform the test data\n",
+ "X_test_counts = count_vect.transform(X_test)\n",
+ "X_test_tfidf = tfidf_transformer.transform(X_test_counts)\n",
+ "\n",
+ "# Make predictions\n",
+ "predicted = clf.predict(X_test_tfidf)"
+ ],
+ "metadata": {
+ "id": "qd759H5rdGRP"
+ },
+ "execution_count": 15,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Evaluate the model.\n",
+ "# Pass labels explicitly so the confusion matrix is always 2x2 even when\n",
+ "# only one class occurs in y_test/predicted (the original run emitted a\n",
+ "# sklearn UserWarning and a degenerate 1x1 matrix for exactly this reason).\n",
+ "# zero_division=0 keeps classification_report quiet for an absent class.\n",
+ "print(confusion_matrix(y_test, predicted, labels=['neg', 'pos']))\n",
+ "print(classification_report(y_test, predicted, labels=['neg', 'pos'], zero_division=0))"
+ ],
+ "metadata": {
+ "id": "txfIXKGFdGN3"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "source": [
+ "# Import necessary libraries\n",
+ "import pandas as pd\n",
+ "import nltk\n",
+ "from nltk.sentiment import SentimentIntensityAnalyzer\n",
+ "\n",
+ "\n",
+ "df = pd.read_csv('/content/yelp_tip.csv')\n",
+ "\n",
+ "# Download VADER lexicon\n",
+ "nltk.download('vader_lexicon')\n",
+ "\n",
+ "# Initialize the SentimentIntensityAnalyzer\n",
+ "sia = SentimentIntensityAnalyzer()\n",
+ "\n",
+ "# Define a function for sentiment analysis\n",
+ "def analyze_sentiment(text):\n",
+ " # Convert the input to string if it's not already\n",
+ " if not isinstance(text, str):\n",
+ " text = str(text)\n",
+ "\n",
+ " score = sia.polarity_scores(text)\n",
+ " return score\n",
+ "\n",
+ "# Apply sentiment analysis to the text column\n",
+ "df['sentiment_scores'] = df['text'].apply(analyze_sentiment)\n",
+ "\n",
+ "# Expand the sentiment scores into separate columns\n",
+ "df_sentiment = df['sentiment_scores'].apply(pd.Series)\n",
+ "\n",
+ "# Combine the original DataFrame with the sentiment scores\n",
+ "df = pd.concat([df, df_sentiment], axis=1)\n",
+ "\n",
+ "# Function to classify sentiment based on compound score\n",
+ "def classify_sentiment(compound_score):\n",
+ " if compound_score >= 0.05:\n",
+ " return 'positive'\n",
+ " elif compound_score <= -0.05:\n",
+ " return 'negative'\n",
+ " else:\n",
+ " return 'neutral'\n",
+ "\n",
+ "# Apply the classification function\n",
+ "df['sentiment'] = df['compound'].apply(classify_sentiment)\n",
+ "\n",
+ "# Display the DataFrame with sentiment analysis results\n",
+ "print(df[['text', 'neg', 'neu', 'pos', 'compound', 'sentiment']])"
+ ],
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UUkfQI5fflxJ",
+ "outputId": "39b3458d-c1f0-442a-94e4-dae4c94fde42"
+ },
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[nltk_data] Downloading package vader_lexicon to /root/nltk_data...\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " text neg neu \\\n",
+ "0 Avengers time with the ladies. 0.000 1.000 \n",
+ "1 They have lots of good deserts and tasty cuban... 0.000 0.756 \n",
+ "2 It's open even when you think it isn't 0.000 1.000 \n",
+ "3 Very decent fried chicken 0.000 1.000 \n",
+ "4 Appetizers.. platter special for lunch 0.000 0.597 \n",
+ "... ... ... ... \n",
+ "908910 Disappointed in one of your managers. 0.383 0.617 \n",
+ "908911 Great food and service. 0.000 0.423 \n",
+ "908912 Love their Cubans!! 0.000 0.295 \n",
+ "908913 Great pizza great price 0.000 0.196 \n",
+ "908914 Food is good value but a bit hot! 0.000 0.559 \n",
+ "\n",
+ " pos compound sentiment \n",
+ "0 0.000 0.0000 neutral \n",
+ "1 0.244 0.4404 positive \n",
+ "2 0.000 0.0000 neutral \n",
+ "3 0.000 0.0000 neutral \n",
+ "4 0.403 0.4019 positive \n",
+ "... ... ... ... \n",
+ "908910 0.000 -0.4767 negative \n",
+ "908911 0.577 0.6249 positive \n",
+ "908912 0.705 0.6988 positive \n",
+ "908913 0.804 0.8481 positive \n",
+ "908914 0.441 0.4482 positive \n",
+ "\n",
+ "[908915 rows x 6 columns]\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "PTXBFouifYDd"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}