diff --git a/Zee_Recommendation_System_Business_Case_Study.ipynb b/Zee_Recommendation_System_Business_Case_Study.ipynb new file mode 100644 index 0000000..4296f13 --- /dev/null +++ b/Zee_Recommendation_System_Business_Case_Study.ipynb @@ -0,0 +1,12483 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyNe7afbLrevXcjouEzcgHf7", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Define Problem Statement and Formatting the Data\n", + "\n", + "1. Definition of the problem (as per the given problem statement with additional views)\n", + "\n", + "2. Formatting the data files to bring them into a workable format\n", + "\n", + "3. Merging the data files and creating a single consolidated dataframe" + ], + "metadata": { + "id": "X2mdix5Ecedn" + } + }, + { + "cell_type": "markdown", + "source": [ + "Definition of problem : Recommend the movies to users on the basis of movie rating suggested by other users" + ], + "metadata": { + "id": "7ZbSFXxWdWR2" + } + }, + { + "cell_type": "markdown", + "source": [ + "data file of dat extension are formatted to csv file using python code and stored into dataframe" + ], + "metadata": { + "id": "Mw6eaFoyakl1" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "Movie_ID =list()\n", + "Title = list()\n", + "genres = list()\n", + "with open('/content/zee-movies.dat', 'rb') as file:\n", + " # Read the first line\n", + " for line in file:\n", + " header = str(line).split(\"::\")\n", + " Movie_ID.append(header[0].lstrip('b\"').replace(\"'\",'',1))\n", + " Title.append(header[1])\n", + " genres.append(header[2].replace(\"\\\\n'\",'',1).replace('\\\\n\"','',1))\n", + "\n", + "df_movie = pd.DataFrame({'Movie Id':Movie_ID[1:],'Title':Title[1:],'Genres':genres[1:]})\n", + "df_movie.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "qLUfsqECuz7I", + "outputId": "17c363dd-3884-41cf-9067-e2fb16c059a2" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Movie Id Title Genres\n", + "0 1 Toy Story (1995) Animation|Children's|Comedy\n", + "1 2 Jumanji (1995) Adventure|Children's|Fantasy\n", + "2 3 Grumpier Old Men (1995) Comedy|Romance\n", + "3 4 Waiting to Exhale (1995) Comedy|Drama\n", + "4 5 Father of the Bride Part II (1995) Comedy" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Movie IdTitleGenres
01Toy Story (1995)Animation|Children's|Comedy
12Jumanji (1995)Adventure|Children's|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama
45Father of the Bride Part II (1995)Comedy
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_movie", + "summary": "{\n \"name\": \"df_movie\",\n \"rows\": 3883,\n \"fields\": [\n {\n \"column\": \"Movie Id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3883,\n \"samples\": [\n \"1365\",\n \"2706\",\n \"3667\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3883,\n \"samples\": [\n \"Ridicule (1996)\",\n \"American Pie (1999)\",\n \"Rent-A-Cop (1988)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Genres\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 301,\n \"samples\": [\n \"Action|Adventure|Comedy|Horror\",\n \"Romance|Western\",\n \"Action|Adventure|Children's|Comedy\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "code", + "source": [ + "UserID =list()\n", + "MovieID = list()\n", + "Rating = list()\n", + "Timestamp = list()\n", + "with open('/content/zee-ratings.dat', 'rb') as file:\n", + " # Read the first line\n", + " for line in file:\n", + " header = str(line).split(\"::\")\n", + " UserID.append(header[0].lstrip(\"b'\"))\n", + " MovieID.append(header[1])\n", + " Rating.append(header[2])\n", + " Timestamp.append(header[3].replace(\"\\\\n'\",'',1))\n", + "\n", + "df_rating = pd.DataFrame({'UserID':UserID[1:],'MovieID':MovieID[1:],'Rating':Rating[1:],'Timestamp':Timestamp[1:]})\n", + "df_rating.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "4xWrlhA_BSkn", + "outputId": "a634744c-a71d-4c96-dde6-49262d14a3c5" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID MovieID Rating Timestamp\n", + "0 1 1193 5 978300760\n", + "1 1 661 3 978302109\n", + "2 1 914 3 978301968\n", + "3 1 3408 4 978300275\n", + "4 1 2355 5 978824291" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDMovieIDRatingTimestamp
0111935978300760
116613978302109
219143978301968
3134084978300275
4123555978824291
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_rating" + } + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "source": [ + "UserID =list()\n", + "Gender = list()\n", + "Age = list()\n", + "Occupation=list()\n", + "Zipcode = list()\n", + "with open('/content/zee-users.dat', 'rb') as file:\n", + " # Read the first line\n", + " for line in file:\n", + " header = str(line).split(\"::\")\n", + " UserID.append(header[0].lstrip(\"b'\"))\n", + " Gender.append(header[1])\n", + " Age.append(header[2])\n", + " Occupation.append(header[3])\n", + " Zipcode.append(header[4].replace(\"\\\\n'\",'',1))\n", + "\n", + "\n", + "df_users = pd.DataFrame({'UserID':UserID[1:],'Gender':Gender[1:],'Age':Age[1:],'Occupation':Occupation[1:],'Zip-code':Zipcode[1:]})\n", + "df_users.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "jlAdlPU3BTpG", + "outputId": "1b1b6950-5984-4fcf-d217-b15d8ca356cb" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID Gender Age Occupation Zip-code\n", + "0 1 F 1 10 48067\n", + "1 2 M 56 16 70072\n", + "2 3 M 25 15 55117\n", + "3 4 M 45 7 02460\n", + "4 5 M 25 20 55455" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDGenderAgeOccupationZip-code
01F11048067
12M561670072
23M251555117
34M45702460
45M252055455
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_users", + "summary": "{\n \"name\": \"df_users\",\n \"rows\": 6040,\n \"fields\": [\n {\n \"column\": \"UserID\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 6040,\n \"samples\": [\n \"5530\",\n \"711\",\n \"4924\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"M\",\n \"F\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"1\",\n \"56\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Occupation\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 21,\n \"samples\": [\n \"10\",\n \"18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Zip-code\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3439,\n \"samples\": [\n \"02865\",\n \"43213\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "source": [ + "movie = df_movie.copy()" + ], + "metadata": { + "id": "xLzyL2f_fwJD" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "movie['Genres'] = movie['Genres'].str.split('|')" + ], + "metadata": { + "id": "lnYUeR6qc4cz" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "movie.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "TmqswkzIeZq4", + "outputId": "e40ea4a7-22a4-48a4-916c-6ae437d46007" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Movie Id Title \\\n", + "0 1 Toy Story (1995) \n", + "1 2 Jumanji (1995) \n", + "2 3 Grumpier Old Men (1995) \n", + "3 4 Waiting to Exhale (1995) \n", + "4 5 Father of the Bride Part II (1995) \n", + "\n", + " Genres \n", + "0 [Animation, Children's, Comedy] \n", + "1 [Adventure, Children's, Fantasy] \n", + "2 [Comedy, Romance] \n", + "3 [Comedy, Drama] \n", + "4 [Comedy] " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Movie IdTitleGenres
01Toy Story (1995)[Animation, Children's, Comedy]
12Jumanji (1995)[Adventure, Children's, Fantasy]
23Grumpier Old Men (1995)[Comedy, Romance]
34Waiting to Exhale (1995)[Comedy, Drama]
45Father of the Bride Part II (1995)[Comedy]
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "movie", + "summary": "{\n \"name\": \"movie\",\n \"rows\": 3883,\n \"fields\": [\n {\n \"column\": \"Movie Id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3883,\n \"samples\": [\n \"1365\",\n \"2706\",\n \"3667\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3883,\n \"samples\": [\n \"Ridicule (1996)\",\n \"American Pie (1999)\",\n \"Rent-A-Cop (1988)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Genres\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "movie = movie.explode('Genres')" + ], + "metadata": { + "id": "x1wv9zLFeeHB" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "movie genres are transform from '|' pipe separated row to multiple rows" + ], + "metadata": { + "id": "iBP8s1pac7j9" + } + }, + { + "cell_type": "code", + "source": [ + "movie.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "U0dkCPlugDJs", + "outputId": "79b5c9b4-4c47-4b06-8f4e-a7f30856d31b" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Movie Id Title Genres\n", + "0 1 Toy Story (1995) Animation\n", + "0 1 Toy Story (1995) Children's\n", + "0 1 Toy Story (1995) Comedy\n", + "1 2 Jumanji (1995) Adventure\n", + "1 2 Jumanji (1995) Children's" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Movie IdTitleGenres
01Toy Story (1995)Animation
01Toy Story (1995)Children's
01Toy Story (1995)Comedy
12Jumanji (1995)Adventure
12Jumanji (1995)Children's
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "movie", + "summary": "{\n \"name\": \"movie\",\n \"rows\": 6408,\n \"fields\": [\n {\n \"column\": \"Movie Id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3883,\n \"samples\": [\n \"1365\",\n \"2706\",\n \"3667\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3883,\n \"samples\": [\n \"Ridicule (1996)\",\n \"American Pie (1999)\",\n \"Rent-A-Cop (1988)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Genres\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"Animation\",\n \"Children's\",\n \"Crime\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Performing EDA, Data Cleaning, and Feature Engineering\n", + "\n", + "1. Reviewing the shape and structure of the dataset\n", + "\n", + "2. Performing necessary type conversion and deriving new features\n", + "\n", + "3. Investigating the data for any inconsistency\n", + "\n", + "4. Group the data according to the average rating and no. of ratings" + ], + "metadata": { + "id": "zHJz1i5Ti7t1" + } + }, + { + "cell_type": "code", + "source": [ + "movie.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fUAzJ6ewjDHa", + "outputId": "060ba5d3-05c3-4565-95e9-d920f3e003a3" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(6408, 3)" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_rating.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Isc4TmZTlkbn", + "outputId": "f484e743-a81f-4291-90cd-604c56fba2fd" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(1000209, 4)" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_users.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hL-xGbr2l0Cj", + "outputId": "d496413e-a542-43bb-f560-320c2a42463b" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(6040, 5)" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "to overcome from memory utilization we have converted the columns of dataframe as per the data's specified" + ], + "metadata": { + "id": "4dsrJ1DVdX6V" + } + }, + { + "cell_type": "code", + "source": [ + "movie['Movie Id'] = movie['Movie Id'].astype(int)" + ], + "metadata": { + "id": "7O3hztufl_2_" + }, + "execution_count": 13, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_rating['UserID'] = df_rating['UserID'].astype(int)\n", + "df_rating['MovieID'] = df_rating['MovieID'].astype(int)\n", + "df_rating['Rating'] = df_rating['Rating'].astype(int)\n", + "df_rating['Timestamp'] = df_rating['Timestamp'].astype(int)" + ], + "metadata": { + "id": "yHiJ4lo7nnuS" + }, + "execution_count": 14, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_rating.info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EslNrr-epd2q", + "outputId": "45b4b220-8c2f-4b4d-ef27-0be1115bd0a2" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 1000209 entries, 0 to 1000208\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 UserID 1000209 non-null int64\n", + " 1 MovieID 1000209 non-null int64\n", + " 2 Rating 1000209 non-null int64\n", + " 3 Timestamp 1000209 non-null int64\n", + "dtypes: int64(4)\n", + "memory usage: 30.5 MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_users['UserID'] = df_users['UserID'].astype(int)\n", + "df_users['Age'] = df_users['Age'].astype(int)\n", + "df_users['Occupation'] = df_users['Occupation'].astype(int)" + ], + "metadata": { + "id": "issJpDjJpjHm" + }, + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_users.info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ECq9mVgbqfTU", + "outputId": "02cdefff-5d25-4a3d-ddd5-fb288047a1f1" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 6040 entries, 0 to 6039\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 UserID 6040 non-null int64 \n", + " 1 Gender 6040 non-null object\n", + " 2 Age 6040 non-null int64 \n", + " 3 Occupation 6040 non-null int64 \n", + " 4 Zip-code 6040 non-null object\n", + "dtypes: int64(3), object(2)\n", + "memory usage: 236.1+ KB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "movie.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "oVDPZrCFsZrh", + "outputId": "6e730b39-a17d-491d-c63e-a267a78e0137" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Movie Id\n", + "count 6408.000000\n", + "mean 1949.873283\n", + "std 1136.449312\n", + "min 1.000000\n", + "25% 977.750000\n", + "50% 1962.500000\n", + "75% 2911.250000\n", + "max 3952.000000" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Movie Id
count6408.000000
mean1949.873283
std1136.449312
min1.000000
25%977.750000
50%1962.500000
75%2911.250000
max3952.000000
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"movie\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"Movie Id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2017.8312452340883,\n \"min\": 1.0,\n \"max\": 6408.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 1949.8732833957554,\n 1962.5,\n 6408.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_rating.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "CMvlQ2Q7snBo", + "outputId": "f0381e7f-8a87-4f09-cdc3-36c5e648b7dc" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID MovieID Rating Timestamp\n", + "count 1.000209e+06 1.000209e+06 1.000209e+06 1.000209e+06\n", + "mean 3.024512e+03 1.865540e+03 3.581564e+00 9.722437e+08\n", + "std 1.728413e+03 1.096041e+03 1.117102e+00 1.215256e+07\n", + "min 1.000000e+00 1.000000e+00 1.000000e+00 9.567039e+08\n", + "25% 1.506000e+03 1.030000e+03 3.000000e+00 9.653026e+08\n", + "50% 3.070000e+03 1.835000e+03 4.000000e+00 9.730180e+08\n", + "75% 4.476000e+03 2.770000e+03 4.000000e+00 9.752209e+08\n", + "max 6.040000e+03 3.952000e+03 5.000000e+00 1.046455e+09" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDMovieIDRatingTimestamp
count1.000209e+061.000209e+061.000209e+061.000209e+06
mean3.024512e+031.865540e+033.581564e+009.722437e+08
std1.728413e+031.096041e+031.117102e+001.215256e+07
min1.000000e+001.000000e+001.000000e+009.567039e+08
25%1.506000e+031.030000e+033.000000e+009.653026e+08
50%3.070000e+031.835000e+034.000000e+009.730180e+08
75%4.476000e+032.770000e+034.000000e+009.752209e+08
max6.040000e+033.952000e+035.000000e+001.046455e+09
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df_rating\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"UserID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 352629.79000977636,\n \"min\": 1.0,\n \"max\": 1000209.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 3024.512347919285,\n 3070.0,\n 1000209.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MovieID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 352995.4373918155,\n \"min\": 1.0,\n \"max\": 1000209.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 1865.5398981612843,\n 1835.0,\n 1000209.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 353626.1873065492,\n \"min\": 1.0,\n \"max\": 1000209.0,\n \"num_unique_values\": 7,\n \"samples\": [\n 1000209.0,\n 3.581564453029317,\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Timestamp\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 452144449.95536953,\n \"min\": 1000209.0,\n \"max\": 1046454590.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 972243695.4046655,\n 973018006.0,\n 1000209.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_users.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "dDw46wQqsyog", + "outputId": "dd59a6b7-7ab7-42d7-d2ac-1565544ad44b" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID Age Occupation\n", + "count 6040.000000 6040.000000 6040.000000\n", + "mean 3020.500000 30.639238 8.146854\n", + "std 1743.742145 12.895962 6.329511\n", + "min 1.000000 1.000000 0.000000\n", + "25% 1510.750000 25.000000 3.000000\n", + "50% 3020.500000 25.000000 7.000000\n", + "75% 4530.250000 35.000000 14.000000\n", + "max 6040.000000 56.000000 20.000000" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDAgeOccupation
count6040.0000006040.0000006040.000000
mean3020.50000030.6392388.146854
std1743.74214512.8959626.329511
min1.0000001.0000000.000000
25%1510.75000025.0000003.000000
50%3020.50000025.0000007.000000
75%4530.25000035.00000014.000000
max6040.00000056.00000020.000000
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df_users\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"UserID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2176.5404306462956,\n \"min\": 1.0,\n \"max\": 6040.0,\n \"num_unique_values\": 6,\n \"samples\": [\n 6040.0,\n 3020.5,\n 4530.25\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2126.1519903585836,\n \"min\": 1.0,\n \"max\": 6040.0,\n \"num_unique_values\": 7,\n \"samples\": [\n 6040.0,\n 30.639238410596025,\n 35.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Occupation\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2132.5180851997875,\n \"min\": 0.0,\n \"max\": 6040.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 8.146854304635761,\n 7.0,\n 6040.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_users[df_users['Zip-code'].str.contains('-')]['Zip-code']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 458 + }, + "id": "VDC2ty3BtA8H", + "outputId": "c05d04be-27d9-46b5-d4d5-353ed7f8d7f2" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "160 98107-2117\n", + "232 37919-4204\n", + "292 55337-4056\n", + "457 55405-2546\n", + "505 55103-1006\n", + " ... \n", + "5664 10461-1301\n", + "5681 23455-4959\n", + "5924 90035-4444\n", + "5966 73069-5429\n", + "5984 78705-5221\n", + "Name: Zip-code, Length: 66, dtype: object" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Zip-code
16098107-2117
23237919-4204
29255337-4056
45755405-2546
50555103-1006
......
566410461-1301
568123455-4959
592490035-4444
596673069-5429
598478705-5221
\n", + "

66 rows × 1 columns

\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_users['Zip-code'] = df_users['Zip-code'].apply(lambda x: x.split('-')[0] if '-' in x else x)" + ], + "metadata": { + "id": "LalytBGg6kvm" + }, + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Just did the standardation of zip column and converted the data to int data type" + ], + "metadata": { + "id": "8ZKn53cEd5up" + } + }, + { + "cell_type": "code", + "source": [ + "df_users['Zip-code'] = df_users['Zip-code'].astype(int)" + ], + "metadata": { + "id": "zW08ZzCp8rqr" + }, + "execution_count": 23, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Did the creation of features like year and month from timestamp" + ], + "metadata": { + "id": "EDFCdVjveVo1" + } + }, + { + "cell_type": "code", + "source": [ + "df_rating['Release_year'] = pd.to_datetime(df_rating['Timestamp'], unit='s').dt.year" + ], + "metadata": { + "id": "7vRftc1ZAaHZ" + }, + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_rating['Release_month'] = pd.to_datetime(df_rating['Timestamp'], unit='s').dt.month" + ], + "metadata": { + "id": "gHr6NLjoB7XW" + }, + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_rating_new = df_rating.merge(df_rating.groupby('UserID')['Rating'].mean().reset_index().rename(columns={'Rating': 'Average Rating'}),on='UserID')" + ], + "metadata": { + "id": "w16n05iqFhbf" + }, + "execution_count": 26, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_rating_new['Average Rating'] = df_rating_new['Average Rating'].round(2)" + ], + "metadata": { + "id": "cLn_yscJHS6p" + }, + "execution_count": 27, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df_rating_new" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "RzlGx_7qHo0L", + "outputId": "9bcbcd93-ed63-4ae8-b6b7-dcb1cac70fac" + }, + "execution_count": 28, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID MovieID Rating Timestamp Release_year Release_month \\\n", + "0 1 1193 5 978300760 2000 12 \n", + "1 1 661 3 978302109 2000 12 \n", + "2 1 914 3 978301968 2000 12 \n", + "3 1 3408 4 978300275 2000 12 \n", + "4 1 2355 5 978824291 2001 1 \n", + "... ... ... ... ... ... ... \n", + "1000204 6040 1091 1 956716541 2000 4 \n", + "1000205 6040 1094 5 956704887 2000 4 \n", + "1000206 6040 562 5 956704746 2000 4 \n", + "1000207 6040 1096 4 956715648 2000 4 \n", + "1000208 6040 1097 4 956715569 2000 4 \n", + "\n", + " Average Rating \n", + "0 4.19 \n", + "1 4.19 \n", + "2 4.19 \n", + "3 4.19 \n", + "4 4.19 \n", + "... ... \n", + "1000204 3.58 \n", + "1000205 3.58 \n", + "1000206 3.58 \n", + "1000207 3.58 \n", + "1000208 3.58 \n", + "\n", + "[1000209 rows x 7 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDMovieIDRatingTimestampRelease_yearRelease_monthAverage Rating
01119359783007602000124.19
1166139783021092000124.19
2191439783019682000124.19
31340849783002752000124.19
4123555978824291200114.19
........................
1000204604010911956716541200043.58
1000205604010945956704887200043.58
100020660405625956704746200043.58
1000207604010964956715648200043.58
1000208604010974956715569200043.58
\n", + "

1000209 rows × 7 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_rating_new" + } + }, + "metadata": {}, + "execution_count": 28 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Build a Recommender System based on Cosine Similarity\n", + "\n", + "1. Print the user similarity matrix and item similarity matrix\n", + "\n", + "2. Use the Item-based approach to create a recommender system that uses Nearest Neighbors algorithm and Cosine Similarity" + ], + "metadata": { + "id": "otLOO6OrCv1j" + } + }, + { + "cell_type": "code", + "source": [ + "highest_movie_id=df_rating_new['MovieID'].value_counts().reset_index()['MovieID'][:300].index.to_list()" + ], + "metadata": { + "id": "TsJHUm5MN0BR" + }, + "execution_count": 29, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "rating = df_rating_new[df_rating_new['MovieID'].isin(highest_movie_id)]" + ], + "metadata": { + "id": "iQsQ9aNgWXMz" + }, + "execution_count": 30, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "tooked most highest rated movie data of 300" + ], + "metadata": { + "id": "vEVxOpmifrJD" + } + }, + { + "cell_type": "code", + "source": [ + "rating" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "vIWyKCM0Wmux", + "outputId": "27dba428-4e84-4ecf-96e6-b6f4fd8ea8ae" + }, + "execution_count": 31, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID MovieID Rating Timestamp Release_year Release_month \\\n", + "25 1 48 5 978824351 2001 1 \n", + "39 1 150 5 978301777 2000 12 \n", + "40 1 1 5 978824268 2001 1 \n", + "44 1 260 4 978300760 2000 12 \n", + "80 2 292 3 978300123 2000 12 \n", + "... ... ... ... ... ... ... \n", + "1000152 6040 111 5 957716717 2000 5 \n", + "1000163 6040 150 3 956704716 2000 4 \n", + "1000164 6040 154 2 957717678 2000 5 \n", + "1000167 6040 161 3 997454486 2001 8 \n", + "1000168 6040 162 4 956704953 2000 4 \n", + "\n", + " Average Rating \n", + "25 4.19 \n", + "39 4.19 \n", + "40 4.19 \n", + "44 4.19 \n", + "80 3.71 \n", + "... ... \n", + "1000152 3.58 \n", + "1000163 3.58 \n", + "1000164 3.58 \n", + "1000167 3.58 \n", + "1000168 3.58 \n", + "\n", + "[79877 rows x 7 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDMovieIDRatingTimestampRelease_yearRelease_monthAverage Rating
251485978824351200114.19
39115059783017772000124.19
40115978824268200114.19
44126049783007602000124.19
80229239783001232000123.71
........................
100015260401115957716717200053.58
100016360401503956704716200043.58
100016460401542957717678200053.58
100016760401613997454486200183.58
100016860401624956704953200043.58
\n", + "

79877 rows × 7 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "rating", + "summary": "{\n \"name\": \"rating\",\n \"rows\": 79877,\n \"fields\": [\n {\n \"column\": \"UserID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1731,\n \"min\": 1,\n \"max\": 6040,\n \"num_unique_values\": 5731,\n \"samples\": [\n 1682,\n 4328,\n 2798\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MovieID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 95,\n \"min\": 1,\n \"max\": 299,\n \"num_unique_values\": 291,\n \"samples\": [\n 222,\n 37,\n 73\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 4,\n 1,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Timestamp\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11922982,\n \"min\": 956704056,\n \"max\": 1046368241,\n \"num_unique_values\": 71720,\n \"samples\": [\n 975142502,\n 973644621,\n 965113404\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Release_year\",\n \"properties\": {\n \"dtype\": \"int32\",\n \"num_unique_values\": 4,\n \"samples\": [\n 2000,\n 2003,\n 2001\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Release_month\",\n \"properties\": {\n \"dtype\": \"int32\",\n \"num_unique_values\": 12,\n \"samples\": [\n 9,\n 8,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Average Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4332926866514135,\n \"min\": 1.02,\n \"max\": 4.96,\n \"num_unique_values\": 264,\n \"samples\": [\n 4.68,\n 4.8,\n 4.49\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 31 + } + ] + }, + { + "cell_type": "code", + "source": [ + "movie = movie.rename(columns={'Movie Id':'MovieID'})" + ], + "metadata": { + "id": "v7k4-pZDXOl_" + }, + "execution_count": 32, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "rating = rating.merge(movie, on='MovieID')" + ], + "metadata": { + "id": "Bjj6sRqzWyP7" + }, + "execution_count": 33, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "rating.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "sf3EJWt7YU_1", + "outputId": "f356bf8d-9718-4d36-8356-ae75cd706374" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID MovieID Rating Timestamp Release_year Release_month \\\n", + "0 1 48 5 978824351 2001 1 \n", + "1 1 48 5 978824351 2001 1 \n", + "2 1 48 5 978824351 2001 1 \n", + "3 1 48 5 978824351 2001 1 \n", + "4 1 150 5 978301777 2000 12 \n", + "\n", + " Average Rating Title Genres \n", + "0 4.19 Pocahontas (1995) Animation \n", + "1 4.19 Pocahontas (1995) Children's \n", + "2 4.19 Pocahontas (1995) Musical \n", + "3 4.19 Pocahontas (1995) Romance \n", + "4 4.19 Apollo 13 (1995) Drama " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDMovieIDRatingTimestampRelease_yearRelease_monthAverage RatingTitleGenres
01485978824351200114.19Pocahontas (1995)Animation
11485978824351200114.19Pocahontas (1995)Children's
21485978824351200114.19Pocahontas (1995)Musical
31485978824351200114.19Pocahontas (1995)Romance
4115059783017772000124.19Apollo 13 (1995)Drama
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "rating" + } + }, + "metadata": {}, + "execution_count": 34 + } + ] + }, + { + "cell_type": "code", + "source": [ + "r = rating[['UserID','Rating','Title']].drop_duplicates()" + ], + "metadata": { + "id": "QRM-ZDsiZ4nf" + }, + "execution_count": 35, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "r_user_item = r.pivot(index='UserID', columns='Title', values='Rating')" + ], + "metadata": { + "id": "jAvypl08X_WZ" + }, + "execution_count": 36, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "r_user_item = r_user_item.dropna(how='all')" + ], + "metadata": { + "id": "7Ehts3g9awnr" + }, + "execution_count": 37, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "r_user_item = r_user_item.fillna(0)" + ], + "metadata": { + "id": "Lt97w-e6bzm6" + }, + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "This is interection table or sparse matrix" + ], + "metadata": { + "id": "IamJXT11hCS6" + } + }, + { + "cell_type": "code", + "source": [ + "r_user_item.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 374 + }, + "id": "KAgENo3FezZ-", + "outputId": "e1ec4476-01c9-4ead-f87e-6f940986030d" + }, + "execution_count": 39, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Title Ace Ventura: When Nature Calls (1995) Across the Sea of Time (1995) \\\n", + "UserID \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "\n", + "Title Addiction, The (1995) Amateur (1994) \\\n", + "UserID \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "\n", + "Title Amazing Panda Adventure, The (1995) American President, The (1995) \\\n", + "UserID \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "\n", + "Title Angela (1995) Angels and Insects (1995) \\\n", + "UserID \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "\n", + "Title Anne Frank Remembered (1995) Antonia's Line (Antonia) (1995) ... \\\n", + "UserID ... \n", + "1 0.0 0.0 ... \n", + "2 0.0 0.0 ... \n", + "3 0.0 0.0 ... \n", + "4 0.0 0.0 ... \n", + "5 0.0 0.0 ... \n", + "\n", + "Title Waiting to Exhale (1995) Walk in the Clouds, A (1995) \\\n", + "UserID \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "\n", + "Title Waterworld (1995) When Night Is Falling (1995) \\\n", + "UserID \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "\n", + "Title White Balloon, The (Badkonake Sefid ) (1995) \\\n", + "UserID \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "5 0.0 \n", + "\n", + "Title White Man's Burden (1995) White Squall (1996) Wild Bill (1995) \\\n", + "UserID \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "5 0.0 0.0 0.0 \n", + "\n", + "Title Wings of Courage (1995) Young Poisoner's Handbook, The (1995) \n", + "UserID \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "\n", + "[5 rows x 291 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TitleAce Ventura: When Nature Calls (1995)Across the Sea of Time (1995)Addiction, The (1995)Amateur (1994)Amazing Panda Adventure, The (1995)American President, The (1995)Angela (1995)Angels and Insects (1995)Anne Frank Remembered (1995)Antonia's Line (Antonia) (1995)...Waiting to Exhale (1995)Walk in the Clouds, A (1995)Waterworld (1995)When Night Is Falling (1995)White Balloon, The (Badkonake Sefid ) (1995)White Man's Burden (1995)White Squall (1996)Wild Bill (1995)Wings of Courage (1995)Young Poisoner's Handbook, The (1995)
UserID
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
50.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

5 rows × 291 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "r_user_item" + } + }, + "metadata": {}, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "source": [ + "data = rating.merge(df_users, on='UserID')\n", + "data.head() # data[['UserID','MovieID','Rating']]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 330 + }, + "id": "sALBTwu9cJAg", + "outputId": "f8d9db8e-60a2-43dd-d5cf-6a37cbfee533" + }, + "execution_count": 40, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID MovieID Rating Timestamp Release_year Release_month \\\n", + "0 1 48 5 978824351 2001 1 \n", + "1 1 48 5 978824351 2001 1 \n", + "2 1 48 5 978824351 2001 1 \n", + "3 1 48 5 978824351 2001 1 \n", + "4 1 150 5 978301777 2000 12 \n", + "\n", + " Average Rating Title Genres Gender Age Occupation \\\n", + "0 4.19 Pocahontas (1995) Animation F 1 10 \n", + "1 4.19 Pocahontas (1995) Children's F 1 10 \n", + "2 4.19 Pocahontas (1995) Musical F 1 10 \n", + "3 4.19 Pocahontas (1995) Romance F 1 10 \n", + "4 4.19 Apollo 13 (1995) Drama F 1 10 \n", + "\n", + " Zip-code \n", + "0 48067 \n", + "1 48067 \n", + "2 48067 \n", + "3 48067 \n", + "4 48067 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDMovieIDRatingTimestampRelease_yearRelease_monthAverage RatingTitleGenresGenderAgeOccupationZip-code
01485978824351200114.19Pocahontas (1995)AnimationF11048067
11485978824351200114.19Pocahontas (1995)Children'sF11048067
21485978824351200114.19Pocahontas (1995)MusicalF11048067
31485978824351200114.19Pocahontas (1995)RomanceF11048067
4115059783017772000124.19Apollo 13 (1995)DramaF11048067
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "data" + } + }, + "metadata": {}, + "execution_count": 40 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "rank = []\n", + "for i in r_user_item[:50].index:\n", + " for j in r_user_item[:50].index:\n", + " if i == j:\n", + " continue\n", + " user1 = np.array(r_user_item.loc[i].values).reshape(1, -1)\n", + " user2 = np.array(r_user_item.loc[j].values).reshape(1, -1)\n", + " rank.append([i, j, cosine_similarity(user1, user2)[0][0].round(2)])\n", + "\n", + "user_data = pd.DataFrame(rank, columns=['UserID', 'User2', 'distance'])\n", + "user_data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "Jp_0zksD2bgU", + "outputId": "217e168c-953e-4db2-ce60-110f8b5739d9" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID User2 distance\n", + "0 1 2 0.00\n", + "1 1 3 0.33\n", + "2 1 4 0.42\n", + "3 1 5 0.06\n", + "4 1 6 0.43" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDUser2distance
0120.00
1130.33
2140.42
3150.06
4160.43
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "user_data", + "summary": "{\n \"name\": \"user_data\",\n \"rows\": 2450,\n \"fields\": [\n {\n \"column\": \"UserID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14,\n \"min\": 1,\n \"max\": 52,\n \"num_unique_values\": 50,\n \"samples\": [\n 14,\n 40,\n 31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14,\n \"min\": 1,\n \"max\": 52,\n \"num_unique_values\": 50,\n \"samples\": [\n 15,\n 41,\n 32\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"distance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.16254108681589358,\n \"min\": 0.0,\n \"max\": 0.98,\n \"num_unique_values\": 69,\n \"samples\": [\n 0.29,\n 0.0,\n 0.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Did the cosine similariy and calculated the distance for each users" + ], + "metadata": { + "id": "L5AHp-ghkdMK" + } + }, + { + "cell_type": "code", + "source": [ + "user_data.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tFlOAPcLqkXA", + "outputId": "a30d542d-0b4b-4874-8cf7-0302fcaa732a" + }, + "execution_count": 43, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(2450, 3)" + ] + }, + "metadata": {}, + "execution_count": 43 + } + ] + }, + { + "cell_type": "code", + "source": [ + "data[['UserID','Title']].drop_duplicates()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "ST4yKJr9rnYb", + "outputId": "d7bf7ca9-856e-49a5-b088-7711e81148ff" + }, + "execution_count": 44, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserID Title\n", + "0 1 Pocahontas (1995)\n", + "4 1 Apollo 13 (1995)\n", + "5 1 Toy Story (1995)\n", + "8 1 Star Wars: Episode IV - A New Hope (1977)\n", + "12 2 Outbreak (1995)\n", + "... ... ...\n", + "176856 6040 Taxi Driver (1976)\n", + "176858 6040 Apollo 13 (1995)\n", + "176859 6040 Belle de jour (1967)\n", + "176860 6040 Crimson Tide (1995)\n", + "176863 6040 Crumb (1994)\n", + "\n", + "[79877 rows x 2 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIDTitle
01Pocahontas (1995)
41Apollo 13 (1995)
51Toy Story (1995)
81Star Wars: Episode IV - A New Hope (1977)
122Outbreak (1995)
.........
1768566040Taxi Driver (1976)
1768586040Apollo 13 (1995)
1768596040Belle de jour (1967)
1768606040Crimson Tide (1995)
1768636040Crumb (1994)
\n", + "

79877 rows × 2 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"data[['UserID','Title']]\",\n \"rows\": 79877,\n \"fields\": [\n {\n \"column\": \"UserID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1731,\n \"min\": 1,\n \"max\": 6040,\n \"num_unique_values\": 5731,\n \"samples\": [\n 1682,\n 4328,\n 2798\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 291,\n \"samples\": [\n \"Circle of Friends (1995)\",\n \"Across the Sea of Time (1995)\",\n \"Mis\\\\xe9rables, Les (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 44 + } + ] + }, + { + "cell_type": "code", + "source": [ + "user_data = user_data.merge(data[['UserID','Title']].drop_duplicates(),on='UserID').rename(columns={'Title': 'User1_tittle','UserID':'User1'})\n", + "#pd.merge(user_data, data[['UserID','Title']].drop_duplicates(), how='inner', on='UserID')\n", + "\n", + "user_data = user_data.merge(data[['UserID','Title']].drop_duplicates(),left_on='User2', right_on='UserID').rename(columns={'Title': 'User2_tittle'}).drop(columns=['UserID'])" + ], + "metadata": { + "id": "V8BnIvngp20U" + }, + "execution_count": 45, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "user_data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "c2mMq4rzuDbe", + "outputId": "b961c780-90bd-4762-a1e7-c6e3d6c6d528" + }, + "execution_count": 46, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " User1 User2 distance User1_tittle \\\n", + "0 1 2 0.0 Pocahontas (1995) \n", + "1 1 2 0.0 Pocahontas (1995) \n", + "2 1 2 0.0 Pocahontas (1995) \n", + "3 1 2 0.0 Pocahontas (1995) \n", + "4 1 2 0.0 Pocahontas (1995) \n", + "\n", + " User2_tittle \n", + "0 Outbreak (1995) \n", + "1 Braveheart (1995) \n", + "2 Like Water for Chocolate (Como agua para choco... \n", + "3 Broken Arrow (1996) \n", + "4 Ed Wood (1994) " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
User1User2distanceUser1_tittleUser2_tittle
0120.0Pocahontas (1995)Outbreak (1995)
1120.0Pocahontas (1995)Braveheart (1995)
2120.0Pocahontas (1995)Like Water for Chocolate (Como agua para choco...
3120.0Pocahontas (1995)Broken Arrow (1996)
4120.0Pocahontas (1995)Ed Wood (1994)
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "user_data" + } + }, + "metadata": {}, + "execution_count": 46 + } + ] + }, + { + "cell_type": "code", + "source": [ + "user_data = user_data[~(user_data['User1_tittle'] == user_data['User2_tittle'])]" + ], + "metadata": { + "id": "OnfpqFN2x2xO" + }, + "execution_count": 47, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "This is the top highest recommandation for each user using users user cosine similarity" + ], + "metadata": { + "id": "1pfU9uJrkpcx" + } + }, + { + "cell_type": "code", + "source": [ + "user_data[user_data['distance']>0.5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "VeJOtW9Xw_xV", + "outputId": "3b7c42bc-445e-4896-e434-89463f03b61b" + }, + "execution_count": 48, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " User1 User2 distance User1_tittle \\\n", + "832 1 21 0.52 Pocahontas (1995) \n", + "833 1 21 0.52 Apollo 13 (1995) \n", + "835 1 21 0.52 Star Wars: Episode IV - A New Hope (1977) \n", + "7120 3 4 0.78 Happy Gilmore (1996) \n", + "7748 3 27 0.77 Happy Gilmore (1996) \n", + "... ... ... ... ... \n", + "337739 52 7 0.64 Outbreak (1995) \n", + "337740 52 7 0.64 From Dusk Till Dawn (1996) \n", + "337741 52 7 0.64 From Dusk Till Dawn (1996) \n", + "337742 52 7 0.64 Vampire in Brooklyn (1995) \n", + "337743 52 7 0.64 Vampire in Brooklyn (1995) \n", + "\n", + " User2_tittle \n", + "832 Toy Story (1995) \n", + "833 Toy Story (1995) \n", + "835 Toy Story (1995) \n", + "7120 Star Wars: Episode IV - A New Hope (1977) \n", + "7748 Jumanji (1995) \n", + "... ... \n", + "337739 Braveheart (1995) \n", + "337740 Heat (1995) \n", + "337741 Braveheart (1995) \n", + "337742 Heat (1995) \n", + "337743 Braveheart (1995) \n", + "\n", + "[4944 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
User1User2distanceUser1_tittleUser2_tittle
8321210.52Pocahontas (1995)Toy Story (1995)
8331210.52Apollo 13 (1995)Toy Story (1995)
8351210.52Star Wars: Episode IV - A New Hope (1977)Toy Story (1995)
7120340.78Happy Gilmore (1996)Star Wars: Episode IV - A New Hope (1977)
77483270.77Happy Gilmore (1996)Jumanji (1995)
..................
3377395270.64Outbreak (1995)Braveheart (1995)
3377405270.64From Dusk Till Dawn (1996)Heat (1995)
3377415270.64From Dusk Till Dawn (1996)Braveheart (1995)
3377425270.64Vampire in Brooklyn (1995)Heat (1995)
3377435270.64Vampire in Brooklyn (1995)Braveheart (1995)
\n", + "

4944 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"user_data[user_data['distance']>0\",\n \"rows\": 4944,\n \"fields\": [\n {\n \"column\": \"User1\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11,\n \"min\": 1,\n \"max\": 52,\n \"num_unique_values\": 23,\n \"samples\": [\n 32,\n 23,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11,\n \"min\": 1,\n \"max\": 52,\n \"num_unique_values\": 23,\n \"samples\": [\n 18,\n 48,\n 21\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"distance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.053472811701157336,\n \"min\": 0.51,\n \"max\": 0.98,\n \"num_unique_values\": 18,\n \"samples\": [\n 0.52,\n 0.78,\n 0.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User1_tittle\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 98,\n \"samples\": [\n \"Death and the Maiden (1994)\",\n \"Taxi Driver (1976)\",\n \"Assassins (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User2_tittle\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 98,\n \"samples\": [\n \"Mr. Holland's Opus (1995)\",\n \"Judge Dredd (1995)\",\n \"Party Girl (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "code", + "source": [ + "u = user_data[['User1','User2','distance']].drop_duplicates()" + ], + "metadata": { + "id": "JqLdEfbHzpId" + }, + "execution_count": 49, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "u = u[u['distance']>0.5]" + ], + "metadata": { + "id": "Yq37wlRo0GLB" + }, + "execution_count": 50, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "User - user sparse similarity matrix" + ], + "metadata": { + "id": "3FO5pWqwk8WT" + } + }, + { + "cell_type": "code", + "source": [ + "u.pivot(index='User1', columns='User2', values='distance').fillna(0)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 851 + }, + "id": "n70XetOlwZX5", + "outputId": "b6f1bc7e-228b-4644-af9f-533792adabb1" + }, + "execution_count": 51, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "User2 1 3 4 7 13 16 18 19 21 23 ... 28 \\\n", + "User1 ... \n", + "1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.52 0.00 ... 0.00 \n", + "3 0.00 0.00 0.78 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "4 0.00 0.78 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "7 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "13 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "16 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "18 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "19 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "21 0.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "23 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "24 0.00 0.00 0.00 0.00 0.70 0.00 0.53 0.00 0.00 0.00 ... 0.00 \n", + "25 0.00 0.00 0.55 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "27 0.00 0.77 0.98 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "28 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "29 0.00 0.00 0.00 0.00 0.62 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "32 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.57 \n", + "34 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "36 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.57 0.00 0.52 ... 0.00 \n", + "41 0.00 0.71 0.91 0.00 0.55 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "48 0.00 0.00 0.00 0.00 0.51 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "50 0.00 0.00 0.00 0.00 0.00 0.69 0.00 0.00 0.00 0.00 ... 0.00 \n", + "51 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.62 0.00 ... 0.00 \n", + "52 0.00 0.00 0.00 0.64 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 \n", + "\n", + "User2 29 32 34 36 41 48 50 51 52 \n", + "User1 \n", + "1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "3 0.00 0.00 0.00 0.00 0.71 0.00 0.00 0.00 0.00 \n", + "4 0.00 0.00 0.00 0.00 0.91 0.00 0.00 0.00 0.00 \n", + "7 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.64 \n", + "13 0.62 0.00 0.00 0.00 0.55 0.51 0.00 0.00 0.00 \n", + "16 0.00 0.00 0.00 0.00 0.00 0.00 0.69 0.00 0.00 \n", + "18 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "19 0.00 0.00 0.00 0.57 0.00 0.00 0.00 0.00 0.00 \n", + "21 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.62 0.00 \n", + "23 0.00 0.00 0.00 0.52 0.00 0.00 0.00 0.00 0.00 \n", + "24 0.56 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "25 0.00 0.00 0.00 0.00 0.62 0.00 0.00 0.00 0.00 \n", + "27 0.00 0.00 0.00 0.00 0.90 0.00 0.00 0.00 0.00 \n", + "28 0.00 0.57 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "29 0.00 0.00 0.00 0.00 0.61 0.00 0.00 0.00 0.00 \n", + "32 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "34 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.75 0.00 \n", + "36 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "41 0.61 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "48 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "50 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "51 0.00 0.00 0.75 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "\n", + "[23 rows x 23 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
User21347131618192123...28293234364148505152
User1
10.000.000.000.000.000.000.000.000.520.00...0.000.000.000.000.000.000.000.000.000.00
30.000.000.780.000.000.000.000.000.000.00...0.000.000.000.000.000.710.000.000.000.00
40.000.780.000.000.000.000.000.000.000.00...0.000.000.000.000.000.910.000.000.000.00
70.000.000.000.000.000.000.000.000.000.00...0.000.000.000.000.000.000.000.000.000.64
130.000.000.000.000.000.000.000.000.000.00...0.000.620.000.000.000.550.510.000.000.00
160.000.000.000.000.000.000.000.000.000.00...0.000.000.000.000.000.000.000.690.000.00
180.000.000.000.000.000.000.000.000.000.00...0.000.000.000.000.000.000.000.000.000.00
190.000.000.000.000.000.000.000.000.000.00...0.000.000.000.000.570.000.000.000.000.00
210.520.000.000.000.000.000.000.000.000.00...0.000.000.000.000.000.000.000.000.620.00
230.000.000.000.000.000.000.000.000.000.00...0.000.000.000.000.520.000.000.000.000.00
240.000.000.000.000.700.000.530.000.000.00...0.000.560.000.000.000.000.000.000.000.00
250.000.000.550.000.000.000.000.000.000.00...0.000.000.000.000.000.620.000.000.000.00
270.000.770.980.000.000.000.000.000.000.00...0.000.000.000.000.000.900.000.000.000.00
280.000.000.000.000.000.000.000.000.000.00...0.000.000.570.000.000.000.000.000.000.00
290.000.000.000.000.620.000.000.000.000.00...0.000.000.000.000.000.610.000.000.000.00
320.000.000.000.000.000.000.000.000.000.00...0.570.000.000.000.000.000.000.000.000.00
340.000.000.000.000.000.000.000.000.000.00...0.000.000.000.000.000.000.000.000.750.00
360.000.000.000.000.000.000.000.570.000.52...0.000.000.000.000.000.000.000.000.000.00
410.000.710.910.000.550.000.000.000.000.00...0.000.610.000.000.000.000.000.000.000.00
480.000.000.000.000.510.000.000.000.000.00...0.000.000.000.000.000.000.000.000.000.00
500.000.000.000.000.000.690.000.000.000.00...0.000.000.000.000.000.000.000.000.000.00
510.000.000.000.000.000.000.000.000.620.00...0.000.000.000.750.000.000.000.000.000.00
520.000.000.000.640.000.000.000.000.000.00...0.000.000.000.000.000.000.000.000.000.00
\n", + "

23 rows × 23 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe" + } + }, + "metadata": {}, + "execution_count": 51 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "created the Item - item similarity matrix using cosine similarity" + ], + "metadata": { + "id": "CwiBdQQOlI6J" + } + }, + { + "cell_type": "code", + "source": [ + "item=[]\n", + "\n", + "for i in r_user_item.columns:\n", + " for j in r_user_item.columns:\n", + " if i==j:\n", + " continue\n", + " item1 = np.array(r_user_item[i].values).reshape(1, -1)\n", + " item2 = np.array(r_user_item[j].values).reshape(1, -1)\n", + " item.append([i, j, cosine_similarity(item1, item2)[0][0].round(2)])\n", + "\n", + "\n", + "item_data = pd.DataFrame(item, columns=['item1', 'item2', 'distance'])\n", + "item_data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "ORYNIvlK08Xz", + "outputId": "f0257b33-00c7-4566-e149-6d4d6723bbc7" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " item1 item2 \\\n", + "0 Ace Ventura: When Nature Calls (1995) Across the Sea of Time (1995) \n", + "1 Ace Ventura: When Nature Calls (1995) Addiction, The (1995) \n", + "2 Ace Ventura: When Nature Calls (1995) Amateur (1994) \n", + "3 Ace Ventura: When Nature Calls (1995) Amazing Panda Adventure, The (1995) \n", + "4 Ace Ventura: When Nature Calls (1995) American President, The (1995) \n", + "\n", + " distance \n", + "0 0.07 \n", + "1 0.04 \n", + "2 0.05 \n", + "3 0.08 \n", + "4 0.24 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item1item2distance
0Ace Ventura: When Nature Calls (1995)Across the Sea of Time (1995)0.07
1Ace Ventura: When Nature Calls (1995)Addiction, The (1995)0.04
2Ace Ventura: When Nature Calls (1995)Amateur (1994)0.05
3Ace Ventura: When Nature Calls (1995)Amazing Panda Adventure, The (1995)0.08
4Ace Ventura: When Nature Calls (1995)American President, The (1995)0.24
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "item_data", + "summary": "{\n \"name\": \"item_data\",\n \"rows\": 84390,\n \"fields\": [\n {\n \"column\": \"item1\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 291,\n \"samples\": [\n \"Dunston Checks In (1996)\",\n \"Sudden Death (1995)\",\n \"Carrington (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item2\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 291,\n \"samples\": [\n \"Eat Drink Man Woman (1994)\",\n \"Target (1995)\",\n \"Casino (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"distance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.08078784044234677,\n \"min\": 0.0,\n \"max\": 0.66,\n \"num_unique_values\": 60,\n \"samples\": [\n 0.07,\n 0.0,\n 0.41\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 52 + } + ] + }, + { + "cell_type": "code", + "source": [ + "item_data[item_data['distance']>0.5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "Hhyzpjkg4q3M", + "outputId": "f407dbfd-c11b-4a7a-8fc8-93681e56e8f8" + }, + "execution_count": 53, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " item1 \\\n", + "1999 Angela (1995) \n", + "4038 Babe (1995) \n", + "10958 Braveheart (1995) \n", + "10984 Braveheart (1995) \n", + "11382 Broken Arrow (1996) \n", + "11525 Broken Arrow (1996) \n", + "21209 Die Hard: With a Vengeance (1995) \n", + "21280 Die Hard: With a Vengeance (1995) \n", + "21385 Die Hard: With a Vengeance (1995) \n", + "32263 GoldenEye (1995) \n", + "34752 Heat (1995) \n", + "40741 Johnny Mnemonic (1995) \n", + "41320 Judge Dredd (1995) \n", + "62679 Outbreak (1995) \n", + "62713 Outbreak (1995) \n", + "66447 Pulp Fiction (1994) \n", + "66652 Pulp Fiction (1994) \n", + "66664 Pulp Fiction (1994) \n", + "66678 Pulp Fiction (1994) \n", + "66679 Pulp Fiction (1994) \n", + "66688 Pulp Fiction (1994) \n", + "70589 Seven (Se7en) (1995) \n", + "70699 Seven (Se7en) (1995) \n", + "70748 Seven (Se7en) (1995) \n", + "73987 Star Wars: Episode IV - A New Hope (1977) \n", + "74179 Star Wars: Episode IV - A New Hope (1977) \n", + "74218 Star Wars: Episode IV - A New Hope (1977) \n", + "74219 Star Wars: Episode IV - A New Hope (1977) \n", + "74228 Star Wars: Episode IV - A New Hope (1977) \n", + "75406 Target (1995) \n", + "78023 Toy Story (1995) \n", + "78239 Toy Story (1995) \n", + "78265 Toy Story (1995) \n", + "78529 Twelve Monkeys (1995) \n", + "78555 Twelve Monkeys (1995) \n", + "81139 Usual Suspects, The (1995) \n", + "81153 Usual Suspects, The (1995) \n", + "81165 Usual Suspects, The (1995) \n", + "\n", + " item2 distance \n", + "1999 Target (1995) 0.51 \n", + "4038 Toy Story (1995) 0.56 \n", + "10958 Pulp Fiction (1994) 0.53 \n", + "10984 Star Wars: Episode IV - A New Hope (1977) 0.57 \n", + "11382 Die Hard: With a Vengeance (1995) 0.56 \n", + "11525 Outbreak (1995) 0.53 \n", + "21209 Broken Arrow (1996) 0.56 \n", + "21280 GoldenEye (1995) 0.55 \n", + "21385 Outbreak (1995) 0.55 \n", + "32263 Die Hard: With a Vengeance (1995) 0.55 \n", + "34752 Seven (Se7en) (1995) 0.52 \n", + "40741 Judge Dredd (1995) 0.53 \n", + "41320 Johnny Mnemonic (1995) 0.53 \n", + "62679 Broken Arrow (1996) 0.53 \n", + "62713 Die Hard: With a Vengeance (1995) 0.55 \n", + "66447 Braveheart (1995) 0.53 \n", + "66652 Seven (Se7en) (1995) 0.55 \n", + "66664 Star Wars: Episode IV - A New Hope (1977) 0.54 \n", + "66678 Toy Story (1995) 0.51 \n", + "66679 Twelve Monkeys (1995) 0.53 \n", + "66688 Usual Suspects, The (1995) 0.66 \n", + "70589 Heat (1995) 0.52 \n", + "70699 Pulp Fiction (1994) 0.55 \n", + "70748 Usual Suspects, The (1995) 0.60 \n", + "73987 Braveheart (1995) 0.57 \n", + "74179 Pulp Fiction (1994) 0.54 \n", + "74218 Toy Story (1995) 0.54 \n", + "74219 Twelve Monkeys (1995) 0.53 \n", + "74228 Usual Suspects, The (1995) 0.51 \n", + "75406 Angela (1995) 0.51 \n", + "78023 Babe (1995) 0.56 \n", + "78239 Pulp Fiction (1994) 0.51 \n", + "78265 Star Wars: Episode IV - A New Hope (1977) 0.54 \n", + "78529 Pulp Fiction (1994) 0.53 \n", + "78555 Star Wars: Episode IV - A New Hope (1977) 0.53 \n", + "81139 Pulp Fiction (1994) 0.66 \n", + "81153 Seven (Se7en) (1995) 0.60 \n", + "81165 Star Wars: Episode IV - A New Hope (1977) 0.51 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item1item2distance
1999Angela (1995)Target (1995)0.51
4038Babe (1995)Toy Story (1995)0.56
10958Braveheart (1995)Pulp Fiction (1994)0.53
10984Braveheart (1995)Star Wars: Episode IV - A New Hope (1977)0.57
11382Broken Arrow (1996)Die Hard: With a Vengeance (1995)0.56
11525Broken Arrow (1996)Outbreak (1995)0.53
21209Die Hard: With a Vengeance (1995)Broken Arrow (1996)0.56
21280Die Hard: With a Vengeance (1995)GoldenEye (1995)0.55
21385Die Hard: With a Vengeance (1995)Outbreak (1995)0.55
32263GoldenEye (1995)Die Hard: With a Vengeance (1995)0.55
34752Heat (1995)Seven (Se7en) (1995)0.52
40741Johnny Mnemonic (1995)Judge Dredd (1995)0.53
41320Judge Dredd (1995)Johnny Mnemonic (1995)0.53
62679Outbreak (1995)Broken Arrow (1996)0.53
62713Outbreak (1995)Die Hard: With a Vengeance (1995)0.55
66447Pulp Fiction (1994)Braveheart (1995)0.53
66652Pulp Fiction (1994)Seven (Se7en) (1995)0.55
66664Pulp Fiction (1994)Star Wars: Episode IV - A New Hope (1977)0.54
66678Pulp Fiction (1994)Toy Story (1995)0.51
66679Pulp Fiction (1994)Twelve Monkeys (1995)0.53
66688Pulp Fiction (1994)Usual Suspects, The (1995)0.66
70589Seven (Se7en) (1995)Heat (1995)0.52
70699Seven (Se7en) (1995)Pulp Fiction (1994)0.55
70748Seven (Se7en) (1995)Usual Suspects, The (1995)0.60
73987Star Wars: Episode IV - A New Hope (1977)Braveheart (1995)0.57
74179Star Wars: Episode IV - A New Hope (1977)Pulp Fiction (1994)0.54
74218Star Wars: Episode IV - A New Hope (1977)Toy Story (1995)0.54
74219Star Wars: Episode IV - A New Hope (1977)Twelve Monkeys (1995)0.53
74228Star Wars: Episode IV - A New Hope (1977)Usual Suspects, The (1995)0.51
75406Target (1995)Angela (1995)0.51
78023Toy Story (1995)Babe (1995)0.56
78239Toy Story (1995)Pulp Fiction (1994)0.51
78265Toy Story (1995)Star Wars: Episode IV - A New Hope (1977)0.54
78529Twelve Monkeys (1995)Pulp Fiction (1994)0.53
78555Twelve Monkeys (1995)Star Wars: Episode IV - A New Hope (1977)0.53
81139Usual Suspects, The (1995)Pulp Fiction (1994)0.66
81153Usual Suspects, The (1995)Seven (Se7en) (1995)0.60
81165Usual Suspects, The (1995)Star Wars: Episode IV - A New Hope (1977)0.51
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"item_data[item_data['distance']>0\",\n \"rows\": 38,\n \"fields\": [\n {\n \"column\": \"item1\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 17,\n \"samples\": [\n \"Angela (1995)\",\n \"Babe (1995)\",\n \"GoldenEye (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item2\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 17,\n \"samples\": [\n \"Target (1995)\",\n \"Toy Story (1995)\",\n \"Outbreak (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"distance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.03513841761666466,\n \"min\": 0.51,\n \"max\": 0.66,\n \"num_unique_values\": 9,\n \"samples\": [\n 0.66,\n 0.56,\n 0.52\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 53 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Build a Recommender System based on Pearson Correlation\n", + "\n", + "1. Creating a pivot table of movie titles & user id and imputing the NaN values\n", + "\n", + "2. Use the Item-based approach to create a simple recommender system that uses Pearson Correlation\n", + "\n", + "\n", + "Build a Recommender System based Pearson Correlation. (Optional)\n", + "\n", + "Use the User-based approach to create a recommender system that uses Pearson Correlation" + ], + "metadata": { + "id": "UTSFIbWW7KfF" + } + }, + { + "cell_type": "code", + "source": [ + "from scipy.stats import pearsonr\n", + "\n", + "rank_pearsonr = []\n", + "for i in r_user_item[:50].index:\n", + " for j in r_user_item[:50].index:\n", + " if i == j:\n", + " continue\n", + " user1 = np.array(r_user_item.loc[i].values)\n", + " user2 = np.array(r_user_item.loc[j].values)\n", + " coff, p_value = pearsonr(user1, user2)\n", + " rank_pearsonr.append([i, j, coff.round(2)])\n", + "\n", + "rank_pearsonr_data = pd.DataFrame(rank_pearsonr, columns=['User1', 'User2', 'distance'])\n", + "rank_pearsonr_data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "afUtg3c26zeL", + "outputId": "5cbf9cda-e299-4b97-fd30-69e43713f2c0" + }, + "execution_count": 54, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " User1 User2 distance\n", + "0 1 2 -0.02\n", + "1 1 3 0.32\n", + "2 1 4 0.42\n", + "3 1 5 0.03\n", + "4 1 6 0.42" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
User1User2distance
012-0.02
1130.32
2140.42
3150.03
4160.42
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "rank_pearsonr_data", + "summary": "{\n \"name\": \"rank_pearsonr_data\",\n \"rows\": 2450,\n \"fields\": [\n {\n \"column\": \"User1\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14,\n \"min\": 1,\n \"max\": 52,\n \"num_unique_values\": 50,\n \"samples\": [\n 14,\n 40,\n 31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14,\n \"min\": 1,\n \"max\": 52,\n \"num_unique_values\": 50,\n \"samples\": [\n 15,\n 41,\n 32\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"distance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.16195201664753248,\n \"min\": -0.07,\n \"max\": 0.98,\n \"num_unique_values\": 74,\n \"samples\": [\n -0.01,\n 0.61,\n 0.29\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 54 + } + ] + }, + { + "cell_type": "code", + "source": [ + "rank_pearsonr_data = rank_pearsonr_data.merge(data[['UserID','Title']].drop_duplicates(), left_on='User1',right_on='UserID').rename(columns={'Title': 'User1_tittle'}).drop(columns=['UserID'])\n", + "#pd.merge(user_data, data[['UserID','Title']].drop_duplicates(), how='inner', on='UserID')\n", + "\n", + "rank_pearsonr_data = rank_pearsonr_data.merge(data[['UserID','Title']].drop_duplicates(),left_on='User2', right_on='UserID').rename(columns={'Title': 'User2_tittle'}).drop(columns=['UserID'])" + ], + "metadata": { + "id": "S_ZMeL8nSPlZ" + }, + "execution_count": 55, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "rank_pearsonr_data = rank_pearsonr_data[~(rank_pearsonr_data['User1_tittle'] == rank_pearsonr_data['User2_tittle'])]" + ], + "metadata": { + "id": "hR1KV2moS_EJ" + }, + "execution_count": 56, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Top highest similar user rated movie recommandated for other users using pearsonr" + ], + "metadata": { + "id": "KHGmiPOWnKxN" + } + }, + { + "cell_type": "code", + "source": [ + "rank_pearsonr_data[rank_pearsonr_data['distance']>0.5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "aATUO1Q8TQnr", + "outputId": "1c3315c3-5691-418b-9fe1-42b0aacbde34" + }, + "execution_count": 57, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " User1 User2 distance User1_tittle \\\n", + "832 1 21 0.52 Pocahontas (1995) \n", + "833 1 21 0.52 Apollo 13 (1995) \n", + "835 1 21 0.52 Star Wars: Episode IV - A New Hope (1977) \n", + "7120 3 4 0.78 Happy Gilmore (1996) \n", + "7748 3 27 0.76 Happy Gilmore (1996) \n", + "... ... ... ... ... \n", + "337739 52 7 0.64 Outbreak (1995) \n", + "337740 52 7 0.64 From Dusk Till Dawn (1996) \n", + "337741 52 7 0.64 From Dusk Till Dawn (1996) \n", + "337742 52 7 0.64 Vampire in Brooklyn (1995) \n", + "337743 52 7 0.64 Vampire in Brooklyn (1995) \n", + "\n", + " User2_tittle \n", + "832 Toy Story (1995) \n", + "833 Toy Story (1995) \n", + "835 Toy Story (1995) \n", + "7120 Star Wars: Episode IV - A New Hope (1977) \n", + "7748 Jumanji (1995) \n", + "... ... \n", + "337739 Braveheart (1995) \n", + "337740 Heat (1995) \n", + "337741 Braveheart (1995) \n", + "337742 Heat (1995) \n", + "337743 Braveheart (1995) \n", + "\n", + "[2142 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
User1User2distanceUser1_tittleUser2_tittle
8321210.52Pocahontas (1995)Toy Story (1995)
8331210.52Apollo 13 (1995)Toy Story (1995)
8351210.52Star Wars: Episode IV - A New Hope (1977)Toy Story (1995)
7120340.78Happy Gilmore (1996)Star Wars: Episode IV - A New Hope (1977)
77483270.76Happy Gilmore (1996)Jumanji (1995)
..................
3377395270.64Outbreak (1995)Braveheart (1995)
3377405270.64From Dusk Till Dawn (1996)Heat (1995)
3377415270.64From Dusk Till Dawn (1996)Braveheart (1995)
3377425270.64Vampire in Brooklyn (1995)Heat (1995)
3377435270.64Vampire in Brooklyn (1995)Braveheart (1995)
\n", + "

2142 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"rank_pearsonr_data[rank_pearsonr_data['distance']>0\",\n \"rows\": 2142,\n \"fields\": [\n {\n \"column\": \"User1\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 9,\n \"min\": 1,\n \"max\": 52,\n \"num_unique_values\": 21,\n \"samples\": [\n 1,\n 41,\n 34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 9,\n \"min\": 1,\n \"max\": 52,\n \"num_unique_values\": 21,\n \"samples\": [\n 21,\n 19,\n 32\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"distance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.06869827749034985,\n \"min\": 0.51,\n \"max\": 0.98,\n \"num_unique_values\": 16,\n \"samples\": [\n 0.52,\n 0.78,\n 0.98\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User1_tittle\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 70,\n \"samples\": [\n \"Little Women (1994)\",\n \"Pocahontas (1995)\",\n \"Sabrina (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User2_tittle\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 70,\n \"samples\": [\n \"Legends of the Fall (1994)\",\n \"Toy Story (1995)\",\n \"Taxi Driver (1976)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 57 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Item - Item similarity pearsonr matrix" + ], + "metadata": { + "id": "YCsJNttGncT9" + } + }, + { + "cell_type": "code", + "source": [ + "item_pearsonr=[]\n", + "\n", + "for i in r_user_item.columns:\n", + " for j in r_user_item.columns:\n", + " if i==j:\n", + " continue\n", + " item1 = np.array(r_user_item[i].values)\n", + " item2 = np.array(r_user_item[j].values)\n", + " coff, p_value = pearsonr(item1, item2)\n", + " item_pearsonr.append([i, j, coff.round(2)])\n", + "\n", + "\n", + "item_pearsonr_data = pd.DataFrame(item_pearsonr, columns=['item1', 'item2', 'distance'])\n", + "item_pearsonr_data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "aoWorbbDT5Wx", + "outputId": "92c73492-b7ff-4418-bf90-dbdaa7152897" + }, + "execution_count": 58, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " item1 item2 \\\n", + "0 Ace Ventura: When Nature Calls (1995) Across the Sea of Time (1995) \n", + "1 Ace Ventura: When Nature Calls (1995) Addiction, The (1995) \n", + "2 Ace Ventura: When Nature Calls (1995) Amateur (1994) \n", + "3 Ace Ventura: When Nature Calls (1995) Amazing Panda Adventure, The (1995) \n", + "4 Ace Ventura: When Nature Calls (1995) American President, The (1995) \n", + "\n", + " distance \n", + "0 0.06 \n", + "1 0.02 \n", + "2 0.03 \n", + "3 0.07 \n", + "4 0.16 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item1item2distance
0Ace Ventura: When Nature Calls (1995)Across the Sea of Time (1995)0.06
1Ace Ventura: When Nature Calls (1995)Addiction, The (1995)0.02
2Ace Ventura: When Nature Calls (1995)Amateur (1994)0.03
3Ace Ventura: When Nature Calls (1995)Amazing Panda Adventure, The (1995)0.07
4Ace Ventura: When Nature Calls (1995)American President, The (1995)0.16
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "item_pearsonr_data", + "summary": "{\n \"name\": \"item_pearsonr_data\",\n \"rows\": 84390,\n \"fields\": [\n {\n \"column\": \"item1\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 291,\n \"samples\": [\n \"Dunston Checks In (1996)\",\n \"Sudden Death (1995)\",\n \"Carrington (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item2\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 291,\n \"samples\": [\n \"Eat Drink Man Woman (1994)\",\n \"Target (1995)\",\n \"Casino (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"distance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.06511276894451852,\n \"min\": -0.03,\n \"max\": 0.51,\n \"num_unique_values\": 54,\n \"samples\": [\n 0.23,\n 0.47,\n 0.42\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 58 + } + ] + }, + { + "cell_type": "code", + "source": [ + "item_pearsonr_data[item_pearsonr_data['distance']>0.5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + }, + "id": "kGXn5Lq_Vdag", + "outputId": "c9b59038-8119-4cc7-be7a-aeef632c996c" + }, + "execution_count": 59, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " item1 item2 \\\n", + "1999 Angela (1995) Target (1995) \n", + "11382 Broken Arrow (1996) Die Hard: With a Vengeance (1995) \n", + "21209 Die Hard: With a Vengeance (1995) Broken Arrow (1996) \n", + "75406 Target (1995) Angela (1995) \n", + "\n", + " distance \n", + "1999 0.51 \n", + "11382 0.51 \n", + "21209 0.51 \n", + "75406 0.51 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item1item2distance
1999Angela (1995)Target (1995)0.51
11382Broken Arrow (1996)Die Hard: With a Vengeance (1995)0.51
21209Die Hard: With a Vengeance (1995)Broken Arrow (1996)0.51
75406Target (1995)Angela (1995)0.51
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"item_pearsonr_data[item_pearsonr_data['distance']>0\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"item1\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Broken Arrow (1996)\",\n \"Target (1995)\",\n \"Angela (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item2\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Die Hard: With a Vengeance (1995)\",\n \"Angela (1995)\",\n \"Target (1995)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"distance\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 0.51,\n \"max\": 0.51,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.51\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 59 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Build a Recommender System based on Matrix Factorization.\n", + "\n", + "1. Create a Recommender System using the Matrix Factorization method\n", + "\n", + "2. Evaluate the model in terms of the Root Mean Squared Error and Mean Absolute Percentage Error\n", + "\n", + "3. Use embeddings for visualization and similarity-based models." + ], + "metadata": { + "id": "ws8U2RZPYxKh" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install cmfrec" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sjiRL58KY5YP", + "outputId": "1e6a57eb-ccc9-40fd-ae3c-67ea0b4bcfb7" + }, + "execution_count": 60, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting cmfrec\n", + " Downloading cmfrec-3.5.1.post10.tar.gz (268 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/268.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.9/268.3 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.3/268.3 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: cython in /usr/local/lib/python3.10/dist-packages (from cmfrec) (3.0.11)\n", + "Requirement already satisfied: numpy>=1.25 in /usr/local/lib/python3.10/dist-packages (from cmfrec) (1.26.4)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from cmfrec) (1.13.1)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from cmfrec) (2.2.2)\n", + "Collecting findblas (from cmfrec)\n", + " Using cached findblas-0.1.26.post1-py3-none-any.whl\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->cmfrec) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->cmfrec) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->cmfrec) (2024.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->cmfrec) (1.16.0)\n", + "Building wheels for collected packages: cmfrec\n", + " Building wheel for cmfrec (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for cmfrec: filename=cmfrec-3.5.1.post10-cp310-cp310-linux_x86_64.whl size=5669729 sha256=c73d50760afcd4c1e99918f2e3e83803e82875950bad299b7908307ec278a9d9\n", + " Stored in directory: /root/.cache/pip/wheels/cc/80/d7/9b7d9361970eb499c0227a3fac504240f7793dec0d9793bee6\n", + "Successfully built cmfrec\n", + "Installing collected packages: findblas, cmfrec\n", + "Successfully installed cmfrec-3.5.1.post10 findblas-0.1.26.post1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from cmfrec import CMF" + ], + "metadata": { + "id": "Kr0drX3LD4bD" + }, + "execution_count": 61, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "rm_raw = data[['UserID','MovieID','Rating']].drop_duplicates()\n", + "rm_raw.columns = ['UserId', 'ItemId', 'Rating']\n", + "rm_raw.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "CyWrU1ZOD7xk", + "outputId": "36aa7230-e5e6-4274-b244-0ee64412cbf4" + }, + "execution_count": 62, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " UserId ItemId Rating\n", + "0 1 48 5\n", + "4 1 150 5\n", + "5 1 1 5\n", + "8 1 260 4\n", + "12 2 292 3" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
UserIdItemIdRating
01485
411505
5115
812604
1222923
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "rm_raw", + "summary": "{\n \"name\": \"rm_raw\",\n \"rows\": 79877,\n \"fields\": [\n {\n \"column\": \"UserId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1731,\n \"min\": 1,\n \"max\": 6040,\n \"num_unique_values\": 5731,\n \"samples\": [\n 1682,\n 4328,\n 2798\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ItemId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 95,\n \"min\": 1,\n \"max\": 299,\n \"num_unique_values\": 291,\n \"samples\": [\n 222,\n 37,\n 73\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 4,\n 1,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 62 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model = CMF(k = 2, lambda_ = 0.1, verbose = False, user_bias= False, item_bias=False)\n", + "model.fit(rm_raw)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "abr7GIbJD7j2", + "outputId": "24467954-5a27-4734-e4fa-31488f144adb" + }, + "execution_count": 63, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Collective matrix factorization model\n", + "(explicit-feedback variant)\n" + ] + }, + "metadata": {}, + "execution_count": 63 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.A_.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LqYXQJo2FGcF", + "outputId": "848c00bc-4a87-4b43-dd83-d7c0160d8677" + }, + "execution_count": 64, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(5731, 2)" + ] + }, + "metadata": {}, + "execution_count": 64 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.B_.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XXyTY-aYFM_M", + "outputId": "09f159b2-4696-4bc9-e676-1b99d51621a8" + }, + "execution_count": 65, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(291, 2)" + ] + }, + "metadata": {}, + "execution_count": 65 + } + ] + }, + { + "cell_type": "code", + "source": [ + "data['Rating'].mean()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JZyv6-qIFZx4", + "outputId": "53a98aa8-bab3-4e47-c7a2-5e818aa07066" + }, + "execution_count": 66, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "3.5757474669802787" + ] + }, + "metadata": {}, + "execution_count": 66 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.glob_mean_ # (mu)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oFBoUcuMFYFM", + "outputId": "9e34ce7b-61c0-45e8-bd6b-04b881036e9a" + }, + "execution_count": 67, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "3.554752826690674" + ] + }, + "metadata": {}, + "execution_count": 67 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import mean_squared_error as mse\n", + "rm__ = np.dot(model.A_, model.B_.T) + model.glob_mean_\n", + "\n", + "mse(r_user_item.values[r_user_item>0] , rm__[r_user_item>0] )**0.5" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "J7tqCJPyFhcX", + "outputId": "856affd7-776e-41ab-be36-2568fcb61f7c" + }, + "execution_count": 68, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1.494934071049986" + ] + }, + "metadata": {}, + "execution_count": 68 + } + ] + }, + { + "cell_type": "code", + "source": [ + "top_items = model.topN(user = 1, n = 10)\n", + "top_items" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OVM5puRIGXTX", + "outputId": "1552d041-5075-4e08-c918-5d277f5968d4" + }, + "execution_count": 69, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([126, 38, 19, 65, 75, 102, 250, 243, 216, 15])" + ] + }, + "metadata": {}, + "execution_count": 69 + } + ] + }, + { + "cell_type": "code", + "source": [ + "movie.loc[movie.MovieID.isin(top_items)]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 551 + }, + "id": "7GmEXlLgGbaE", + "outputId": "e439db8f-da8f-49f9-b5b9-4af85dbd0dc8" + }, + "execution_count": 70, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " MovieID Title Genres\n", + "14 15 Cutthroat Island (1995) Action\n", + "14 15 Cutthroat Island (1995) Adventure\n", + "14 15 Cutthroat Island (1995) Romance\n", + "18 19 Ace Ventura: When Nature Calls (1995) Comedy\n", + "37 38 It Takes Two (1995) Comedy\n", + "64 65 Bio-Dome (1996) Comedy\n", + "74 75 Big Bully (1996) Comedy\n", + "74 75 Big Bully (1996) Drama\n", + "100 102 Mr. Wrong (1996) Comedy\n", + "124 126 NeverEnding Story III, The (1994) Adventure\n", + "124 126 NeverEnding Story III, The (1994) Children's\n", + "124 126 NeverEnding Story III, The (1994) Fantasy\n", + "214 216 Billy Madison (1995) Comedy\n", + "240 243 Gordy (1995) Comedy\n", + "247 250 Heavyweights (1994) Children's\n", + "247 250 Heavyweights (1994) Comedy" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MovieIDTitleGenres
1415Cutthroat Island (1995)Action
1415Cutthroat Island (1995)Adventure
1415Cutthroat Island (1995)Romance
1819Ace Ventura: When Nature Calls (1995)Comedy
3738It Takes Two (1995)Comedy
6465Bio-Dome (1996)Comedy
7475Big Bully (1996)Comedy
7475Big Bully (1996)Drama
100102Mr. Wrong (1996)Comedy
124126NeverEnding Story III, The (1994)Adventure
124126NeverEnding Story III, The (1994)Children's
124126NeverEnding Story III, The (1994)Fantasy
214216Billy Madison (1995)Comedy
240243Gordy (1995)Comedy
247250Heavyweights (1994)Children's
247250Heavyweights (1994)Comedy
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"movie\",\n \"rows\": 16,\n \"fields\": [\n {\n \"column\": \"MovieID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 87,\n \"min\": 15,\n \"max\": 250,\n \"num_unique_values\": 10,\n \"samples\": [\n 243,\n 19,\n 102\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Gordy (1995)\",\n \"Ace Ventura: When Nature Calls (1995)\",\n \"Mr. Wrong (1996)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Genres\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"Action\",\n \"Adventure\",\n \"Children's\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 70 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.manifold import TSNE\n", + "\n", + "embeddings = r_user_item\n", + "\n", + "# Use t-SNE to reduce dimensionality to 2D for visualization\n", + "tsne = TSNE(n_components=2)\n", + "reduced_embeddings = tsne.fit_transform(embeddings)\n", + "\n", + "# Plotting the reduced embeddings\n", + "plt.scatter(reduced_embeddings[:100, 0], reduced_embeddings[:100, 1], label='Users', color='blue')\n", + "plt.scatter(reduced_embeddings[100:, 0], reduced_embeddings[100:, 1], label='Items', color='red')\n", + "plt.legend()\n", + "plt.show()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 431 + }, + "id": "e9nevh81HdNe", + "outputId": "08770a7d-2356-4ecd-a676-14ed9f393760" + }, + "execution_count": 71, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Questionnaire:\n", + "\n", + "Users of which age group have watched and rated the most number of movies?\n", + "\n", + "Users belonging to which profession have watched and rated the most movies?\n", + "\n", + "Most of the users in our dataset who’ve rated the movies are Male. (T/F)\n", + "\n", + "Most of the movies present in our dataset were released in which decade?\n", + "\n", + "70s b. 90s c. 50s d.80s\n", + "\n", + "The movie with maximum no. of ratings is ___.\n", + "\n", + "Name the top 3 movies similar to ‘Liar Liar’ on the item-based approach.\n", + "\n", + "On the basis of approach, Collaborative Filtering methods can be classified into ___-based and ___-based.\n", + "\n", + "Pearson Correlation ranges between ___ to ___ whereas, Cosine Similarity belongs to the interval between ___ to ___.\n", + "\n", + "Mention the RMSE and MAPE that you got while evaluating the Matrix Factorization model.\n", + "\n", + "Give the sparse ‘row’ matrix representation for the following dense matrix -\n", + "\n", + "[[1 0]\n", + "[3 7]]" + ], + "metadata": { + "id": "e0DoYU_QnyZF" + } + }, + { + "cell_type": "markdown", + "source": [ + "Most movie rated age group belong to the 25 to 34 age group" + ], + "metadata": { + "id": "5QhevsiZn8g9" + } + }, + { + "cell_type": "code", + "source": [ + "data.groupby('Age')['MovieID'].sum().reset_index().sort_values(by='MovieID', ascending=False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "id": "aD6F2nGonstB", + "outputId": "71905112-2a73-435f-bde1-3ba97128c750" + }, + "execution_count": 72, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Age MovieID\n", + "2 25 9954720\n", + "1 18 5051546\n", + "3 35 4302862\n", + "4 45 1754157\n", + "5 50 1489455\n", + "6 56 740501\n", + "0 1 667151" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeMovieID
2259954720
1185051546
3354302862
4451754157
5501489455
656740501
01667151
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"data\",\n \"rows\": 7,\n \"fields\": [\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 19,\n \"min\": 1,\n \"max\": 56,\n \"num_unique_values\": 7,\n \"samples\": [\n 25,\n 18,\n 56\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MovieID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3351711,\n \"min\": 667151,\n \"max\": 9954720,\n \"num_unique_values\": 7,\n \"samples\": [\n 9954720,\n 5051546,\n 740501\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 72 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "college/grad student are rating more number of movies" + ], + "metadata": { + "id": "DqEZbeAFonMQ" + } + }, + { + "cell_type": "code", + "source": [ + "data.groupby('Occupation')['MovieID'].sum().reset_index().sort_values(by='MovieID', ascending=False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 708 + }, + "id": "o617mse3oNBh", + "outputId": "2f9ee7c0-485b-4e82-9026-f2a09d919cdc" + }, + "execution_count": 74, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Occupation MovieID\n", + "4 4 3498889\n", + "0 0 3102820\n", + "7 7 2383710\n", + "1 1 1999463\n", + "17 17 1769565\n", + "20 20 1410293\n", + "12 12 1381350\n", + "14 14 1231415\n", + "2 2 1190770\n", + "16 16 1022346\n", + "6 6 875731\n", + "3 3 738646\n", + "10 10 594234\n", + "15 15 558604\n", + "5 5 492288\n", + "11 11 452351\n", + "19 19 401680\n", + "18 18 273039\n", + "13 13 263734\n", + "9 9 257423\n", + "8 8 62041" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OccupationMovieID
443498889
003102820
772383710
111999463
17171769565
20201410293
12121381350
14141231415
221190770
16161022346
66875731
33738646
1010594234
1515558604
55492288
1111452351
1919401680
1818273039
1313263734
99257423
8862041
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"data\",\n \"rows\": 21,\n \"fields\": [\n {\n \"column\": \"Occupation\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 6,\n \"min\": 0,\n \"max\": 20,\n \"num_unique_values\": 21,\n \"samples\": [\n 4,\n 18,\n 11\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MovieID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 949200,\n \"min\": 62041,\n \"max\": 3498889,\n \"num_unique_values\": 21,\n \"samples\": [\n 3498889,\n 273039,\n 452351\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 74 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Users from male gender have rated the more to movie as compare to the female" + ], + "metadata": { + "id": "Zirm7_nLphQg" + } + }, + { + "cell_type": "code", + "source": [ + "data.groupby('Gender')['MovieID'].sum().reset_index().sort_values(by='MovieID', ascending=False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 + }, + "id": "obHtOU-Oo4Yb", + "outputId": "5b166a6d-b8fe-4177-88da-cc65ec41808e" + }, + "execution_count": 79, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Gender MovieID\n", + "1 M 17736275\n", + "0 F 6224117" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GenderMovieID
1M17736275
0F6224117
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"data\",\n \"rows\": 2,\n \"fields\": [\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"F\",\n \"M\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MovieID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8140324,\n \"min\": 6224117,\n \"max\": 17736275,\n \"num_unique_values\": 2,\n \"samples\": [\n 6224117,\n 17736275\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 79 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "MovieID 260 are highest rated movie. and 90's movie are more rated" + ], + "metadata": { + "id": "eZyV4ov8rD7b" + } + }, + { + "cell_type": "markdown", + "source": [ + "Pearson Correlation ranges between -1 to 1, whereas Cosine Similarity belongs to the interval between 0 to 1" + ], + "metadata": { + "id": "fj2jeP36rb3Z" + } + }, + { + "cell_type": "code", + "source": [ + "data.groupby('MovieID')['Rating'].sum().reset_index().sort_values(by='Rating', ascending=False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "DfbKzbbqpv7R", + "outputId": "bec7eaa0-2441-45a2-8d74-89115da37fe0" + }, + "execution_count": 82, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " MovieID Rating\n", + "253 260 53284\n", + "106 110 31038\n", + "0 1 25839\n", + "33 34 20442\n", + "287 296 18576\n", + ".. ... ...\n", + "133 138 5\n", + "128 133 2\n", + "219 226 2\n", + "122 127 1\n", + "137 142 1\n", + "\n", + "[291 rows x 2 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MovieIDRating
25326053284
10611031038
0125839
333420442
28729618576
.........
1331385
1281332
2192262
1221271
1371421
\n", + "

291 rows × 2 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"data\",\n \"rows\": 291,\n \"fields\": [\n {\n \"column\": \"MovieID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 86,\n \"min\": 1,\n \"max\": 299,\n \"num_unique_values\": 291,\n \"samples\": [\n 280,\n 75,\n 45\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4883,\n \"min\": 1,\n \"max\": 53284,\n \"num_unique_values\": 262,\n \"samples\": [\n 26,\n 121,\n 194\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 82 + } + ] + } + ] +} \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000..8838b77 --- /dev/null +++ b/app.py @@ -0,0 +1 @@ +print("practice github") \ No newline at end of file diff --git a/cuckoo2.0/data/monitor/14b2b875dbfa50d19f5967c4b150a7a2e9465e39/inject-x64.exe b/cuckoo2.0/data/monitor/14b2b875dbfa50d19f5967c4b150a7a2e9465e39/inject-x64.exe deleted file mode 100755 index 06e0fad..0000000 Binary files a/cuckoo2.0/data/monitor/14b2b875dbfa50d19f5967c4b150a7a2e9465e39/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/14b2b875dbfa50d19f5967c4b150a7a2e9465e39/inject-x86.exe b/cuckoo2.0/data/monitor/14b2b875dbfa50d19f5967c4b150a7a2e9465e39/inject-x86.exe deleted file mode 100755 index b51c2fc..0000000 Binary files a/cuckoo2.0/data/monitor/14b2b875dbfa50d19f5967c4b150a7a2e9465e39/inject-x86.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/2904ecd8163e96db69fe2ac8f48c5935e194fb08/inject-x64.exe b/cuckoo2.0/data/monitor/2904ecd8163e96db69fe2ac8f48c5935e194fb08/inject-x64.exe deleted file mode 100755 index ed2d0de..0000000 Binary files a/cuckoo2.0/data/monitor/2904ecd8163e96db69fe2ac8f48c5935e194fb08/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/579f57961915e0cf0b4716d993e374169bd08f5e/inject-x64.exe b/cuckoo2.0/data/monitor/579f57961915e0cf0b4716d993e374169bd08f5e/inject-x64.exe deleted file mode 100755 index 85ec4ab..0000000 Binary files a/cuckoo2.0/data/monitor/579f57961915e0cf0b4716d993e374169bd08f5e/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/579f57961915e0cf0b4716d993e374169bd08f5e/inject-x86.exe b/cuckoo2.0/data/monitor/579f57961915e0cf0b4716d993e374169bd08f5e/inject-x86.exe deleted file mode 100755 index 5dd1230..0000000 Binary files a/cuckoo2.0/data/monitor/579f57961915e0cf0b4716d993e374169bd08f5e/inject-x86.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/5892c58f567293c86eab8ccb0a491d80b48eab08/inject-x64.exe b/cuckoo2.0/data/monitor/5892c58f567293c86eab8ccb0a491d80b48eab08/inject-x64.exe deleted file mode 100755 index 85ec4ab..0000000 Binary files a/cuckoo2.0/data/monitor/5892c58f567293c86eab8ccb0a491d80b48eab08/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/5892c58f567293c86eab8ccb0a491d80b48eab08/inject-x86.exe b/cuckoo2.0/data/monitor/5892c58f567293c86eab8ccb0a491d80b48eab08/inject-x86.exe deleted file mode 100755 index 5dd1230..0000000 Binary files a/cuckoo2.0/data/monitor/5892c58f567293c86eab8ccb0a491d80b48eab08/inject-x86.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/6fddfdc86ef18dda769a0120caeecbed76703f0e/inject-x64.exe b/cuckoo2.0/data/monitor/6fddfdc86ef18dda769a0120caeecbed76703f0e/inject-x64.exe deleted file mode 100755 index ed2d0de..0000000 Binary files a/cuckoo2.0/data/monitor/6fddfdc86ef18dda769a0120caeecbed76703f0e/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/6fddfdc86ef18dda769a0120caeecbed76703f0e/inject-x86.exe b/cuckoo2.0/data/monitor/6fddfdc86ef18dda769a0120caeecbed76703f0e/inject-x86.exe deleted file mode 100755 index f382fe7..0000000 Binary files a/cuckoo2.0/data/monitor/6fddfdc86ef18dda769a0120caeecbed76703f0e/inject-x86.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/952f69a1d4d04988062819cb81ba3948ab4439cf/inject-x64.exe b/cuckoo2.0/data/monitor/952f69a1d4d04988062819cb81ba3948ab4439cf/inject-x64.exe deleted file mode 100755 index ed2d0de..0000000 Binary files a/cuckoo2.0/data/monitor/952f69a1d4d04988062819cb81ba3948ab4439cf/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/952f69a1d4d04988062819cb81ba3948ab4439cf/inject-x86.exe b/cuckoo2.0/data/monitor/952f69a1d4d04988062819cb81ba3948ab4439cf/inject-x86.exe deleted file mode 100755 index f382fe7..0000000 Binary files a/cuckoo2.0/data/monitor/952f69a1d4d04988062819cb81ba3948ab4439cf/inject-x86.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/98b5eef740c2a0756f7c7be5a3dc5e169714227d/inject-x64.exe b/cuckoo2.0/data/monitor/98b5eef740c2a0756f7c7be5a3dc5e169714227d/inject-x64.exe deleted file mode 100755 index 85ec4ab..0000000 Binary files a/cuckoo2.0/data/monitor/98b5eef740c2a0756f7c7be5a3dc5e169714227d/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/98b5eef740c2a0756f7c7be5a3dc5e169714227d/inject-x86.exe b/cuckoo2.0/data/monitor/98b5eef740c2a0756f7c7be5a3dc5e169714227d/inject-x86.exe deleted file mode 100755 index 5dd1230..0000000 Binary files a/cuckoo2.0/data/monitor/98b5eef740c2a0756f7c7be5a3dc5e169714227d/inject-x86.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/9e90535ed89363bd424d6788e4341f468b7155d2/inject-x64.exe b/cuckoo2.0/data/monitor/9e90535ed89363bd424d6788e4341f468b7155d2/inject-x64.exe deleted file mode 100755 index ed2d0de..0000000 Binary files a/cuckoo2.0/data/monitor/9e90535ed89363bd424d6788e4341f468b7155d2/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/9e90535ed89363bd424d6788e4341f468b7155d2/inject-x86.exe b/cuckoo2.0/data/monitor/9e90535ed89363bd424d6788e4341f468b7155d2/inject-x86.exe deleted file mode 100755 index f382fe7..0000000 Binary files a/cuckoo2.0/data/monitor/9e90535ed89363bd424d6788e4341f468b7155d2/inject-x86.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/d8c5f2986e5bd4598cf7bf3b30d43b7f7f5b7856/inject-x64.exe b/cuckoo2.0/data/monitor/d8c5f2986e5bd4598cf7bf3b30d43b7f7f5b7856/inject-x64.exe deleted file mode 100755 index ed2d0de..0000000 Binary files a/cuckoo2.0/data/monitor/d8c5f2986e5bd4598cf7bf3b30d43b7f7f5b7856/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/d8c5f2986e5bd4598cf7bf3b30d43b7f7f5b7856/inject-x86.exe b/cuckoo2.0/data/monitor/d8c5f2986e5bd4598cf7bf3b30d43b7f7f5b7856/inject-x86.exe deleted file mode 100755 index f382fe7..0000000 Binary files a/cuckoo2.0/data/monitor/d8c5f2986e5bd4598cf7bf3b30d43b7f7f5b7856/inject-x86.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/sw-monitor/inject-x64.exe b/cuckoo2.0/data/monitor/sw-monitor/inject-x64.exe deleted file mode 100755 index 0b20d5f..0000000 Binary files a/cuckoo2.0/data/monitor/sw-monitor/inject-x64.exe and /dev/null differ diff --git a/cuckoo2.0/data/monitor/sw-monitor/inject-x86.exe b/cuckoo2.0/data/monitor/sw-monitor/inject-x86.exe deleted file mode 100755 index 84dfc32..0000000 Binary files a/cuckoo2.0/data/monitor/sw-monitor/inject-x86.exe and /dev/null differ