diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 8a00d2b..0000000 --- a/.gitignore +++ /dev/null @@ -1,179 +0,0 @@ -# Created by https://www.toptal.com/developers/gitignore/api/python -# Edit at https://www.toptal.com/developers/gitignore?templates=python - -### Python ### -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -doc/ -yelp_dataset/ -yelp_photos/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ - -### Python Patch ### -# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration -poetry.toml - -# ruff -.ruff_cache/ - -# LSP config files -pyrightconfig.json - -# End of https://www.toptal.com/developers/gitignore/api/python diff --git a/App/preprocess/preprocess_pipeline.ipynb b/App/preprocess/preprocess_pipeline.ipynb deleted file mode 100644 index 0db4bf0..0000000 --- a/App/preprocess/preprocess_pipeline.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":3811,"status":"ok","timestamp":1728164875352,"user":{"displayName":"BZRecProject","userId":"17728562531777262703"},"user_tz":240},"id":"298aVYAHmo0I"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.impute import SimpleImputer\n","from sklearn.compose import ColumnTransformer\n","from sklearn.pipeline import Pipeline\n","from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.preprocessing import OneHotEncoder"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_28432\\2173750344.py:3: DtypeWarning: Columns (57,58) have mixed types. Specify dtype option on import or set low_memory=False.\n"," business_df = pd.read_csv('..\\..\\yelp_dataset\\yelp_business.csv')\n"]}],"source":["# Load the dataset\n","\n","business_df = pd.read_csv('..\\..\\yelp_dataset\\yelp_business.csv')\n","tip_df = pd.read_csv('..\\..\\yelp_dataset\\yelp_tip.csv')\n","checkin_df = pd.read_csv('..\\..\\yelp_dataset\\yelp_checkin.csv')"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[{"data":{"text/html":["
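Aside on the loading cell above (not part of the deleted notebook): the DtypeWarning is pandas' chunked type inference tripping over columns 57 and 58, and the raw backslash paths only parse because `\.` and `\y` happen not to be escape sequences. A minimal sketch of a more robust load, assuming the same `../../yelp_dataset` layout:

```python
from pathlib import Path

import pandas as pd

# pathlib with forward-slash joins sidesteps backslash-escape pitfalls entirely
DATA_DIR = Path("..") / ".." / "yelp_dataset"

# low_memory=False reads each file in a single pass, so columns 57/58 get one
# inferred dtype instead of the mixed types warned about above.
business_df = pd.read_csv(DATA_DIR / "yelp_business.csv", low_memory=False)
tip_df = pd.read_csv(DATA_DIR / "yelp_tip.csv")
checkin_df = pd.read_csv(DATA_DIR / "yelp_checkin.csv")
```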
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
_idbusiness_idnameaddresscitystatepostal_codelatitudelongitudestars...attributes.AcceptsInsuranceattributes.BestNightsattributes.BYOBattributes.Corkageattributes.BYOBCorkageattributes.HairSpecializesInattributes.Open24Hoursattributes.RestaurantsCounterServiceattributes.AgesAllowedattributes.DietaryRestrictions
066ea4800e59c7c5b6d879305Pns2l4eNsfO8kk83dixA6AAbby Rappoport, LAC, CMQ1616 Chapala St, Ste 2Santa BarbaraCA9310134.426679-119.7111975.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
166ea4800e59c7c5b6d879306mpf3x-BjTdTEA3yCZrAYPwThe UPS Store87 Grasso Plaza Shopping CenterAfftonMO6312338.551126-90.3356953.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
266ea4800e59c7c5b6d879307tUFrWirKiKi_TAnsVWINQQTarget5255 E Broadway BlvdTucsonAZ8571132.223236-110.8804523.5...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n","

3 rows × 61 columns

\n","
"],"text/plain":[" _id business_id name \\\n","0 66ea4800e59c7c5b6d879305 Pns2l4eNsfO8kk83dixA6A Abby Rappoport, LAC, CMQ \n","1 66ea4800e59c7c5b6d879306 mpf3x-BjTdTEA3yCZrAYPw The UPS Store \n","2 66ea4800e59c7c5b6d879307 tUFrWirKiKi_TAnsVWINQQ Target \n","\n"," address city state postal_code \\\n","0 1616 Chapala St, Ste 2 Santa Barbara CA 93101 \n","1 87 Grasso Plaza Shopping Center Affton MO 63123 \n","2 5255 E Broadway Blvd Tucson AZ 85711 \n","\n"," latitude longitude stars ... attributes.AcceptsInsurance \\\n","0 34.426679 -119.711197 5.0 ... NaN \n","1 38.551126 -90.335695 3.0 ... NaN \n","2 32.223236 -110.880452 3.5 ... NaN \n","\n"," attributes.BestNights attributes.BYOB attributes.Corkage \\\n","0 NaN NaN NaN \n","1 NaN NaN NaN \n","2 NaN NaN NaN \n","\n"," attributes.BYOBCorkage attributes.HairSpecializesIn attributes.Open24Hours \\\n","0 NaN NaN NaN \n","1 NaN NaN NaN \n","2 NaN NaN NaN \n","\n"," attributes.RestaurantsCounterService attributes.AgesAllowed \\\n","0 NaN NaN \n","1 NaN NaN \n","2 NaN NaN \n","\n"," attributes.DietaryRestrictions \n","0 NaN \n","1 NaN \n","2 NaN \n","\n","[3 rows x 61 columns]"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["business_df.head(3)"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
_iduser_idbusiness_idtextdatecompliment_count
066ea489ae59c7c5b6d8be1acAGNUgVwnZUey3gcPCJ76iw3uLgwr0qeCNMjKenHJwPGQAvengers time with the ladies.2012-05-18 02:17:210
166ea489ae59c7c5b6d8be1adNBN4MgHP9D3cw--SnauTkAQoezRbYQncpRqyrLH6IqjgThey have lots of good deserts and tasty cuban...2013-02-05 18:35:100
266ea489ae59c7c5b6d8be1ae-copOvldyKh1qr-vzkDEvwMYoRNLb5chwjQe3c_k37GgIt's open even when you think it isn't2013-08-18 00:56:080
\n","
"],"text/plain":[" _id user_id business_id \\\n","0 66ea489ae59c7c5b6d8be1ac AGNUgVwnZUey3gcPCJ76iw 3uLgwr0qeCNMjKenHJwPGQ \n","1 66ea489ae59c7c5b6d8be1ad NBN4MgHP9D3cw--SnauTkA QoezRbYQncpRqyrLH6Iqjg \n","2 66ea489ae59c7c5b6d8be1ae -copOvldyKh1qr-vzkDEvw MYoRNLb5chwjQe3c_k37Gg \n","\n"," text date \\\n","0 Avengers time with the ladies. 2012-05-18 02:17:21 \n","1 They have lots of good deserts and tasty cuban... 2013-02-05 18:35:10 \n","2 It's open even when you think it isn't 2013-08-18 00:56:08 \n","\n"," compliment_count \n","0 0 \n","1 0 \n","2 0 "]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["tip_df.head(3)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
_idbusiness_iddate
066ea4866e59c7c5b6d89de50---kPU91CF4Lq2-WlRu9Lw2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020...
166ea4866e59c7c5b6d89de51--0iUa4sNDFiZFrAdIWhZQ2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011...
266ea4866e59c7c5b6d89de52--30_8IhuyMHbSOcNWd6DQ2013-06-14 23:29:17, 2014-08-13 23:20:22
\n","
"],"text/plain":[" _id business_id \\\n","0 66ea4866e59c7c5b6d89de50 ---kPU91CF4Lq2-WlRu9Lw \n","1 66ea4866e59c7c5b6d89de51 --0iUa4sNDFiZFrAdIWhZQ \n","2 66ea4866e59c7c5b6d89de52 --30_8IhuyMHbSOcNWd6DQ \n","\n"," date \n","0 2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020... \n","1 2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011... \n","2 2013-06-14 23:29:17, 2014-08-13 23:20:22 "]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["checkin_df.head(3)"]},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QdhFipQ6m6cJ","outputId":"89b05b7e-ca7c-4922-f818-6b91d8709a16"},"outputs":[],"source":["# Load the datasets\n","\n","review_df = pd.read_csv('..\\..\\yelp_dataset\\yelp_review.csv')\n"]}],"metadata":{"colab":{"authorship_tag":"ABX9TyMCQ2DuYha2PjbKa00hrT7h","provenance":[]},"kernelspec":{"display_name":"env","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.5"}},"nbformat":4,"nbformat_minor":0} diff --git a/DataprocessingP1_Review.ipynb b/DataprocessingP1_Review.ipynb deleted file mode 100644 index e322750..0000000 --- a/DataprocessingP1_Review.ipynb +++ /dev/null @@ -1,393 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9Wt7ZQNdZANN", - "outputId": "59f59c60-aba2-46d2-ab77-fa8f8d7fab0c" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting pyspark\n", - " Downloading pyspark-3.5.3.tar.gz (317.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.3/317.3 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n", - "Building wheels for collected packages: pyspark\n", - " Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=c95ba513b8ea87737447a56790e3342645a001f55d94dc06cf038953cf54d9d5\n", - " Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab\n", - "Successfully built pyspark\n", - "Installing collected packages: pyspark\n", - "Successfully installed pyspark-3.5.3\n" - ] - } - ], - "source": [ - "!pip install pyspark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6wGKWU-iZT4y", - "outputId": "7e0cb6ac-fc23-4396-d450-08edb6299773" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting findspark\n", - " Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)\n", - "Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)\n", - "Installing collected packages: findspark\n", - "Successfully installed findspark-2.0.1\n" - ] - } - ], - "source": [ - "!pip install findspark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9fyG_DI8ZJMq", - "outputId": "2265ba21-5c7b-42f4-a9f1-f3bc27129b21" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: pyarrow in /usr/local/lib/python3.10/dist-packages (16.1.0)\n", - "Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.26.4)\n" - ] - } - ], - "source": [ - "\n", - "!pip install pyarrow\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "glBfjrR8Zrzi" - }, - "outputs": [], - "source": [ - "import findspark\n", - "findspark.init()\n", - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import col\n", - "from pyspark.sql.functions import *\n", - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1MOwlc9cZv1R" - }, - "outputs": [], - "source": [ - "# Initialize SparkSession\n", - "spark = SparkSession.builder.appName(\"CSVReader\").getOrCreate()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "VMA6hRI_ZfLb", - "outputId": "d5e19af8-cc52-4071-c350-73e8bd35b50b" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Execution started at: 2024-10-11 23:54:45\n", - "+--------------------+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+\n", - "| _id| review_id| user_id| business_id|stars|useful|funny|cool| text| date|\n", - "+--------------------+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+\n", - "|66ea4ea9e59c7c5b6...|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...| 3| 0| 0| 0|If you decide to ...|2018-07-07 22:09:11|\n", - "|66ea4ea9e59c7c5b6...|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...| 5| 1| 0| 1|I've taken a lot ...|2012-01-03 15:28:18|\n", - "|66ea4ea9e59c7c5b6...|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...| 3| 0| 0| 0|Family diner. Had...|2014-02-05 20:30:30|\n", - "|66ea4ea9e59c7c5b6...|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...| 5| 1| 0| 1|Wow! 
Yummy, diff...|2015-01-04 00:01:03|\n", - "|66ea4ea9e59c7c5b6...|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...| 4| 1| 0| 1|Cute interior and...|2017-01-14 20:54:15|\n", - "|66ea4ea9e59c7c5b6...|JrIxlS1TzJ-iCu79u...|eUta8W_HdHMXPzLBB...|04UD14gamNjLY0IDY...| 1| 1| 2| 1|I am a long term ...|2015-09-23 23:10:31|\n", - "|66ea4ea9e59c7c5b6...|6AxgBCNX_PNTOxmbR...|r3zeYsv1XFBRA4dJp...|gmjsEdUsKpj9Xxu6p...| 5| 0| 2| 0|Loved this tour! ...|2015-01-03 23:21:18|\n", - "|66ea4ea9e59c7c5b6...|_ZeMknuYdlQcUqng_...|yfFzsLmaWF2d4Sr0U...|LHSTtnW3YHCeUkRDG...| 5| 2| 0| 0|Amazingly amazing...|2015-08-07 02:29:16|\n", - "|66ea4ea9e59c7c5b6...|ZKvDG2sBvHVdF5oBN...|wSTuiTk-sKNdcFypr...|B5XSoSG3SfvQGtKEG...| 3| 1| 1| 0|This easter inste...|2016-03-30 22:46:33|\n", - "|66ea4ea9e59c7c5b6...|pUycOfUwM8vqX7KjR...|59MxRhNVhU9MYndMk...|gebiRewfieSdtt17P...| 3| 0| 0| 0|Had a party of 6 ...|2016-07-25 07:31:06|\n", - "|66ea4ea9e59c7c5b6...|rGQRf8UafX7OTlMNN...|1WHRWwQmZOZDAhp2Q...|uMvVYRgGNXf5boolA...| 5| 2| 0| 0|My experience wit...|2015-06-21 14:48:06|\n", - "|66ea4ea9e59c7c5b6...|l3Wk_mvAog6XANIuG...|ZbqSHbgCjzVAqaa7N...|EQ-TZ2eeD_E0BHuvo...| 4| 0| 0| 0|Locals recommende...|2015-08-19 14:31:45|\n", - "|66ea4ea9e59c7c5b6...|XW_LfMv0fV21l9c6x...|9OAtfnWag-ajVxRbU...|lj-E32x9_FA7GmUrB...| 4| 0| 0| 0|Love going here f...|2014-06-27 22:44:01|\n", - "|66ea4ea9e59c7c5b6...|8JFGBuHMoiNDyfcxu...|smOvOajNG0lS4Pq7d...|RZtGWDLCAtuipwaZ-...| 4| 0| 0| 0|Good food--loved ...|2009-10-14 19:57:14|\n", - "|66ea4ea9e59c7c5b6...|UBp0zWyH60Hmw6Fsa...|4Uh27DgGzsp6PqrH9...|otQS34_MymijPTdNB...| 4| 0| 2| 0|The bun makes the...|2011-10-27 17:12:05|\n", - "|66ea4ea9e59c7c5b6...|OAhBYw8IQ6wlfw1ow...|1C2lxzUo1Hyye4RFI...|BVndHaLihEYbr76Z0...| 5| 0| 0| 0|Great place for b...|2014-10-11 16:22:06|\n", - "|66ea4ea9e59c7c5b6...|oyaMhzBSwfGgemSGu...|Dd1jQj7S-BFGqRbAp...|YtSqYv1Q_pOltsVPS...| 5| 0| 0| 0|Tremendous servic...|2013-06-24 11:21:25|\n", - "|66ea4ea9e59c7c5b6...|LnGZB0fjfgeVDVz5I...|j2wlzrntrbKwyOcOi...|rBdG_23USc7DletfZ...| 4| 1| 0| 0|The hubby and I h...|2014-08-10 19:41:43|\n", - "|66ea4ea9e59c7c5b6...|u2vzZaOqJ2feRshaa...|NDZvyYHTUWWu-kqgQ...|CLEWowfkj-wKYJlQD...| 5| 2| 0| 1|I go to blow bar ...|2016-03-07 00:02:18|\n", - "|66ea4ea9e59c7c5b6...|Xs8Z8lmKkosqW5mw_...|IQsF3Rc6IgCzjVV9D...|eFvzHawVJofxSnD7T...| 5| 0| 0| 0|My absolute favor...|2014-11-12 15:30:27|\n", - "+--------------------+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+\n", - "only showing top 20 rows\n", - "\n", - "Execution ended at: 2024-10-11 23:56:18\n", - "Execution time: 93.61 seconds\n" - ] - } - ], - "source": [ - "# Start time\n", - "start_time = time.time()\n", - "print(f\"Execution started at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}\")\n", - "\n", - "# Read the CSV data\n", - "df = spark.read.csv(\"/content/drive/MyDrive/CSV/yelp_review.csv\", header=True, inferSchema=True)\n", - "\n", - "# Display the entire dataframe (use with caution for large datasets)\n", - "df.show()\n", - "\n", - "# End time\n", - "end_time = time.time()\n", - "print(f\"Execution ended at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}\")\n", - "\n", - "# Calculate and print the execution time\n", - "execution_time = end_time - start_time\n", - "print(f\"Execution time: {execution_time:.2f} seconds\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": 
"hmKYkkpGa2ls", - "outputId": "e04590f9-7f1c-448b-9c60-09c80b84cbbe" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Total records in the dataframe: 6990280\n" - ] - } - ], - "source": [ - "# total records in the dataframe\n", - "\n", - "total_records = df.count()\n", - "print(f\"Total records in the dataframe: {total_records}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "SAQNWOozaX5O", - "outputId": "ba9bfbfb-bc78-4e3d-96c7-7d6467f53c1d" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+\n", - "| review_id| user_id| business_id|stars|useful|funny|cool| text| date|\n", - "+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+\n", - "|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...| 3| 0| 0| 0|If you decide to ...|2018-07-07 22:09:11|\n", - "|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...| 5| 1| 0| 1|I've taken a lot ...|2012-01-03 15:28:18|\n", - "|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...| 3| 0| 0| 0|Family diner. Had...|2014-02-05 20:30:30|\n", - "|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...| 5| 1| 0| 1|Wow! Yummy, diff...|2015-01-04 00:01:03|\n", - "|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...| 4| 1| 0| 1|Cute interior and...|2017-01-14 20:54:15|\n", - "|JrIxlS1TzJ-iCu79u...|eUta8W_HdHMXPzLBB...|04UD14gamNjLY0IDY...| 1| 1| 2| 1|I am a long term ...|2015-09-23 23:10:31|\n", - "|6AxgBCNX_PNTOxmbR...|r3zeYsv1XFBRA4dJp...|gmjsEdUsKpj9Xxu6p...| 5| 0| 2| 0|Loved this tour! 
...|2015-01-03 23:21:18|\n", - "|_ZeMknuYdlQcUqng_...|yfFzsLmaWF2d4Sr0U...|LHSTtnW3YHCeUkRDG...| 5| 2| 0| 0|Amazingly amazing...|2015-08-07 02:29:16|\n", - "|ZKvDG2sBvHVdF5oBN...|wSTuiTk-sKNdcFypr...|B5XSoSG3SfvQGtKEG...| 3| 1| 1| 0|This easter inste...|2016-03-30 22:46:33|\n", - "|pUycOfUwM8vqX7KjR...|59MxRhNVhU9MYndMk...|gebiRewfieSdtt17P...| 3| 0| 0| 0|Had a party of 6 ...|2016-07-25 07:31:06|\n", - "|rGQRf8UafX7OTlMNN...|1WHRWwQmZOZDAhp2Q...|uMvVYRgGNXf5boolA...| 5| 2| 0| 0|My experience wit...|2015-06-21 14:48:06|\n", - "|l3Wk_mvAog6XANIuG...|ZbqSHbgCjzVAqaa7N...|EQ-TZ2eeD_E0BHuvo...| 4| 0| 0| 0|Locals recommende...|2015-08-19 14:31:45|\n", - "|XW_LfMv0fV21l9c6x...|9OAtfnWag-ajVxRbU...|lj-E32x9_FA7GmUrB...| 4| 0| 0| 0|Love going here f...|2014-06-27 22:44:01|\n", - "|8JFGBuHMoiNDyfcxu...|smOvOajNG0lS4Pq7d...|RZtGWDLCAtuipwaZ-...| 4| 0| 0| 0|Good food--loved ...|2009-10-14 19:57:14|\n", - "|UBp0zWyH60Hmw6Fsa...|4Uh27DgGzsp6PqrH9...|otQS34_MymijPTdNB...| 4| 0| 2| 0|The bun makes the...|2011-10-27 17:12:05|\n", - "|OAhBYw8IQ6wlfw1ow...|1C2lxzUo1Hyye4RFI...|BVndHaLihEYbr76Z0...| 5| 0| 0| 0|Great place for b...|2014-10-11 16:22:06|\n", - "|oyaMhzBSwfGgemSGu...|Dd1jQj7S-BFGqRbAp...|YtSqYv1Q_pOltsVPS...| 5| 0| 0| 0|Tremendous servic...|2013-06-24 11:21:25|\n", - "|LnGZB0fjfgeVDVz5I...|j2wlzrntrbKwyOcOi...|rBdG_23USc7DletfZ...| 4| 1| 0| 0|The hubby and I h...|2014-08-10 19:41:43|\n", - "|u2vzZaOqJ2feRshaa...|NDZvyYHTUWWu-kqgQ...|CLEWowfkj-wKYJlQD...| 5| 2| 0| 1|I go to blow bar ...|2016-03-07 00:02:18|\n", - "|Xs8Z8lmKkosqW5mw_...|IQsF3Rc6IgCzjVV9D...|eFvzHawVJofxSnD7T...| 5| 0| 0| 0|My absolute favor...|2014-11-12 15:30:27|\n", - "+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], - "source": [ - "# dropping the column _id\n", - "\n", - "df_review = df.drop('_id')\n", - "df_review.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "099pAj46DlZr", - "outputId": "360d863f-7ee4-4d7e-a7c4-e057584bcb8a" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \\n\\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. 
We usually opt for another diner or restaurant on the weekends, in order to be done quicker.\n" - ] - } - ], - "source": [ - "\n", - "# Print a valve in column \"text\"\n", - "for row in df_review.select(\"text\").limit(1).collect():\n", - " print(row.text)\n" - ] - }, - { - "cell_type": "code", - "source": [ - "# create two new columns by splitting date field to date and time, maybe convert the original column to a timestamp after trimming naming it date_trimmedand then, split and drop the date_trimmed\n", - "\n", - "from pyspark.sql.functions import to_timestamp, split\n", - "\n", - "# Assuming your date column is named 'date'\n", - "# Convert the 'date' column to timestamp after trimming and name it 'date_trimmed'\n", - "df_review = df_review.withColumn(\"date_trimmed\", to_timestamp(trim(col(\"date\")), \"yyyy-MM-dd HH:mm:ss\"))\n", - "\n", - "# Split the 'date_trimmed' column into 'date' and 'time' columns\n", - "df_review = df_review.withColumn(\"date\", split(col(\"date_trimmed\"), \" \").getItem(0))\\\n", - " .withColumn(\"time\", split(col(\"date_trimmed\"), \" \").getItem(1))\n", - "\n", - "# Drop the 'date_trimmed' column\n", - "df_review = df_review.drop(\"date_trimmed\")\n", - "\n", - "# Show the updated DataFrame\n", - "df_review.show()\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0qhbwD7zANO0", - "outputId": "662f162d-d59a-414b-d47f-ed62c22474cf" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+--------+\n", - "| review_id| user_id| business_id|stars|useful|funny|cool| text| date| time|\n", - "+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+--------+\n", - "|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...| 3| 0| 0| 0|If you decide to ...|2018-07-07|22:09:11|\n", - "|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...| 5| 1| 0| 1|I've taken a lot ...|2012-01-03|15:28:18|\n", - "|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...| 3| 0| 0| 0|Family diner. Had...|2014-02-05|20:30:30|\n", - "|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...| 5| 1| 0| 1|Wow! Yummy, diff...|2015-01-04|00:01:03|\n", - "|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...| 4| 1| 0| 1|Cute interior and...|2017-01-14|20:54:15|\n", - "|JrIxlS1TzJ-iCu79u...|eUta8W_HdHMXPzLBB...|04UD14gamNjLY0IDY...| 1| 1| 2| 1|I am a long term ...|2015-09-23|23:10:31|\n", - "|6AxgBCNX_PNTOxmbR...|r3zeYsv1XFBRA4dJp...|gmjsEdUsKpj9Xxu6p...| 5| 0| 2| 0|Loved this tour! 
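Since `date_trimmed` in the cell above is already a proper timestamp, the string re-split can be expressed more directly with `to_date`/`date_format`. A sketch that produces the same `date` and `time` columns, starting from `df_review` as it was before the split:

```python
from pyspark.sql.functions import col, date_format, to_date, to_timestamp, trim

# Parse once, then format the parts out of the timestamp instead of
# re-splitting its string form; no intermediate column to drop.
ts = to_timestamp(trim(col("date")), "yyyy-MM-dd HH:mm:ss")
df_review = (df_review
             .withColumn("time", date_format(ts, "HH:mm:ss"))  # add time first...
             .withColumn("date", to_date(ts)))  # ...then overwrite the raw date
```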
...|2015-01-03|23:21:18|\n", - "|_ZeMknuYdlQcUqng_...|yfFzsLmaWF2d4Sr0U...|LHSTtnW3YHCeUkRDG...| 5| 2| 0| 0|Amazingly amazing...|2015-08-07|02:29:16|\n", - "|ZKvDG2sBvHVdF5oBN...|wSTuiTk-sKNdcFypr...|B5XSoSG3SfvQGtKEG...| 3| 1| 1| 0|This easter inste...|2016-03-30|22:46:33|\n", - "|pUycOfUwM8vqX7KjR...|59MxRhNVhU9MYndMk...|gebiRewfieSdtt17P...| 3| 0| 0| 0|Had a party of 6 ...|2016-07-25|07:31:06|\n", - "|rGQRf8UafX7OTlMNN...|1WHRWwQmZOZDAhp2Q...|uMvVYRgGNXf5boolA...| 5| 2| 0| 0|My experience wit...|2015-06-21|14:48:06|\n", - "|l3Wk_mvAog6XANIuG...|ZbqSHbgCjzVAqaa7N...|EQ-TZ2eeD_E0BHuvo...| 4| 0| 0| 0|Locals recommende...|2015-08-19|14:31:45|\n", - "|XW_LfMv0fV21l9c6x...|9OAtfnWag-ajVxRbU...|lj-E32x9_FA7GmUrB...| 4| 0| 0| 0|Love going here f...|2014-06-27|22:44:01|\n", - "|8JFGBuHMoiNDyfcxu...|smOvOajNG0lS4Pq7d...|RZtGWDLCAtuipwaZ-...| 4| 0| 0| 0|Good food--loved ...|2009-10-14|19:57:14|\n", - "|UBp0zWyH60Hmw6Fsa...|4Uh27DgGzsp6PqrH9...|otQS34_MymijPTdNB...| 4| 0| 2| 0|The bun makes the...|2011-10-27|17:12:05|\n", - "|OAhBYw8IQ6wlfw1ow...|1C2lxzUo1Hyye4RFI...|BVndHaLihEYbr76Z0...| 5| 0| 0| 0|Great place for b...|2014-10-11|16:22:06|\n", - "|oyaMhzBSwfGgemSGu...|Dd1jQj7S-BFGqRbAp...|YtSqYv1Q_pOltsVPS...| 5| 0| 0| 0|Tremendous servic...|2013-06-24|11:21:25|\n", - "|LnGZB0fjfgeVDVz5I...|j2wlzrntrbKwyOcOi...|rBdG_23USc7DletfZ...| 4| 1| 0| 0|The hubby and I h...|2014-08-10|19:41:43|\n", - "|u2vzZaOqJ2feRshaa...|NDZvyYHTUWWu-kqgQ...|CLEWowfkj-wKYJlQD...| 5| 2| 0| 1|I go to blow bar ...|2016-03-07|00:02:18|\n", - "|Xs8Z8lmKkosqW5mw_...|IQsF3Rc6IgCzjVV9D...|eFvzHawVJofxSnD7T...| 5| 0| 0| 0|My absolute favor...|2014-11-12|15:30:27|\n", - "+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+--------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# write it as a csv in /content/drive/MyDrive/ProcessedCSV named review.csv\n", - "\n", - "# Write the DataFrame to a CSV file in Google Drive\n", - "df_review.write.csv(\"/content/drive/MyDrive/ProcessedCSV/review.csv\", header=True)\n" - ], - "metadata": { - "id": "qq4vxIBRBPqV" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rcHyrKs1gYUb" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [], - "mount_file_id": "14RNdYGKIo0mYPAZwvH4rkcNWGMBtmxXH", - "authorship_tag": "ABX9TyODofy+zMMyMvRSPGbOC+/R" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/DataprocessingP2_Business_html.html b/DataprocessingP2_Business_html.html deleted file mode 100644 index 5a5dc13..0000000 --- a/DataprocessingP2_Business_html.html +++ /dev/null @@ -1,15002 +0,0 @@ - - - - - -DataprocessingP2_Business - - - - - - - - - - - - - - - - - - - - -
-[rendered notebook HTML omitted: 15,002 deleted lines]
diff --git a/DataprocessingP4_Tip_html.html b/DataprocessingP4_Tip_html.html deleted file mode 100644 index 73e027c..0000000 --- a/DataprocessingP4_Tip_html.html +++ /dev/null @@ -1,15130 +0,0 @@ -DataprocessingP4_Tip
-[rendered notebook HTML omitted: 15,130 deleted lines]
- - - - - - - - - diff --git a/DataprocessingP5_Checkin.ipynb b/DataprocessingP5_Checkin.ipynb deleted file mode 100644 index abc1c26..0000000 --- a/DataprocessingP5_Checkin.ipynb +++ /dev/null @@ -1,463 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9Wt7ZQNdZANN", - "outputId": "8f4e24ef-a38e-4a52-ee74-8b7db50c23c1" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting pyspark\n", - " Downloading pyspark-3.5.3.tar.gz (317.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.3/317.3 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n", - "Building wheels for collected packages: pyspark\n", - " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=090ea5167edba2965240d229c0959fffe9d7bda3cd34b37ea12d3cdc114f11e2\n", - " Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab\n", - "Successfully built pyspark\n", - "Installing collected packages: pyspark\n", - "Successfully installed pyspark-3.5.3\n" - ] - } - ], - "source": [ - "!pip install pyspark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6wGKWU-iZT4y", - "outputId": "4231dd2b-f506-4aa5-a4b6-425abcf23b5a" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting findspark\n", - " Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)\n", - "Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)\n", - "Installing collected packages: findspark\n", - "Successfully installed findspark-2.0.1\n" - ] - } - ], - "source": [ - "!pip install findspark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9fyG_DI8ZJMq", - "outputId": "4a914dfd-539a-4665-b97f-63434934281c" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: pyarrow in /usr/local/lib/python3.10/dist-packages (16.1.0)\n", - "Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.26.4)\n" - ] - } - ], - "source": [ - "\n", - "!pip install pyarrow\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "glBfjrR8Zrzi" - }, - "outputs": [], - "source": [ - "import findspark\n", - "findspark.init()\n", - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import col\n", - "from pyspark.sql.functions import *\n", - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1MOwlc9cZv1R" - }, - "outputs": [], - "source": [ - "# Initialize SparkSession\n", - "spark = SparkSession.builder.appName(\"CSVReader\").getOrCreate()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - 
"base_uri": "https://localhost:8080/" - }, - "id": "VMA6hRI_ZfLb", - "outputId": "25463c3b-2983-4f73-bf46-c21bc422d057" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Execution started at: 2024-10-11 23:04:19\n", - "+--------------------+--------------------+--------------------+\n", - "| _id| business_id| date|\n", - "+--------------------+--------------------+--------------------+\n", - "|66ea4866e59c7c5b6...|---kPU91CF4Lq2-Wl...|2020-03-13 21:10:...|\n", - "|66ea4866e59c7c5b6...|--0iUa4sNDFiZFrAd...|2010-09-13 21:43:...|\n", - "|66ea4866e59c7c5b6...|--30_8IhuyMHbSOcN...|2013-06-14 23:29:...|\n", - "|66ea4866e59c7c5b6...|--7PUidqRWpRSpXeb...|2011-02-15 17:12:...|\n", - "|66ea4866e59c7c5b6...|--7jw19RH9JKXgFoh...|2014-04-21 20:42:...|\n", - "|66ea4866e59c7c5b6...|--8IbOsAAxjKRoYsB...|2015-06-06 01:03:...|\n", - "|66ea4866e59c7c5b6...|--9osgUCSDUWUkoTL...|2015-06-13 02:00:...|\n", - "|66ea4866e59c7c5b6...|--ARBQr1WMsTWiwOK...|2014-12-12 00:44:...|\n", - "|66ea4866e59c7c5b6...|--FWWsIwxRwuw9vIM...|2010-09-11 16:28:...|\n", - "|66ea4866e59c7c5b6...|--FcbSxK1AoEtEAxO...|2017-08-18 19:43:...|\n", - "|66ea4866e59c7c5b6...|--LC8cIrALInl2vyo...|2017-01-12 19:10:...|\n", - "|66ea4866e59c7c5b6...|--MbOh2O1pATkXa7x...|2013-04-21 01:52:...|\n", - "|66ea4866e59c7c5b6...|--N9yp3ZWqQIm7DqK...|2012-10-06 20:46:...|\n", - "|66ea4866e59c7c5b6...|--O3ip9NpXTKD4oBS...|2010-04-17 21:07:...|\n", - "|66ea4866e59c7c5b6...|--OS_I7dnABrXvRCC...| 2018-05-11 18:23:36|\n", - "|66ea4866e59c7c5b6...|--S43ruInmIsGrnnk...|2010-08-29 01:17:...|\n", - "|66ea4866e59c7c5b6...|--SJXpAa0E-GCp2sm...|2014-04-06 22:23:...|\n", - "|66ea4866e59c7c5b6...|--Sd93OFWITqDHifM...|2013-01-09 17:42:...|\n", - "|66ea4866e59c7c5b6...|--ZVrH2X2QXBFdCil...|2010-08-12 18:21:...|\n", - "|66ea4866e59c7c5b6...|--ZWv8kGlM2YL58uK...|2010-10-13 18:41:...|\n", - "+--------------------+--------------------+--------------------+\n", - "only showing top 20 rows\n", - "\n", - "Execution ended at: 2024-10-11 23:04:40\n", - "Execution time: 20.19 seconds\n" - ] - } - ], - "source": [ - "# Start time\n", - "start_time = time.time()\n", - "print(f\"Execution started at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}\")\n", - "\n", - "# Read the CSV data\n", - "df = spark.read.csv(\"/content/drive/MyDrive/CSV/yelp_checkin.csv\", header=True, inferSchema=True)\n", - "\n", - "# Display the entire dataframe (use with caution for large datasets)\n", - "df.show()\n", - "\n", - "# End time\n", - "end_time = time.time()\n", - "print(f\"Execution ended at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}\")\n", - "\n", - "# Calculate and print the execution time\n", - "execution_time = end_time - start_time\n", - "print(f\"Execution time: {execution_time:.2f} seconds\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hmKYkkpGa2ls", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "0e36f27b-c543-400f-8b05-7af5d2e52b96" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Total records in the dataframe: 131930\n" - ] - } - ], - "source": [ - "# total records in the dataframe\n", - "\n", - "total_records = df.count()\n", - "print(f\"Total records in the dataframe: {total_records}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SAQNWOozaX5O", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "59ad368a-5ad4-42ec-a218-2a2eae1411d9" - }, - 
"outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+--------------------+--------------------+\n", - "| business_id| date|\n", - "+--------------------+--------------------+\n", - "|---kPU91CF4Lq2-Wl...|2020-03-13 21:10:...|\n", - "|--0iUa4sNDFiZFrAd...|2010-09-13 21:43:...|\n", - "|--30_8IhuyMHbSOcN...|2013-06-14 23:29:...|\n", - "|--7PUidqRWpRSpXeb...|2011-02-15 17:12:...|\n", - "|--7jw19RH9JKXgFoh...|2014-04-21 20:42:...|\n", - "|--8IbOsAAxjKRoYsB...|2015-06-06 01:03:...|\n", - "|--9osgUCSDUWUkoTL...|2015-06-13 02:00:...|\n", - "|--ARBQr1WMsTWiwOK...|2014-12-12 00:44:...|\n", - "|--FWWsIwxRwuw9vIM...|2010-09-11 16:28:...|\n", - "|--FcbSxK1AoEtEAxO...|2017-08-18 19:43:...|\n", - "|--LC8cIrALInl2vyo...|2017-01-12 19:10:...|\n", - "|--MbOh2O1pATkXa7x...|2013-04-21 01:52:...|\n", - "|--N9yp3ZWqQIm7DqK...|2012-10-06 20:46:...|\n", - "|--O3ip9NpXTKD4oBS...|2010-04-17 21:07:...|\n", - "|--OS_I7dnABrXvRCC...| 2018-05-11 18:23:36|\n", - "|--S43ruInmIsGrnnk...|2010-08-29 01:17:...|\n", - "|--SJXpAa0E-GCp2sm...|2014-04-06 22:23:...|\n", - "|--Sd93OFWITqDHifM...|2013-01-09 17:42:...|\n", - "|--ZVrH2X2QXBFdCil...|2010-08-12 18:21:...|\n", - "|--ZWv8kGlM2YL58uK...|2010-10-13 18:41:...|\n", - "+--------------------+--------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], - "source": [ - "# dropping the column _id\n", - "\n", - "df_checkin = df.drop('_id')\n", - "df_checkin.show()" - ] - }, - { - "cell_type": "code", - "source": [ - "# print one value in date field fully\n", - "\n", - "from pyspark.sql.functions import col\n", - "\n", - "# Assuming 'date' is the name of your date field\n", - "df_checkin.select(col(\"date\")).first()[0]\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 53 - }, - "id": "6VHRzkWg1ayW", - "outputId": "ec8aaa59-87ea-489f-8153-b83ecd40b94d" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "'2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020-07-24 22:42:27, 2020-10-24 21:36:13, 2020-12-09 21:23:33, 2021-01-20 17:34:57, 2021-04-30 21:02:03, 2021-05-25 21:16:54, 2021-08-06 21:08:08, 2021-10-02 15:15:42, 2021-11-11 16:23:50'" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - } - }, - "metadata": {}, - "execution_count": 30 - } - ] - }, - { - "cell_type": "code", - "source": [ - "# create and additional column total_checkin_count for each row, where the value is calculated by counting all the dates in the date field.\n", - "\n", - "from pyspark.sql.functions import size, split\n", - "\n", - "# Assuming 'date' is the name of your date field\n", - "df_checkin_with_count = df_checkin.withColumn(\"total_checkin_count\", size(split(col(\"date\"), \",\")))\n", - "\n", - "df_checkin_with_count.show()\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "v6_mHg9l1o7Z", - "outputId": "e7d748c0-5d1a-4b1d-8913-bf462ebfd654" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+--------------------+--------------------+-------------------+\n", - "| business_id| date|total_checkin_count|\n", - "+--------------------+--------------------+-------------------+\n", - "|---kPU91CF4Lq2-Wl...|2020-03-13 21:10:...| 11|\n", - "|--0iUa4sNDFiZFrAd...|2010-09-13 21:43:...| 10|\n", - "|--30_8IhuyMHbSOcN...|2013-06-14 23:29:...| 2|\n", - "|--7PUidqRWpRSpXeb...|2011-02-15 17:12:...| 10|\n", - "|--7jw19RH9JKXgFoh...|2014-04-21 
20:42:...| 26|\n", - "|--8IbOsAAxjKRoYsB...|2015-06-06 01:03:...| 32|\n", - "|--9osgUCSDUWUkoTL...|2015-06-13 02:00:...| 24|\n", - "|--ARBQr1WMsTWiwOK...|2014-12-12 00:44:...| 34|\n", - "|--FWWsIwxRwuw9vIM...|2010-09-11 16:28:...| 7|\n", - "|--FcbSxK1AoEtEAxO...|2017-08-18 19:43:...| 82|\n", - "|--LC8cIrALInl2vyo...|2017-01-12 19:10:...| 7|\n", - "|--MbOh2O1pATkXa7x...|2013-04-21 01:52:...| 103|\n", - "|--N9yp3ZWqQIm7DqK...|2012-10-06 20:46:...| 7|\n", - "|--O3ip9NpXTKD4oBS...|2010-04-17 21:07:...| 456|\n", - "|--OS_I7dnABrXvRCC...| 2018-05-11 18:23:36| 1|\n", - "|--S43ruInmIsGrnnk...|2010-08-29 01:17:...| 309|\n", - "|--SJXpAa0E-GCp2sm...|2014-04-06 22:23:...| 45|\n", - "|--Sd93OFWITqDHifM...|2013-01-09 17:42:...| 48|\n", - "|--ZVrH2X2QXBFdCil...|2010-08-12 18:21:...| 67|\n", - "|--ZWv8kGlM2YL58uK...|2010-10-13 18:41:...| 13|\n", - "+--------------------+--------------------+-------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# split the date column into 2 separate columns date and time. first trim the whole column and then split to avoid errors\n", - "\n", - "from pyspark.sql.functions import split, trim\n", - "\n", - "# Trim whitespace from the date column\n", - "df_checkin = df_checkin_with_count.withColumn(\"date_trimmed\", trim(df_checkin_with_count[\"date\"]))\n", - "\n", - "# Split the trimmed date column into date and time columns\n", - "df_checkin = df_checkin.withColumn(\"date\", split(df_checkin[\"date_trimmed\"], \" \")[0]) \\\n", - " .withColumn(\"time\", split(df_checkin[\"date_trimmed\"], \" \")[1])\n", - "\n", - "df_checkin = df_checkin.drop(\"date_trimmed\")\n", - "\n", - "df_checkin.show()\n" - ], - "metadata": { - "id": "d5iFoRYpJWsH", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "8f97f86f-7c0f-4182-fca2-5615371ce60e" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+--------------------+----------+-------------------+---------+\n", - "| business_id| date|total_checkin_count| time|\n", - "+--------------------+----------+-------------------+---------+\n", - "|---kPU91CF4Lq2-Wl...|2020-03-13| 11|21:10:56,|\n", - "|--0iUa4sNDFiZFrAd...|2010-09-13| 10|21:43:09,|\n", - "|--30_8IhuyMHbSOcN...|2013-06-14| 2|23:29:17,|\n", - "|--7PUidqRWpRSpXeb...|2011-02-15| 10|17:12:00,|\n", - "|--7jw19RH9JKXgFoh...|2014-04-21| 26|20:42:11,|\n", - "|--8IbOsAAxjKRoYsB...|2015-06-06| 32|01:03:19,|\n", - "|--9osgUCSDUWUkoTL...|2015-06-13| 24|02:00:57,|\n", - "|--ARBQr1WMsTWiwOK...|2014-12-12| 34|00:44:23,|\n", - "|--FWWsIwxRwuw9vIM...|2010-09-11| 7|16:28:39,|\n", - "|--FcbSxK1AoEtEAxO...|2017-08-18| 82|19:43:50,|\n", - "|--LC8cIrALInl2vyo...|2017-01-12| 7|19:10:02,|\n", - "|--MbOh2O1pATkXa7x...|2013-04-21| 103|01:52:06,|\n", - "|--N9yp3ZWqQIm7DqK...|2012-10-06| 7|20:46:47,|\n", - "|--O3ip9NpXTKD4oBS...|2010-04-17| 456|21:07:32,|\n", - "|--OS_I7dnABrXvRCC...|2018-05-11| 1| 18:23:36|\n", - "|--S43ruInmIsGrnnk...|2010-08-29| 309|01:17:44,|\n", - "|--SJXpAa0E-GCp2sm...|2014-04-06| 45|22:23:56,|\n", - "|--Sd93OFWITqDHifM...|2013-01-09| 48|17:42:28,|\n", - "|--ZVrH2X2QXBFdCil...|2010-08-12| 67|18:21:29,|\n", - "|--ZWv8kGlM2YL58uK...|2010-10-13| 13|18:41:45,|\n", - "+--------------------+----------+-------------------+---------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# write this as a csv in /content/drive/MyDrive/ProcessedCSV name it checkin.csv\n", - "\n", - "# Write the 
DataFrame to a CSV file in your Google Drive\n", - "df_checkin.write.csv(\"/content/drive/MyDrive/ProcessedCSV/checkin.csv\", header=True)\n" - ], - "metadata": { - "id": "iY6X6MUk1Muj" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "bUeOQ24K3lVH" - }, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "colab": { - "provenance": [], - "mount_file_id": "1OoisQonpr6QO-IRp5ko-LsDM4CyLyPWV", - "authorship_tag": "ABX9TyOleZaHDHnDe/bA+QJbhm6F", - "include_colab_link": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/Merge_CSVs.ipynb b/Merge_CSVs.ipynb deleted file mode 100644 index f91346f..0000000 --- a/Merge_CSVs.ipynb +++ /dev/null @@ -1,359 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9Wt7ZQNdZANN", - "outputId": "e92f8c94-1209-4d73-ba8e-2be0a7ceaa8d" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: pyspark in /usr/local/lib/python3.10/dist-packages (3.5.3)\n", - "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n" - ] - } - ], - "source": [ - "!pip install pyspark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6wGKWU-iZT4y", - "outputId": "c2d21205-22bf-46ee-b43b-4b61924fa76d" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: findspark in /usr/local/lib/python3.10/dist-packages (2.0.1)\n" - ] - } - ], - "source": [ - "!pip install findspark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9fyG_DI8ZJMq", - "outputId": "189994db-d4ec-42f8-e6f8-cf73284ec2a5" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: pyarrow in /usr/local/lib/python3.10/dist-packages (16.1.0)\n", - "Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.26.4)\n" - ] - } - ], - "source": [ - "\n", - "!pip install pyarrow\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "glBfjrR8Zrzi" - }, - "outputs": [], - "source": [ - "import findspark\n", - "findspark.init()\n", - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import col\n", - "from pyspark.sql.functions import *\n", - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gfR_NVF041Nm" - }, - "outputs": [], - "source": [ - "# from google.colab import drive\n", - "# drive.mount('/content/drive')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1MOwlc9cZv1R" - }, - "outputs": [], - "source": [ - "# Initialize SparkSession\n", - "spark = SparkSession.builder.appName(\"CSVReader\").getOrCreate()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "R_asbSHFCiPO" - }, - 
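One note on the check-in notebook just above: its `time` column keeps trailing commas (e.g. `21:10:56,`) because each raw `date` value is a comma-separated list of timestamps, so splitting on a space grabs only the first timestamp plus the comma that follows it. A sketch of a normalization that avoids this, assuming `df_checkin_with_count` from that notebook; note it changes the grain to one row per check-in rather than one row per business:

```python
from pyspark.sql.functions import col, explode, split

# Explode the comma-separated timestamp list into one row per check-in;
# the space split then yields clean date/time parts with no stray commas.
df_checkin_long = (
    df_checkin_with_count
    .withColumn("checkin_ts", explode(split(col("date"), ",\\s*")))
    .withColumn("checkin_date", split(col("checkin_ts"), " ")[0])
    .withColumn("checkin_time", split(col("checkin_ts"), " ")[1])
    .drop("date", "checkin_ts")
)
```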
"outputs": [], - "source": [ - "# prompt: read /content/drive/MyDrive/ProcessedCSV/business.csv, make sure everythings in memory, also print first 2 rec\n", - "\n", - "df_business = spark.read.csv(\"/content/drive/MyDrive/ProcessedCSV/business.csv\", header=True, inferSchema=True)\n", - "df_business = df_business.withColumnRenamed(\"stars\",\"stars_bus\" ).withColumnRenamed(\"name\",\"bus_name\" ).withColumnRenamed(\"review_count\",\"bus_review_count\").cache()\n", - "# df_business.show(2)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sY291bmaDIcw" - }, - "outputs": [], - "source": [ - "df_user = spark.read.csv(\"/content/drive/MyDrive/ProcessedCSV/user.csv\", header=True, inferSchema=True)\n", - "df_user = df_user.withColumnRenamed(\"cool\",\"cool_usr\" ).withColumnRenamed(\"funny\",\"funny_usr\" ).withColumnRenamed(\"useful\",\"useful_usr\").withColumnRenamed(\"name\",\"usr_name\")\n", - "df_user = df_user.drop(\"date\", \"time\").cache()\n", - "# df_user.show(2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9WzMux-1DaXo" - }, - "outputs": [], - "source": [ - "df_review = spark.read.csv(\"/content/drive/MyDrive/ProcessedCSV/review.csv\", header=True, inferSchema=True)\n", - "df_review = df_review.withColumnRenamed(\"cool\",\"cool_rev\" ).withColumnRenamed(\"funny\",\"funny_rev\" ).withColumnRenamed(\"useful\",\"useful_rev\" ).withColumnRenamed(\"text\",\"rev_text\" ).withColumnRenamed(\"stars\",\"stars_rev\")\n", - "df_review = df_review.drop(\"date\", \"time\").cache()\n", - "# df_review.show(2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1oATjKulDiL6" - }, - "outputs": [], - "source": [ - "df_tip = spark.read.csv(\"/content/drive/MyDrive/ProcessedCSV/tip.csv\", header=True, inferSchema=True)\n", - "df_tip = df_tip.withColumnRenamed(\"text\",\"tip_text\" )\n", - "df_tip = df_tip.drop(\"date\", \"time\").cache()\n", - "# df_tip.show(2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7JUzafl2Douq" - }, - "outputs": [], - "source": [ - "df_checkin = spark.read.csv(\"/content/drive/MyDrive/ProcessedCSV/checkin.csv\", header=True, inferSchema=True)\n", - "df_checkin = df_checkin.drop(\"date\", \"time\").cache()\n", - "# df_checkin.show(2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RBcK1ihoDu7o" - }, - "outputs": [], - "source": [ - "# prompt: I need to merge all this data frame to a simgle one, business has business_id, user has user_id, review has review_id, user_id, and business_id, tip has user_id and business_id , checkin has business_id\n", - "\n", - "# Merge review with business and user\n", - "df_merged_rev_bus = df_review.join(df_business, on='business_id', how='inner')\n", - "df_business.unpersist(blocking=True)\n", - "df_review.unpersist(blocking=True)\n", - "\n", - "#writing as a parquet file\n", - "df_merged_rev_bus.write.mode(\"overwrite\").parquet(\"/content/drive/MyDrive/parquetfiles/merged_rev_bus.parquet\")\n", - "\n", - "#reading the parquet saved before\n", - "df_merged_rev_bus = spark.read.parquet(\"/content/drive/MyDrive/parquetfiles/merged_rev_bus.parquet\")\n", - "#joining the user to this\n", - "df_merged_rev_bus_usr = df_merged_rev_bus.join(df_user, on='user_id', how='inner')\n", - "df_user.unpersist(blocking=True)\n", - "# writing the new parquet\n", - 
"df_merged_rev_bus_usr.write.mode(\"overwrite\").parquet(\"/content/drive/MyDrive/parquetfiles/merged_rev_bus_usr.parquet\")\n", - "\n", - "# reading the parquet\n", - "df_merged_rev_bus_usr = spark.read.parquet(\"/content/drive/MyDrive/parquetfiles/merged_rev_bus_usr.parquet\")\n", - "# Merge tip with the merged dataframe\n", - "df_merged_rev_bus_usr_tip = df_merged_rev_bus_usr.join(df_tip, on=['user_id', 'business_id'], how='left')\n", - "df_tip.unpersist(blocking=True)\n", - "# writing the new parquet\n", - "df_merged_rev_bus_usr_tip.write.mode(\"overwrite\").parquet(\"/content/drive/MyDrive/parquetfiles/merged_rev_bus_usr_tip.parquet\")\n", - "\n", - "#reading the parquet\n", - "df_merged_rev_bus_usr_tip = spark.read.parquet(\"/content/drive/MyDrive/parquetfiles/merged_rev_bus_usr_tip.parquet\")\n", - "# Merge checkin with the merged dataframe\n", - "df_merged_rev_bus_usr_tip_checkin = df_merged_rev_bus_usr_tip.join(df_checkin, on='business_id', how='left').cache()\n", - "df_checkin.unpersist(blocking=True)\n", - "# writing the new parquet\n", - "df_merged_rev_bus_usr_tip_checkin.write.mode(\"overwrite\").parquet(\"/content/drive/MyDrive/parquetfiles/finalmerged.parquet\")\n", - "\n", - "# df_merged.show(2)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b_-aq2j8coxd" - }, - "outputs": [], - "source": [ - "# prompt: write merged as a parquet file\n", - "# Initialize SparkSession\n", - "\n", - "# df_merged.write.mode(\"overwrite\").parquet(\"/content/drive/MyDrive/mergedcsv/merged_data.parquet\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DhqcBvFGjdBM" - }, - "outputs": [], - "source": [ - "# prompt: read and display 2 rows of the parquet\n", - "\n", - "# df_parquet = spark.read.parquet(\"/content/drive/MyDrive/ProcessedCSV/merged_data_parquet.parquet\")\n", - "# df_parquet.show(2)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "szfHJcmDFx8R" - }, - "outputs": [], - "source": [ - "# prompt: write this merged df in /content/drive/MyDrive/ProcessedCSV named merged.csv\n", - "\n", - "df_merged.write.mode(\"overwrite\").csv(\"/content/drive/MyDrive/ProcessedCSV/merged.csv\", header=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "TMEdfGbcHAjh", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "4248e493-ad46-4d22-e655-060aaeea31a2" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - 
"+--------------------+--------------------+--------------------+---------+----------+---------+--------+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+---------+----------------+-------+----------------------------+--------------------+-----+-------------------------------------+------------+-------------+---------------+--------------+------------+--------------+----------------------+---------------------------------+--------------------+-----------------------------+------------------------------+-----------------+---------------+--------------------------+-------------------------------+--------------------+-------------------------+----------------+----------------------------------+----------------------+------------+------------------+----------------------+----------------------------+--------------------+----------------------------------+-----------------------------------+--------------------+----------+---------------------+----------------------+---------------------------------+------------------+--------------------+-------------------------+---------------------------+---------------------+---------------+------------------+----------------------+----------------------------+----------------------+------------------------------------+----------------------+------------------------------+---------+------------+----------+---------+--------+--------------------+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+-------------+-----------+---------+----------------+-------------------+\n", - "| business_id| user_id| review_id|stars_rev|useful_rev|funny_rev|cool_rev| rev_text| bus_name| address| city|state|postal_code| latitude| longitude|stars_bus|bus_review_count|is_open|attributes.ByAppointmentOnly| categories|hours|attributes.BusinessAcceptsCreditCards|hours.Monday|hours.Tuesday|hours.Wednesday|hours.Thursday|hours.Friday|hours.Saturday|attributes.BikeParking|attributes.RestaurantsPriceRange2|attributes.CoatCheck|attributes.RestaurantsTakeOut|attributes.RestaurantsDelivery|attributes.Caters|attributes.WiFi|attributes.BusinessParking|attributes.WheelchairAccessible|attributes.HappyHour|attributes.OutdoorSeating|attributes.HasTV|attributes.RestaurantsReservations|attributes.DogsAllowed|hours.Sunday|attributes.Alcohol|attributes.GoodForKids|attributes.RestaurantsAttire| attributes.Ambience|attributes.RestaurantsTableService|attributes.RestaurantsGoodForGroups|attributes.DriveThru|attributes|attributes.NoiseLevel|attributes.GoodForMeal|attributes.BusinessAcceptsBitcoin|attributes.Smoking| attributes.Music|attributes.GoodForDancing|attributes.AcceptsInsurance|attributes.BestNights|attributes.BYOB|attributes.Corkage|attributes.BYOBCorkage|attributes.HairSpecializesIn|attributes.Open24Hours|attributes.RestaurantsCounterService|attributes.AgesAllowed|attributes.DietaryRestrictions| usr_name|review_count|useful_usr|funny_usr|cool_usr| elite| friends|fans|average_stars|compliment_hot|compliment_more|compliment_profile|compliment_cute|compliment_list|compliment_note|compliment_plain|compliment_cool|compliment_funny|compliment_writer|compliment_photos|total_friends|elite_count| tip_text|compliment_count|total_checkin_count|\n", - 
"+--------------------+--------------------+--------------------+---------+----------+---------+--------+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+---------+----------------+-------+----------------------------+--------------------+-----+-------------------------------------+------------+-------------+---------------+--------------+------------+--------------+----------------------+---------------------------------+--------------------+-----------------------------+------------------------------+-----------------+---------------+--------------------------+-------------------------------+--------------------+-------------------------+----------------+----------------------------------+----------------------+------------+------------------+----------------------+----------------------------+--------------------+----------------------------------+-----------------------------------+--------------------+----------+---------------------+----------------------+---------------------------------+------------------+--------------------+-------------------------+---------------------------+---------------------+---------------+------------------+----------------------+----------------------------+----------------------+------------------------------------+----------------------+------------------------------+---------+------------+----------+---------+--------+--------------------+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+-------------+-----------+---------+----------------+-------------------+\n", - "|QHWYlmVbLC3K6eglW...|---2PmXbF47D870st...|yA_38sOvL7DiBR4_E...| 5| 0| 0| 0|Dined here last n...| Datz| 2616 S MacDill Ave| Tampa| FL| 33629| 27.921942| -82.4932535| 4.0| 3260| 1| False|Burgers, Bars, Re...| NULL| True| 0:0-0:0| 7:0-22:0| 7:0-22:0| 7:0-22:0| 7:0-23:0| 8:30-23:0| True| 2| NULL| True| True| True| u'free'| {'garage': False,...| True| True| True| True| True| True| 8:30-21:0| u'full_bar'| True| u'casual'|{'touristy': Fals...| True| True| True| NULL| u'average'| {'dessert': None,...| False| NULL|{'dj': False, 'ba...| false| NULL| {'monday': False,...| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Susan| 36| 63| 4| 36| NULL|o9QkuHIyxmqbORF7P...| 3| 4.98| 0| 0| 0| 0| 0| 0| 2| 0| 0| 2| 0| 420| 0| NULL| NULL| 6820|\n", - "|qAnKc-pentc9UUQvt...|--9h63HDrtX01Wg3t...|sh8ReL9iu7YUAxpPf...| 5| 0| 0| 0|The staff are all...|NextCare Urgent Care|9525 E Old Spanis...| Tucson| AZ| 85748| 32.2083419| -110.7888894| 2.5| 80| 1| False|Medical Centers, ...| NULL| True| 8:0-0:0| 8:0-0:0| 8:0-0:0| 8:0-16:0| 8:0-0:0| 8:0-0:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 8:0-0:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Natalie| 4| 1| 0| 0| NULL| None| 0| 4.0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| NULL| NULL| 52|\n", - "|4ZZR2Ykk4jo7oiGgQ...|--U6F4iI3ABK6IVCC...|Wzeye8TCluBjSsCdM...| 5| 3| 0| 0|Great, new equip...| LA Fitness|9350 E Golf Links Rd| Tucson| AZ| 85730| 32.1901098| -110.7928939| 3.5| 65| 1| False|Gyms, Trainers, F...| NULL| True| 0:0-0:0| 5:0-23:0| 5:0-23:0| 5:0-23:0| 8:0-18:0| 8:0-18:0| True| NULL| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| NULL| NULL| NULL| NULL| NULL| NULL| 8:0-20:0| 
NULL| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Matt| 31| 44| 9| 11| NULL|N5YRo5ax4X7pcqmvN...| 2| 3.97| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 84| 0| NULL| NULL| 1559|\n", - "|9PZxjhTIU7OgPIzuG...|--cjT1ICjm_ajiwSK...|FN8qUbNlb9ulfoGtn...| 4| 0| 0| 0|\"The decor in thi...| El Vez| 121 S 13th St| Philadelphia| PA| 19107|39.9497020026|-75.1617702842| 4.0| 3187| 1| NULL|Lounges, Bars, Ni...| NULL| True| 0:0-0:0| 12:0-22:0| 12:0-22:0| 12:0-22:0| 16:0-0:30| 16:0-22:0| True| 2| True| True| True| False| u'no'| {'garage': False,...| NULL| True| True| False| True| False| 12:0-22:0| 'full_bar'| True| 'casual'|{'touristy': Fals...| True| True| None| NULL| u'loud'| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| False| 'no'| NULL| NULL| NULL| NULL| NULL| Shayla| 7| 4| 2| 1| NULL| None| 0| 3.71| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| NULL| NULL| 5354|\n", - "|ZbjlLNcnTpemOsqX9...|--m6DE1KNxjSyL6LF...|8WQkufjVXtqjuHJ3f...| 3| 0| 0| 0|The food was real...|Mad Beach Craft B...| 12945 Village Blvd| Madeira Beach| FL| 33708|27.7863139549|-82.7832909586| 4.0| 526| 1| False|Nightlife, Brewer...| NULL| True| 0:0-0:0| 12:0-21:0| 12:0-21:0| 12:0-21:0| 11:0-21:0| 11:0-23:0| True| 2| NULL| True| True| False| 'free'| {'garage': True, ...| True| False| True| True| False| True| 11:0-21:0| 'beer_and_wine'| True| 'casual'|{'touristy': Fals...| True| True| True| NULL| 'average'| {'dessert': False...| False| NULL| NULL| NULL| NULL| NULL| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Steavi| 29| 12| 0| 2| NULL| None| 0| 3.97| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| NULL| NULL| 695|\n", - "|wbHYWJ9BHAIBvD-J0...|--yvzhW9sm-4DGw72...|pGc6oi-9Tam9Ey40f...| 1| 1| 0| 0|My wife and I joi...| Kennedy Fitness| 1432 Marlton Pike E| Cherry Hill| NJ| 08034| 39.9115| -74.99103| 2.0| 58| 0| True|Active Life, Fitn...| NULL| True| 5:0-22:0| 5:0-22:0| 5:0-22:0| 5:0-22:0| 5:0-22:0| 7:0-19:0| True| NULL| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| True| NULL| NULL| NULL| NULL| False| 8:0-17:0| NULL| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Perry| 5| 1| 1| 0| NULL| None| 0| 1.2| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| NULL| NULL| 428|\n", - "|zea3cUOcoQ_wGynJk...|-0A2QLxO4XfTSVlbS...|mFgKFwWrEohUXsX4y...| 5| 0| 0| 1|Good food. 
Nice p...|New China Restaurant| 146 Mariner Blvd| Spring Hill| FL| 34609|28.4350613055|-82.5465548323| 3.5| 13| 1| NULL|Chinese, Restaurants| NULL| True| 11:0-22:0| 11:0-22:0| 11:0-22:0| 11:0-22:0| 11:0-22:30| 11:0-22:30| False| 1| NULL| True| False| False| NULL| {'garage': False,...| NULL| NULL| None| True| False| NULL| 12:0-22:0| 'none'| True| 'casual'|{'touristy': Fals...| NULL| False| NULL| NULL| 'quiet'| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Jack| 5| 0| 1| 1| NULL| None| 0| 3.2| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0|Eat here.| 0| 15|\n", - "|_PxWQoKAfyoNtEaPs...|-0BRo2qsskTQk6Zi3...|4wrMdvKB3bC7FpYei...| 5| 1| 0| 1|Jeana Oh is amazi...|Mira Bella Salon ...|1819 Cliff Dr, Ste B| Santa Barbara| CA| 93109| 34.4010923| -119.7214959| 3.5| 41| 1| False|Waxing, Hair Remo...| NULL| True| 0:0-0:0| 9:0-19:0| 9:0-19:0| 9:0-19:0| 9:0-19:0| 9:0-17:0| True| 2| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Lee| 8| 8| 1| 2| NULL| None| 0| 4.56| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| NULL| NULL| 14|\n", - "|LSRRsvRvqyIJxXB_p...|-0KosxqqMBYNfJ7Vu...|nDpz9Yl_pkQrmRJFT...| 2| 1| 0| 1|I recommend you d...|Lowe's Home Impro...| 630 Cowpath Road| Lansdale| PA| 19446| 40.247942| -75.248306| 2.0| 21| 1| NULL|Shopping, Home & ...| NULL| True| 6:0-21:0| 6:0-21:0| 6:0-21:0| 6:0-21:0| 6:0-21:0| 6:0-21:0| NULL| 2| NULL| None| None| NULL| NULL| {'garage': False,...| NULL| NULL| NULL| NULL| NULL| NULL| 8:0-20:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Anthony| 52| 130| 18| 7| NULL|N5DV2wRb5BTCJHUo5...| 0| 3.06| 0| 0| 0| 0| 0| 3| 0| 0| 0| 1| 0| 58| 0| NULL| NULL| 81|\n", - "|WCdSajl5Q0qywpv7K...|-0KrCHEsOcjJ6N4k_...|C8eSgR4SsLSkC9Wy7...| 4| 2| 1| 1|When out of town ...|Sugarfire Smoke H...| 605 Washington Ave| Saint Louis| MO| 63101| 38.6304074| -90.1895289| 4.0| 1113| 1| NULL|Sandwiches, Salad...| NULL| True| 11:0-21:0| 11:0-21:0| 11:0-21:0| 11:0-21:0| 11:0-21:0| 11:0-21:0| True| 2| NULL| True| True| True| 'no'| {'garage': False,...| NULL| False| None| True| False| False| 11:0-21:0| 'full_bar'| True| 'casual'|{'touristy': None...| False| True| None| NULL| u'average'| {'dessert': True,...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Nicole| 62| 57| 10| 55| 2018,2019|PRPAbcAqK98BMIGmK...| 1| 4.03| 2| 0| 0| 1| 0| 0| 3| 4| 4| 3| 0| 6| 2| NULL| NULL| 1343|\n", - "|H9KNObMOKNZD73948...|-0WYHeZYcySNvNeFz...|n55IExaR9zeGdXewb...| 1| 0| 1| 0|This has been and...|Bowery Bayside by...|6301 S Westshore ...| Tampa| FL| 33616|27.8770223794|-82.5286011552| 2.5| 27| 1| NULL|Apartments, Home ...| NULL| False| 10:0-18:0| 10:0-18:0| 10:0-18:0| 10:0-18:0| 10:0-18:0| 10:0-17:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|Christina| 1| 0| 1| 0| NULL|pXYrZRf3MaUQHJWTI...| 0| 1.0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 240| 0| NULL| NULL| 2|\n", - "|4LG0U_Gxc45-Fanbj...|-0YrXUvXz8112yHap...|duNismkzzZo3brl5n...| 2| 1| 0| 0|\"I've been here 3...|Four Day Ray Brewing| 11671 Lantern Rd| Fishers| IN| 46038| 39.9580802| -86.0137119| 4.0| 644| 1| False|Restaurants, 
Food...| NULL| True| 0:0-0:0| 11:0-22:0| 11:0-22:0| 11:0-22:0| 11:0-23:0| 11:0-23:0| True| 2| NULL| True| True| True| u'free'| {'garage': True, ...| True| True| True| True| True| True| 11:0-21:0| 'full_bar'| True| 'casual'|{'touristy': Fals...| True| True| NULL| NULL| u'average'| {'dessert': True,...| False| NULL| NULL| false| NULL| NULL| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Dale| 110| 150| 26| 73| 2015,2017,2018|5ADgo4z9FGFNCRg9r...| 4| 3.52| 6| 2| 0| 0| 0| 12| 2| 9| 9| 5| 0| 16| 3| NULL| NULL| 979|\n", - "|rOh8NovXVn48p8I-Z...|-0aZWYi2YicFaLxTr...|_nHdGnKoUpTOJOJWN...| 5| 8| 1| 4|De-li-cious! Came...| sweetgreen| 1821 Chestnut St| Philadelphia| PA| 19103| 39.9518677| -75.171174| 4.0| 213| 1| NULL|Vegan, Soup, Vege...| NULL| True| 0:0-0:0| 10:30-20:0| 10:30-20:0| 10:30-20:0| 10:30-20:0| 11:30-20:0| True| 2| NULL| True| True| True| 'free'| {'garage': None, ...| NULL| NULL| False| True| False| False| 10:30-21:0| u'none'| True| 'casual'|{'touristy': Fals...| False| True| NULL| NULL| 'average'| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Linna| 429| 478| 104| 103|2012,2013,2014,20...|AcFItfjOplJZVO6XB...| 20| 3.47| 5| 1| 0| 0| 0| 3| 15| 14| 14| 11| 0| 35| 9| NULL| NULL| 510|\n", - "|pWEHpA2GgNiww0WGb...|-0gPgo2dX-CRRlXaJ...|jS8aLrT13uYPQQs8f...| 5| 0| 0| 0|Excellent service...|Crabby Bill's Fis...|10316 Roosevelt B...|St. Petersburg| FL| 33716| 27.8659055| -82.6427574| 4.5| 82| 0| NULL|Fish & Chips, Res...| NULL| True| 0:0-0:0| 11:0-21:0| 11:0-21:0| 11:0-21:0| 11:0-21:0| 11:0-21:0| True| 2| NULL| True| True| True| u'free'| {'garage': False,...| True| True| None| True| False| False| 11:0-21:0| u'beer_and_wine'| NULL| NULL|{'touristy': None...| False| True| NULL| NULL| u'average'| {'dessert': None,...| False| NULL| NULL| NULL| NULL| NULL| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| J| 38| 20| 7| 5| NULL| None| 1| 3.63| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| NULL| NULL| 46|\n", - "|ZIMdEfiP954EaLOX0...|-0invYStOTVSxOgA0...|td93CM8hl77v2hTr7...| 4| 2| 0| 0|i had to wait lon...| Ghot Wingz| 2501 Gallatin Ave| Nashville| TN| 37206| 36.19521| -86.743444| 4.0| 10| 0| NULL|Chicken Wings, Re...| NULL| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 1| NULL| True| False| True| NULL| NULL| NULL| NULL| False| True| False| NULL| NULL| u'none'| NULL| u'casual'|{'romantic': Fals...| NULL| True| NULL| NULL| u'quiet'| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| craig| 7| 9| 1| 1| NULL|ff6fE0hxaaxs9lGmP...| 0| 2.71| 0| 1| 0| 0| 0| 0| 0| 0| 0| 0| 0| 152| 0| NULL| NULL| 12|\n", - "|sihT-_DtwOdnDDDJb...|-0jW-rb_xOc81KDXl...|FFVWT9l5-iJmP_u7f...| 1| 1| 0| 0|Omg, the chicken ...| Cigar City Brewing| 3924 W Spruce St| Tampa| FL| 33607| 27.958601| -82.509346| 4.5| 876| 1| False|Tours, Food, Wine...| NULL| True| 11:0-22:0| 11:0-22:0| 11:0-22:0| 11:0-22:0| 11:0-21:0| 11:0-23:0| True| 2| NULL| True| None| False| u'free'| {'garage': False,...| True| True| NULL| NULL| NULL| True| 11:0-22:0| u'beer_and_wine'| True| NULL| NULL| NULL| NULL| NULL| NULL| u'average'| NULL| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Eric| 5| 8| 3| 1| NULL| None| 0| 1.6| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| NULL| NULL| 4116|\n", - "|itv5mVA2lkp_ivhSd...|-0rsuyw37qlGp7I2k...|Tj4OZHWTSnrgZTXYi...| 5| 1| 0| 0|\"It's a very nice...|Seorabol Korean R...| 5734 Old 2nd St| Philadelphia| PA| 19120|40.0385555068|-75.1242795983| 4.0| 531| 1| False|Korean, Restauran...| NULL| True| 11:0-23:0| 
11:0-23:0| 11:0-23:0| 11:0-23:0| 11:0-23:0| 11:0-23:0| True| 2| NULL| True| True| True| u'free'| {'garage': False,...| True| True| None| True| False| False| 11:0-23:0| u'full_bar'| True| 'casual'|{'romantic': Fals...| False| True| NULL| NULL| u'average'| {'dessert': False...| False| NULL| NULL| NULL| NULL| NULL| NULL| False| 'yes_free'| NULL| NULL| NULL| NULL| NULL| Rebekah| 3| 3| 1| 2| NULL|Ju8Wae4dgttdxfE99...| 0| 5.0| 0| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 25| 0| NULL| NULL| 1237|\n", - "|8wkPLsDbGU3GcW8Co...|-1-pTXLHph2vcqRCN...|hHy3jQ2aujFuGL-xQ...| 4| 0| 0| 0|Solid coffee! Som...| Caffe Luce| 245 E Congress St| Tucson| AZ| 85701| 32.222231| -110.967274| 4.5| 321| 1| NULL|Coffee & Tea, Foo...| NULL| True| 7:0-21:0| 7:0-21:0| 7:0-21:0| 7:0-21:0| 7:0-21:0| 8:0-20:0| True| 1| NULL| True| True| False| u'free'| {u'valet': False,...| NULL| True| True| True| False| NULL| 9:0-19:0| u'beer_and_wine'| False| u'casual'|{'touristy': Fals...| False| True| NULL| NULL| u'average'| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|Elizabeth| 134| 138| 38| 71|2016,2017,2018,20...|Csd8998hojzTmFGUk...| 11| 4.08| 1| 1| 0| 0| 0| 5| 12| 11| 11| 1| 6| 246| 7| NULL| NULL| 1439|\n", - "|lR_ka9JYYUpL3se5W...|-11rdqk0uCiXWA1So...|9_B80bvYwVYPi4MY_...| 4| 0| 0| 0|My steak was cook...| LongHorn Steakhouse|3642 S Lindbergh ...| Saint Louis| MO| 63127|38.5537877285|-90.4063998857| 3.0| 90| 1| NULL|Restaurants, Barb...| NULL| True| 0:0-0:0| 11:0-22:0| 11:0-22:0| 11:0-20:0| 11:0-23:0| 11:0-23:0| False| 2| NULL| True| False| False| 'free'| {u'valet': False,...| NULL| NULL| False| True| False| False| 11:0-22:0| u'full_bar'| True| u'casual'|{'romantic': Fals...| True| True| NULL| NULL| u'average'| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Robert| 5| 0| 0| 0| NULL| None| 0| 4.4| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| NULL| NULL| 255|\n", - "|hkirr3WzSTx2d9cdP...|-184J3wL7JPF3Oq-F...|Vc3V4WyaJpBcqgHN_...| 5| 0| 0| 0|Came all the way ...| Mama's Vegetarian| 18 S 20th St| Philadelphia| PA| 19103| 39.952706| -75.173477| 4.5| 592| 0| NULL|Comfort Food, Mid...| NULL| False| 11:0-21:0| 11:0-21:0| 11:0-21:0| 11:0-21:0| 11:0-15:0| NULL| True| 1| NULL| True| False| True| u'no'| {'garage': False,...| NULL| NULL| True| False| False| NULL| 12:0-19:0| 'none'| True| 'casual'|{'romantic': Fals...| False| False| NULL| NULL| u'average'| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| Lorena| 3| 3| 0| 0| NULL|vt2gAopIunM1VWQZB...| 0| 3.67| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 0| 1| 0| NULL| NULL| 991|\n", - 
"+--------------------+--------------------+--------------------+---------+----------+---------+--------+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+---------+----------------+-------+----------------------------+--------------------+-----+-------------------------------------+------------+-------------+---------------+--------------+------------+--------------+----------------------+---------------------------------+--------------------+-----------------------------+------------------------------+-----------------+---------------+--------------------------+-------------------------------+--------------------+-------------------------+----------------+----------------------------------+----------------------+------------+------------------+----------------------+----------------------------+--------------------+----------------------------------+-----------------------------------+--------------------+----------+---------------------+----------------------+---------------------------------+------------------+--------------------+-------------------------+---------------------------+---------------------+---------------+------------------+----------------------+----------------------------+----------------------+------------------------------------+----------------------+------------------------------+---------+------------+----------+---------+--------+--------------------+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+-------------+-----------+---------+----------------+-------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], - "source": [ - "# prompt: read and display \"/content/drive/MyDrive/ProcessedCSV/merged.csv\"\n", - "\n", - "df_merged = spark.read.csv(\"/content/drive/MyDrive/ProcessedCSV/merged.csv\", header=True, inferSchema=True)\n", - "df_merged.show()\n" - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "_VinzFQbRYjZ" - }, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "colab": { - "provenance": [], - "mount_file_id": "1ztEyYU2j0EQv42Q6f8B1IEP6UboxtPZx", - "authorship_tag": "ABX9TyOLIxRNRmTq9HVoLyYyXA/l", - "include_colab_link": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/README.md b/README.md index 0a9fa09..640fc88 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,56 @@ -# Business Recommendation System +# Success Rate Prediction and Business Recommendation Platform -This repository contains the analysis and modeling of the Yelp business dataset. The project includes data cleaning, feature engineering, model building, and evaluation. +Welcome to the Success Rate Prediction and Business Recommendation Model repository! This project leverages machine learning to help entrepreneurs and SMEs predict business success rates and get personalized recommendations. Using real-world data from Yelp and other sources, the model provides actionable insights into business viability. 
-# BizRecProject +--- +## Table of Contents + +- [Introduction](#introduction) +- [Features](#features) +- [Dataset](#dataset) +- [Contributing](#contributing) +- [License](#license) + +--- + +## Introduction + +High failure rates among small and medium-sized enterprises (SMEs) highlight the need for data-driven tools that provide actionable insights into business viability. This project offers a **Machine Learning-based prediction model** that: +- Predicts the likelihood of business success based on key factors. +- Provides business recommendations tailored to location, industry, and customer sentiment. +- Helps entrepreneurs make better market entry decisions and allocate resources effectively. + +--- + +## Features + +- **Business Success Prediction**: Provides a success rate (%) prediction for each business based on real-time data. +- **Personalized Recommendations**: Ranks and recommends the top N businesses in a given industry or location. +- **Sentiment Analysis**: Incorporates customer reviews to assess customer sentiment and predict business success. +- **Real-Time Data Integration**: Uses real-time market and competitor data for accurate, up-to-date predictions. +- **User-Friendly Dashboard**: Simple and intuitive interface for non-technical users. + +--- + +## Dataset + +The project uses the [Yelp Open Dataset](https://www.yelp.com/dataset) along with additional sources for financial and geographic data. Key features include: +- **Business Data**: Name, location, category, rating, review count, operating hours, etc. +- **Customer Review Data**: Text reviews, ratings, sentiment analysis. +- **User Data**: Information about reviewers, including review history and credibility. +- **Check-In Data**: Customer check-ins, which help measure business popularity. + +--- + +## Contributing + +We welcome contributions to improve the platform! Please fork the repository, create a new branch, and submit a pull request with your changes. For major changes, please open an issue first to discuss what you would like to change. + +--- + +## License + +This project is licensed under the MIT License. See `LICENSE` for more information. 
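+
+---
+
+## Example: Sentiment Labeling
+
+The sentiment analysis described above follows the approach in `Roshan_senti.ipynb` (added below): NLTK's VADER analyzer scores each tip or review, and the compound score is mapped to a label using the conventional ±0.05 thresholds. A minimal, self-contained sketch of that labeling rule is shown here; the thresholds and labels mirror the notebook, while the function name is illustrative:
+
+```python
+import nltk
+from nltk.sentiment import SentimentIntensityAnalyzer
+
+nltk.download('vader_lexicon')  # fetch VADER's lexicon once
+sia = SentimentIntensityAnalyzer()
+
+def classify_sentiment(text) -> str:
+    """Map a piece of review text to a coarse sentiment label via VADER's compound score."""
+    compound = sia.polarity_scores(str(text))['compound']
+    if compound >= 0.05:
+        return 'positive'
+    if compound <= -0.05:
+        return 'negative'
+    return 'neutral'
+
+print(classify_sentiment("Great food and service."))                 # positive
+print(classify_sentiment("Disappointed in one of your managers."))   # negative
+```
+
+These per-review labels are what the platform aggregates per business when factoring customer sentiment into the success-rate prediction.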
+ +--- diff --git a/Roshan_senti.ipynb b/Roshan_senti.ipynb new file mode 100644 index 0000000..c45993d --- /dev/null +++ b/Roshan_senti.ipynb @@ -0,0 +1,1554 @@ + +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "tglWjtAwVwJD" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import nltk\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import classification_report, confusion_matrix" + ] + }, + { + "cell_type": "code", + "source": [ + "df=pd.read_csv('/content/yelp_tip.csv')\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 293 + }, + "id": "U2Wd1VVfV6SC", + "outputId": "f59faa21-8542-4f66-e443-ef97f6dfc736" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " _id user_id business_id \\\n", + "0 66ea489ae59c7c5b6d8be1ac AGNUgVwnZUey3gcPCJ76iw 3uLgwr0qeCNMjKenHJwPGQ \n", + "1 66ea489ae59c7c5b6d8be1ad NBN4MgHP9D3cw--SnauTkA QoezRbYQncpRqyrLH6Iqjg \n", + "2 66ea489ae59c7c5b6d8be1ae -copOvldyKh1qr-vzkDEvw MYoRNLb5chwjQe3c_k37Gg \n", + "3 66ea489ae59c7c5b6d8be1af FjMQVZjSqY8syIO-53KFKw hV-bABTK-glh5wj31ps_Jw \n", + "4 66ea489ae59c7c5b6d8be1b0 ld0AperBXk1h6UbqmM80zw _uN0OudeJ3Zl_tf6nxg5ww \n", + "\n", + " text date \\\n", + "0 Avengers time with the ladies. 2012-05-18 02:17:21 \n", + "1 They have lots of good deserts and tasty cuban... 2013-02-05 18:35:10 \n", + "2 It's open even when you think it isn't 2013-08-18 00:56:08 \n", + "3 Very decent fried chicken 2017-06-27 23:05:38 \n", + "4 Appetizers.. platter special for lunch 2012-10-06 19:43:09 \n", + "\n", + " compliment_count \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_iduser_idbusiness_idtextdatecompliment_count
066ea489ae59c7c5b6d8be1acAGNUgVwnZUey3gcPCJ76iw3uLgwr0qeCNMjKenHJwPGQAvengers time with the ladies.2012-05-18 02:17:210
166ea489ae59c7c5b6d8be1adNBN4MgHP9D3cw--SnauTkAQoezRbYQncpRqyrLH6IqjgThey have lots of good deserts and tasty cuban...2013-02-05 18:35:100
266ea489ae59c7c5b6d8be1ae-copOvldyKh1qr-vzkDEvwMYoRNLb5chwjQe3c_k37GgIt's open even when you think it isn't2013-08-18 00:56:080
366ea489ae59c7c5b6d8be1afFjMQVZjSqY8syIO-53KFKwhV-bABTK-glh5wj31ps_JwVery decent fried chicken2017-06-27 23:05:380
466ea489ae59c7c5b6d8be1b0ld0AperBXk1h6UbqmM80zw_uN0OudeJ3Zl_tf6nxg5wwAppetizers.. platter special for lunch2012-10-06 19:43:090
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# prompt: print schema of df\n", + "\n", + "df.info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_Pj6djUXeiOk", + "outputId": "26220c08-1781-40e0-9708-1a3bed3356d6" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 908915 entries, 0 to 908914\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 _id 908915 non-null object\n", + " 1 user_id 908915 non-null object\n", + " 2 business_id 908915 non-null object\n", + " 3 text 908901 non-null object\n", + " 4 date 908915 non-null object\n", + " 5 compliment_count 908915 non-null int64 \n", + "dtypes: int64(1), object(5)\n", + "memory usage: 41.6+ MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "\n", + "# Download 'stopwords' and 'wordnet' from NLTK\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xx2FfzDGZWiv", + "outputId": "b80973d4-8821-4b8f-9565-fe4defaa6718" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from nltk.corpus import stopwords\n", + "from nltk.corpus import wordnet" + ], + "metadata": { + "id": "KCEbdZbdaaUf" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "import string\n", + "import pandas as pd\n", + "\n", + "# Download resources if not already downloaded\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n", + "\n", + "# Initialize lemmatizer and stop words\n", + "lemmatizer = WordNetLemmatizer()\n", + "stop_words = set(stopwords.words('english'))\n", + "\n", + "# Sample DataFrame for demonstration (replace with your actual DataFrame)\n", + "data = {'text': [\"Example sentence for preprocessing.\", None, \"Another example text.\", 123.45]}\n", + "df = pd.DataFrame(data)\n", + "\n", + "def preprocess_text(text):\n", + " # Convert non-string types to an empty string\n", + " if not isinstance(text, str):\n", + " text = ''\n", + " # Remove punctuation\n", + " text = text.translate(str.maketrans('', '', string.punctuation))\n", + " # Tokenize and lemmatize\n", + " words = text.split()\n", + " words = [lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stop_words]\n", + " return ' '.join(words)\n", + "\n", + "# Fill NaNs with empty strings before applying preprocessing\n", + "df['text'] = df['text'].fillna('')\n", + "df['cleaned_text'] = df['text'].apply(preprocess_text)\n", + "\n", + "df # Display the DataFrame to confirm results\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 244 + }, + "id": "5yXQoR--XpAu", 
+ "outputId": "300aa3cf-b676-4970-8da4-2478f30848f6" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " text cleaned_text\n", + "0 Example sentence for preprocessing. example sentence preprocessing\n", + "1 \n", + "2 Another example text. another example text\n", + "3 123.45 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textcleaned_text
0Example sentence for preprocessing.example sentence preprocessing
1
2Another example text.another example text
3123.45
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"\",\n 123.45,\n \"Example sentence for preprocessing.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"cleaned_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"example sentence preprocessing\",\n \"\",\n \"another example text\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# # Define sentiment labels\n", + "# df['sentiment'] = df['compliment_count'].apply(lambda x: 'pos' if x > 0 else 'neg')" + ], + "metadata": { + "id": "T_HW0B8JXo4M" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(df.columns)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VidGzeH0XoxH", + "outputId": "d3ba5d0d-673b-4b30-bb4c-be81cb4bc45a" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Index(['text', 'cleaned_text'], dtype='object')\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "if 'compliment_count' not in df.columns:\n", + " df['compliment_count'] = 0 # or any other default value\n" + ], + "metadata": { + "id": "qLwvF41aXotk" + }, + "execution_count": 10, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Ensure 'compliment_count' exists in the DataFrame or add it if necessary\n", + "if 'compliment_count' not in df.columns:\n", + " df['compliment_count'] = 0 # Add with a default value, e.g., 0\n", + "\n", + "# Define sentiment labels based on 'compliment_count' values\n", + "df['sentiment'] = df['compliment_count'].apply(lambda x: 'pos' if x > 0 else 'neg')\n" + ], + "metadata": { + "id": "5THjo_VSXoq7" + }, + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#Splitting the Dataset\n", + "X = df['cleaned_text']\n", + "y = df['sentiment']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)" + ], + "metadata": { + "id": "yA1BKoVjXooT" + }, + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Count Vectorization\n", + "count_vect = CountVectorizer()\n", + "X_train_counts = count_vect.fit_transform(X_train)\n", + "\n", + "# TF-IDF Transformation\n", + "tfidf_transformer = TfidfTransformer()\n", + "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)" + ], + "metadata": { + "id": "X7D3RUJ6XolV" + }, + "execution_count": 13, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Train the classifier\n", + "clf = MultinomialNB()\n", + "clf.fit(X_train_tfidf, y_train)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "kINv1yVFXoim", + "outputId": "5c8d2ad0-8c96-4a09-881d-3d3d3192d4b6" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "MultinomialNB()" + ], + "text/html": [ + "
" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Transform the test data\n", + "X_test_counts = count_vect.transform(X_test)\n", + "X_test_tfidf = tfidf_transformer.transform(X_test_counts)\n", + "\n", + "# Make predictions\n", + "predicted = clf.predict(X_test_tfidf)" + ], + "metadata": { + "id": "qd759H5rdGRP" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Evaluate the model\n", + "print(confusion_matrix(y_test, predicted))\n", + "print(classification_report(y_test, predicted))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "txfIXKGFdGN3", + "outputId": "e4cd2448-a358-4799-8595-96edddbc5b39" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[[2]]\n", + " precision recall f1-score support\n", + "\n", + " neg 1.00 1.00 1.00 2\n", + "\n", + " accuracy 1.00 2\n", + " macro avg 1.00 1.00 1.00 2\n", + "weighted avg 1.00 1.00 1.00 2\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:409: UserWarning: A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.\n", + " warnings.warn(\n" + ] + } + ] + }, + { + "source": [ + "# Import necessary libraries\n", + "import pandas as pd\n", + "import nltk\n", + "from nltk.sentiment import SentimentIntensityAnalyzer\n", + "\n", + "\n", + "df = pd.read_csv('/content/yelp_tip.csv')\n", + "\n", + "# Download VADER lexicon\n", + "nltk.download('vader_lexicon')\n", + "\n", + "# Initialize the SentimentIntensityAnalyzer\n", + "sia = SentimentIntensityAnalyzer()\n", + "\n", + "# Define a function for sentiment analysis\n", + "def analyze_sentiment(text):\n", + " # Convert the input to string if it's not already\n", + " if not isinstance(text, str):\n", + " text = str(text)\n", + "\n", + " score = sia.polarity_scores(text)\n", + " return score\n", + "\n", + "# Apply sentiment analysis to the text column\n", + "df['sentiment_scores'] = df['text'].apply(analyze_sentiment)\n", + "\n", + "# Expand the sentiment scores into separate columns\n", + "df_sentiment = df['sentiment_scores'].apply(pd.Series)\n", + "\n", + "# Combine the original DataFrame with the sentiment scores\n", + "df = pd.concat([df, df_sentiment], axis=1)\n", + "\n", + "# Function to classify sentiment based on compound score\n", + "def classify_sentiment(compound_score):\n", + " if compound_score >= 0.05:\n", + " return 'positive'\n", + " elif compound_score <= -0.05:\n", + " return 'negative'\n", + " else:\n", + " return 'neutral'\n", + "\n", + "# Apply the classification function\n", + "df['sentiment'] = df['compound'].apply(classify_sentiment)\n", + "\n", + "# Display the DataFrame with sentiment analysis results\n", + "print(df[['text', 'neg', 'neu', 'pos', 'compound', 'sentiment']])" + ], + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UUkfQI5fflxJ", + "outputId": "39b3458d-c1f0-442a-94e4-dae4c94fde42" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package vader_lexicon to /root/nltk_data...\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + " text neg neu \\\n", + "0 Avengers time with the ladies. 
0.000  1.000   \n",
+        "1       They have lots of good deserts and tasty cuban...  0.000  0.756   \n",
+        "2                 It's open even when you think it isn't  0.000  1.000   \n",
+        "3                              Very decent fried chicken  0.000  1.000   \n",
+        "4                 Appetizers.. platter special for lunch  0.000  0.597   \n",
+        "...                                                  ...    ...    ...   \n",
+        "908910             Disappointed in one of your managers.  0.383  0.617   \n",
+        "908911                           Great food and service.  0.000  0.423   \n",
+        "908912                                Love their Cubans!!  0.000  0.295   \n",
+        "908913                            Great pizza great price  0.000  0.196   \n",
+        "908914                  Food is good value but a bit hot!  0.000  0.559   \n",
+        "\n",
+        "           pos  compound sentiment  \n",
+        "0        0.000    0.0000   neutral  \n",
+        "1        0.244    0.4404  positive  \n",
+        "2        0.000    0.0000   neutral  \n",
+        "3        0.000    0.0000   neutral  \n",
+        "4        0.403    0.4019  positive  \n",
+        "...        ...       ...       ...  \n",
+        "908910   0.000   -0.4767  negative  \n",
+        "908911   0.577    0.6249  positive  \n",
+        "908912   0.705    0.6988  positive  \n",
+        "908913   0.804    0.8481  positive  \n",
+        "908914   0.441    0.4482  positive  \n",
+        "\n",
+        "[908915 rows x 6 columns]\n"
+       ]
+      }
+    ]
+  },
+  {
+    "cell_type": "code",
+    "source": [],
+    "metadata": {
+      "id": "PTXBFouifYDd"
+    },
+    "execution_count": null,
+    "outputs": []
+  }
+ ]
+}
diff --git a/business_analytics.pbix b/business_analytics.pbix
deleted file mode 100644
index 82c8c84..0000000
Binary files a/business_analytics.pbix and /dev/null differ
diff --git a/recommendation_system_hybrid.py b/recommendation_system_hybrid.py
deleted file mode 100644
index 0e85144..0000000
--- a/recommendation_system_hybrid.py
+++ /dev/null
@@ -1,517 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Recommendation_System_M1.ipynb
-
-Automatically generated by Colab.
-
-Original file is located at
-    https://colab.research.google.com/drive/1lG9ErjjQRiohxj_1xczsjZ3EAeikmNWW
-"""
-
-!pip install pyspark
-
-from google.colab import drive
-drive.mount('/content/drive')
-
-!pip install textblob
-
-!pip install lightfm
-!pip install gensim
-
-# Import the necessary libraries before creating the session, so that
-# SparkSession is defined when it is used
-from pyspark.sql import SparkSession
-from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Word2Vec, Normalizer, VectorIndexer
-from pyspark.ml.recommendation import ALS
-from pyspark.ml.evaluation import RegressionEvaluator
-from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
-from pyspark.ml.regression import LinearRegression
-from textblob import TextBlob
-import numpy as np
-from scipy.sparse import coo_matrix
-from gensim.models import Word2Vec as GensimWord2Vec
-from pyspark.sql.functions import explode, col, udf, collect_list, avg, lit
-from pyspark.sql.functions import year, month, dayofmonth
-from pyspark.sql.types import ArrayType, StringType, StructType, StructField, FloatType, IntegerType # Import StructType, StructField, FloatType and IntegerType
-
-# Create a Spark session
-spark = SparkSession.builder.appName("HybridRecommendationSystem").getOrCreate()
-
-# If you need to calculate cosine similarity, you can define a function like this:
-from pyspark.ml.linalg import Vectors
-
-@udf("double")
-def cosine_similarity(v1, v2):
-    """
-    Calculates the cosine similarity between two vectors.
-    Args:
-        v1: First vector.
-        v2: Second vector.
-    Returns:
-        The cosine similarity between the two vectors.
-    """
-    return float(v1.dot(v2) / (v1.norm(2) * v2.norm(2)))
-
-# Define schema for business_df
-business_schema = StructType([
-    StructField("business_id", StringType(), True),
-    StructField("name", StringType(), True),
-    StructField("address", StringType(), True),
-    StructField("city", StringType(), True),
-    StructField("state", StringType(), True),
-    StructField("postal_code", StringType(), True),
-    StructField("latitude", FloatType(), True),
-    StructField("longitude", FloatType(), True),
-    StructField("stars", FloatType(), True),
-    StructField("review_count", IntegerType(), True),
-    StructField("is_open", IntegerType(), True),
-    StructField("attributes", StringType(), True),  # You might need to adjust this based on your data
-    StructField("categories", StringType(), True),  # You might need to adjust this based on your data
-    StructField("hours", StringType(), True)  # You might need to adjust this based on your data
-])
-
-# Load the Yelp datasets with explicit schema or schema inference hints
-file_path = "/content/drive/MyDrive/yelp_business1.csv"
-business_df = spark.read.csv(file_path, header=True, schema=business_schema)
-business_df.show()
-
-from pyspark.sql import SparkSession, functions as F
-
-# The ProcessedCSV exports appear to carry a duplicated header row as their first data row.
-# Capture that first row once and drop every row that matches it, column by column.
-# The loop variable is named `c` so it does not shadow the `col` function imported above.
-user_df = spark.read.csv("/content/drive/MyDrive/ProcessedCSV/final_users.csv", header=True, inferSchema=True)
-header_row = user_df.first()
-for c in user_df.columns:
-    user_df = user_df.filter(F.col(c) != header_row[c])
-user_df.show()
-
-review_df = spark.read.csv("/content/drive/MyDrive/ProcessedCSV/final_review.csv", header=True, inferSchema=True)
-header_row = review_df.first()
-for c in review_df.columns:
-    review_df = review_df.filter(F.col(c) != header_row[c])
-review_df.show()
-
-tip_df = spark.read.csv("/content/drive/MyDrive/ProcessedCSV/final_tip.csv", header=True, inferSchema=True)
-header_row = tip_df.first()
-for c in tip_df.columns:
-    tip_df = tip_df.filter(F.col(c) != header_row[c])
-tip_df.show()
-
-checkin_df = spark.read.csv("/content/drive/MyDrive/ProcessedCSV/final_checkin.csv", header=True, inferSchema=True)
-header_row = checkin_df.first()
-for c in checkin_df.columns:
-    checkin_df = checkin_df.filter(F.col(c) != header_row[c])
-checkin_df.show()
-
-# Overview of Business/Restaurant Categories
-
-# Use withColumn to add a new column 'categories' by splitting the existing 'categories' column
-business_categories = business_df.withColumn('categories', F.split(business_df['categories'], ', ')) \
-    .withColumn('categories', F.explode('categories'))  # Explode the array to get individual categories in separate rows
-
-# Count distinct category labels (a plain count() would return the number of exploded rows, not unique categories)
-print('The number of unique business categories is:', business_categories.select('categories').distinct().count())
-
-# Most Common Business Categories
-# Use groupBy and count to get the frequency of each category
-business_categories.groupBy('categories').count().orderBy(F.desc('count')).show(30)
-
-# Assuming your business data is in a DataFrame called 'business_df'
-
-filtered_businesses = 
business_df.filter((F.col("review_count") >= 20) & (F.col("stars") >= 2)) - -# Verify the schema to check the new data type -filtered_businesses.printSchema() -filtered_businesses.show() - -# Top States for business -top_states = filtered_businesses.groupBy('state').count().orderBy(F.desc('count')).limit(10) -top_states.show() - -# Top cities for business -top_cities = filtered_businesses.groupBy('city').count().orderBy(F.desc('count')).limit(20) -top_cities.show() - -from pyspark.sql.functions import col, isnan, when, count - -# Check for null or NaN values in each column -null_counts = filtered_businesses.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in filtered_businesses.columns]) - -# Display the null counts for each column -null_counts.show() - -# Filter reviews based on stars and year -filtered_reviews = review_df.filter((col("stars") >= 2) & (col("year") >= 2017)) - -# Display the filtered reviews -filtered_reviews.printSchema() -filtered_reviews.show() - -first_5_user_ids = filtered_reviews.select("user_id").distinct().limit(5).rdd.flatMap(lambda x: x).collect() -print(first_5_user_ids) - -# Check the schema of both DataFrames -filtered_reviews.printSchema() -filtered_businesses.printSchema() - -# Check for distinct business IDs in both DataFrames -filtered_reviews.select("business_id").distinct().show() -filtered_businesses.select("business_id").distinct().show() - -# Check for null business_id values -filtered_reviews.filter(col("business_id").isNull()).show() -filtered_businesses.filter(col("business_id").isNull()).show() - -# Check row counts in both DataFrames -print("Number of rows in filtered_reviews:", filtered_reviews.count()) -print("Number of rows in filtered_businesses:", filtered_businesses.count()) - -# Perform the join if both DataFrames contain data -if filtered_reviews.count() > 0 and filtered_businesses.count() > 0: - joined_df = filtered_reviews.join(filtered_businesses, on="business_id", how="inner") - joined_df.show(10) -else: - print("One or both DataFrames are empty, cannot perform join.") - -from pyspark.sql.functions import col, count, when, isnan, isnull, broadcast - -# Rename 'stars' column in filtered_reviews before joining -filtered_businesses = filtered_businesses.withColumnRenamed("stars", "filtered_business_stars") -filtered_reviews = filtered_reviews.withColumnRenamed("stars", "filtered_review_stars") - -# Perform an inner join based on 'business_id' -# Use broadcast for smaller DataFrame (filtered_businesses) to optimize join -joined_df1 = filtered_reviews.join(broadcast(filtered_businesses), on="business_id", how="inner") - -# Check the number of rows in joined_df1 -print("Number of rows in joined_df1:", joined_df1.count()) - -joined_df1.show() - -from pyspark.sql.functions import col, count, when, isnan, isnull -from pyspark.sql.types import IntegerType, FloatType -from pyspark.ml.recommendation import ALS - - - -# Check for missing or invalid values in 'filtered_review_stars' -print("Number of rows with missing 'filtered_review_stars':", joined_df1.filter(isnan("filtered_review_stars") | isnull("filtered_review_stars")).count()) -print("Number of rows with zero 'filtered_review_stars':", joined_df1.filter(col("filtered_review_stars") == 0).count()) - -# If there are missing or invalid values, handle them appropriately (e.g., impute with average) - -# Cast user_id and business_id to integers -# Use a temporary DataFrame to avoid the ambiguity -# Explicitly specify the source DataFrame for 'stars' using alias -final_df = 
joined_df1.select( - col("user_id").cast(StringType()), - col("business_id").cast(StringType()), - col("filtered_review_stars").cast(FloatType()) # Choose which column to use -) - -final_df.show() - -final_df.printSchema() - -# Create StringIndexer objects for user_id and business_id -user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_indexed") -business_indexer = StringIndexer(inputCol="business_id", outputCol="business_id_indexed") - -# Fit the indexers and transform the DataFrame -indexed_df = user_indexer.fit(final_df).transform(final_df) -indexed_df = business_indexer.fit(indexed_df).transform(indexed_df) - -# Create an ALS object with the correct column names -als = ALS(userCol="user_id_indexed", itemCol="business_id_indexed", ratingCol="filtered_review_stars", coldStartStrategy="drop") - -# Fit ALS model using the indexed DataFrame -als_model = als.fit(indexed_df) - -# Generate ALS recommendations (using indexed IDs) -user_als_recommendations = als_model.recommendForAllUsers(10) -item_als_recommendations = als_model.recommendForAllItems(10) - -# Optionally, you can join the recommendations back with the original DataFrame -# to get the original user_id and business_id values. - -print("ALS Recommendations for Users:") -user_als_recommendations.show() - -print("ALS Recommendations for Businesses:") -item_als_recommendations.show() - -# 2. Content-based filtering using business features -# Import the necessary class -from pyspark.ml.feature import StandardScaler -# Assemble business features such as review count, stars, etc. -# Use 'filtered_business_stars' instead of 'stars' as it is present in the DataFrame -business_feature_columns = ["review_count", "filtered_business_stars", "is_open", "latitude", "longitude"] -assembler = VectorAssembler(inputCols=business_feature_columns, outputCol="features") -business_features = assembler.transform(filtered_businesses) - -# Scale features -scaler = StandardScaler(inputCol="features", outputCol="scaled_features") -scaler_model = scaler.fit(business_features) -business_features = scaler_model.transform(business_features) - -# Hybrid score as a weighted sum of ALS and content-based predictions -from pyspark.sql.functions import udf, explode, col -from pyspark.sql.types import DoubleType, ArrayType, StructType, StructField, FloatType, IntegerType - - -# Modify hybrid_score function to handle Vector type -def hybrid_score_udf(als_prediction_array, content_features_vector, alpha=0.5): - """ - Calculates the hybrid score. - - Args: - als_prediction_array (list of structs): The ALS prediction rating, - represented as a list of structs with `business_id_indexed` and `rating`. - content_features_vector (pyspark.ml.linalg.SparseVector or pyspark.ml.linalg.DenseVector): The content-based features vector. - alpha (float, optional): The weight given to ALS predictions. Defaults to 0.5. - - Returns: - list of structs: A list of structs with `business_id_indexed`, `rating` and `hybrid_score`. 
- """ - hybrid_scores = [] - - for prediction in als_prediction_array: - als_rating = prediction["rating"] - business_id_indexed = prediction["business_id_indexed"] - # Extract the first element (or desired feature) from the content_features_vector - content_feature = content_features_vector[0] # You may need to adjust this based on the desired feature - hybrid_score_val = (alpha * als_rating) + ((1 - alpha) * content_feature) - - hybrid_scores.append({"business_id_indexed": business_id_indexed, - "rating": als_rating, - "hybrid_score": hybrid_score_val}) - - return hybrid_scores - - - -# Register the hybrid_score_udf as a UDF -hybrid_score = udf (hybrid_score_udf, ArrayType(StructType([ - StructField("business_id_indexed", IntegerType(), True), - StructField("rating", FloatType(), True), - StructField("hybrid_score", FloatType(), True) -]))) - -# Assuming 'user_als_recommendations' contains the ALS recommendations -# Rename 'recommendations' column to avoid conflict with existing column name -user_als_recommendations = user_als_recommendations.withColumnRenamed("recommendations", "als_recommendations") - -# Import necessary modules -from pyspark.ml.linalg import Vectors, VectorUDT # Add this import -from pyspark.sql.functions import struct, collect_list, first, col # Add 'struct' here - -# Check if 'business_id_indexed' already exists in 'business_features' -if 'business_id_indexed' not in business_features.columns: # Add this condition - business_features = business_indexer.fit(business_features).transform(business_features) # add this line to index business_id and add 'business_id_indexed' column - -# Assuming 'user_als_recommendations' contains the ALS recommendations -# Rename 'recommendations' column to avoid conflict with existing column name -user_als_recommendations = user_als_recommendations.withColumnRenamed("recommendations", "als_recommendations") - -# Explode the 'als_recommendations' array to get individual rows for each recommendation -# This will create a new column 'business_id_indexed' that can be used for the join -exploded_recommendations = user_als_recommendations.select( - "user_id_indexed", - explode("als_recommendations").alias("exploded_recommendation") -).select( - "user_id_indexed", - "exploded_recommendation.business_id_indexed", - "exploded_recommendation.rating" -) - -# Now you can join using the 'business_id_indexed' column -hybrid_recommendations = exploded_recommendations.join(business_features, on="business_id_indexed", how="inner") - -# **Change here**: Group by 'user_id_indexed' and collect ALS recommendations and scaled features into lists -hybrid_recommendations = hybrid_recommendations.groupBy("user_id_indexed").agg( - collect_list(struct("business_id_indexed", "rating")).alias("als_recommendations"), - first("scaled_features").alias("scaled_features") # Take the first scaled_features as it should be the same for all recommendations of a user -) - -# **Change here**: Convert the 'scaled_features' column to a plain array before calling the UDF -hybrid_recommendations = hybrid_recommendations.withColumn( - "scaled_features_array", - udf(lambda v: v.toArray().tolist(), ArrayType(DoubleType()))("scaled_features") -) - -# Calculate hybrid scores using the collected lists and the converted array -hybrid_recommendations = hybrid_recommendations.withColumn( - "hybrid_scores", - hybrid_score(col("als_recommendations"), col("scaled_features_array")) # Pass the array here -) - -hybrid_recommendations.show(5) - -from pyspark.sql import functions as F -from 
pyspark.sql.window import Window - -def calculate_rank_relevance(predictions, relevance_threshold=3): - """ - Calculates rank and relevance for recommendations. - - Args: - predictions (pyspark.sql.DataFrame): DataFrame containing recommendations. - relevance_threshold (float): Threshold for determining relevance based on 'hybrid_scores'. - - Returns: - pyspark.sql.DataFrame: DataFrame with added 'rank' and 'relevance' columns. - """ - - # Explode the hybrid_scores array to get individual rows for each recommendation - predictions = predictions.select( - "user_id_indexed", - F.explode("hybrid_scores").alias("exploded_hybrid_score") - ).select( - "user_id_indexed", - "exploded_hybrid_score.business_id_indexed", - "exploded_hybrid_score.rating", - "exploded_hybrid_score.hybrid_score" # Extract the hybrid_score from the struct - ) - - # Create a window to rank recommendations for each user - window = Window.partitionBy("user_id_indexed").orderBy(F.desc("hybrid_score")) # Use the extracted hybrid_score - - # Add rank column based on hybrid_scores - predictions_with_rank = predictions.withColumn("rank", F.row_number().over(window)) - - # Add relevance column based on hybrid_scores and threshold - predictions_with_rank_relevance = predictions_with_rank.withColumn( - "relevance", F.when(F.col("hybrid_score") > relevance_threshold, 1).otherwise(0) # Use the extracted hybrid_score - ) - - return predictions_with_rank_relevance - -# Apply the function to your hybrid_recommendations DataFrame -ranked_recommendations = calculate_rank_relevance(hybrid_recommendations) - -# Display the results -ranked_recommendations.show(5) - -def recall_at_k(predictions, k): - # Define the window specification for ranking - window = Window.partitionBy("user_id_indexed").orderBy(F.desc("hybrid_score")) # Use 'hybrid_score' instead of 'hybrid_scores' - - # Get top k recommendations for each user - top_k_recs = predictions.withColumn("rank", F.row_number().over(window)).filter(F.col("rank") <= k) - - # Count relevant items in top k recommendations - relevant_items_in_top_k = top_k_recs.filter(F.col("relevance") == 1).groupBy("user_id_indexed").agg(F.count("business_id_indexed").alias("relevant_count")) - - # Count total relevant items for each user - total_relevant_items = predictions.filter(F.col("relevance") == 1).groupBy("user_id_indexed").agg(F.count("business_id_indexed").alias("total_relevant_count")) - - # Calculate recall@k - recall = relevant_items_in_top_k.join(total_relevant_items, "user_id_indexed").withColumn("recall", F.col("relevant_count") / F.col("total_relevant_count")) - - # Return average recall@k across all users - return recall.select(F.mean("recall")).collect()[0][0] - -# NDCG@K metric -def ndcg_at_k(predictions, k): - # Change here: Use 'user_id_indexed' for partitioning instead of 'user_id' - # Change here: Use 'hybrid_score' instead of 'hybrid_scores' for orderBy - predictions = predictions.withColumn("rank", F.row_number().over(Window.partitionBy("user_id_indexed").orderBy(F.desc("hybrid_score")))) - - # Replace 'actual' with the actual relevance column in your DataFrame - # Here, I'm assuming your relevance column is named 'relevance' - # If it's different, change it accordingly - predictions = predictions.withColumn("dcg", F.when(F.col("rank") <= k, F.col("relevance") / F.log2(F.col("rank") + 1))) - - # Change here: Use 'user_id_indexed' for grouping instead of 'user_id' - dcg = predictions.groupBy("user_id_indexed").agg(F.sum("dcg").alias("dcg")) - # Change here: Use 'user_id_indexed' for 
-# NDCG@K metric
-def ndcg_at_k(predictions, k):
-    # Rank each user's recommendations by descending hybrid score
-    predictions = predictions.withColumn(
-        "rank",
-        F.row_number().over(Window.partitionBy("user_id_indexed").orderBy(F.desc("hybrid_score")))
-    )
-
-    # DCG contribution inside the top k: relevance / log2(rank + 1)
-    predictions = predictions.withColumn(
-        "dcg", F.when(F.col("rank") <= k, F.col("relevance") / F.log2(F.col("rank") + 1))
-    )
-
-    dcg = predictions.groupBy("user_id_indexed").agg(F.sum("dcg").alias("dcg"))
-
-    # Ideal DCG, assuming every position in the top k were relevant
-    idcg = predictions.filter(F.col("rank") <= k).groupBy("user_id_indexed").agg(
-        F.sum(F.lit(1) / F.log2(F.col("rank") + 1)).alias("idcg")
-    )
-
-    ndcg = dcg.join(idcg, "user_id_indexed").withColumn("ndcg", F.col("dcg") / F.col("idcg"))
-    return ndcg.select(F.mean("ndcg")).collect()[0][0]
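-# A pure-Python sketch of the same NDCG@k arithmetic for one user, with
-# illustrative relevances [1, 0, 1] at ranks 1..3 and k = 3:
-from math import log2
-_demo_dcg = 1 / log2(2) + 0 / log2(3) + 1 / log2(4)   # 1.0 + 0.0 + 0.5 = 1.5
-_demo_idcg = 1 / log2(2) + 1 / log2(3) + 1 / log2(4)  # ideal list: every rank relevant
-assert abs(_demo_dcg / _demo_idcg - 0.70392) < 1e-4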
-import pandas as pd
-
-def avg_precision_at_k_per_user(ranked_recommendations, k=5, num_users_to_show=6):
-    """
-    Calculates average precision@k per user and returns it for a specified
-    number of users.
-
-    Args:
-        ranked_recommendations: A PySpark DataFrame with columns:
-            - user_id_indexed: The indexed user ID.
-            - business_id_indexed: The indexed business ID.
-            - relevance: 1 if the business is relevant to the user, else 0.
-        k: The number of top recommendations to consider (default: 5).
-        num_users_to_show: The number of users to return results for (default: 6).
-
-    Returns:
-        A pandas DataFrame with columns:
-            - user_id_indexed: The indexed user ID.
-            - avg_precision_at_k: The average precision@k value for the user.
-    """
-
-    # Average precision over a user's top-k list: at every relevant position i,
-    # add precision@i (hits so far / i), then normalise by the number of hits
-    def _avg_precision(ranked_items):
-        items = sorted(ranked_items, key=lambda x: x["rank"])
-        hits, score = 0, 0.0
-        for i, item in enumerate(items, start=1):
-            if item["relevance"] == 1:
-                hits += 1
-                score += hits / i
-        return score / hits if hits > 0 else 0.0
-
-    avg_precision = F.udf(_avg_precision, DoubleType())
-
-    # Collect each user's top-k (rank, relevance) pairs and score them
-    precision_per_user_df = ranked_recommendations.withColumn(
-            "rank",
-            F.row_number().over(Window.partitionBy("user_id_indexed").orderBy(F.desc("hybrid_score")))
-        ) \
-        .filter(F.col("rank") <= k) \
-        .groupBy("user_id_indexed") \
-        .agg(F.collect_list(F.struct("rank", "relevance")).alias("ranked_items")) \
-        .withColumn("avg_precision_at_k", avg_precision("ranked_items")) \
-        .select("user_id_indexed", "avg_precision_at_k")
-
-    # Convert the first num_users_to_show users to pandas for display
-    precision_df = precision_per_user_df.limit(num_users_to_show).toPandas()
-
-    return precision_df
-
-from pyspark.sql.functions import row_number
-
-# Diversity metric: share of distinct businesses among all recommendations made
-def diversity(predictions, k=None):
-    # If k is given, keep only the top k recommendations per user
-    if k is not None:
-        window_spec = Window.partitionBy("user_id_indexed").orderBy(F.desc("hybrid_score"))
-        predictions = predictions.withColumn("rank", row_number().over(window_spec)) \
-            .filter(F.col("rank") <= k) \
-            .drop("rank")
-
-    unique_items = predictions.select("business_id_indexed").distinct().count()
-    total_recommendations = predictions.count()
-    return unique_items / total_recommendations
-
-# Evaluate Recall@K, NDCG@K, and Diversity
-k = 5
-
-recall = recall_at_k(ranked_recommendations, k)
-
-ndcg = ndcg_at_k(ranked_recommendations, k)
-
-diversity_score = diversity(ranked_recommendations, k)
-
-# Print evaluation metrics
-print(f"Recall@{k}: {recall}")
-print(f"NDCG@{k}: {ndcg}")
-print(f"Diversity: {diversity_score}")
-
-# Stop the Spark session
-spark.stop()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 8db17a8..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-pandas
-numpy
-scikit-learn
\ No newline at end of file