diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 8a00d2b..0000000 --- a/.gitignore +++ /dev/null @@ -1,179 +0,0 @@ -# Created by https://www.toptal.com/developers/gitignore/api/python -# Edit at https://www.toptal.com/developers/gitignore?templates=python - -### Python ### -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -doc/ -yelp_dataset/ -yelp_photos/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. 
-#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ - -### Python Patch ### -# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration -poetry.toml - -# ruff -.ruff_cache/ - -# LSP config files -pyrightconfig.json - -# End of https://www.toptal.com/developers/gitignore/api/python diff --git a/App/preprocess/preprocess_pipeline.ipynb b/App/preprocess/preprocess_pipeline.ipynb deleted file mode 100644 index 0db4bf0..0000000 --- a/App/preprocess/preprocess_pipeline.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":3811,"status":"ok","timestamp":1728164875352,"user":{"displayName":"BZRecProject","userId":"17728562531777262703"},"user_tz":240},"id":"298aVYAHmo0I"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.impute import SimpleImputer\n","from sklearn.compose import ColumnTransformer\n","from sklearn.pipeline import Pipeline\n","from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.preprocessing import OneHotEncoder"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["C:\\Users\\User\\AppData\\Local\\Temp\\ipykernel_28432\\2173750344.py:3: DtypeWarning: Columns (57,58) have mixed types. Specify dtype option on import or set low_memory=False.\n"," business_df = pd.read_csv('..\\..\\yelp_dataset\\yelp_business.csv')\n"]}],"source":["# Load the dataset\n","\n","business_df = pd.read_csv('..\\..\\yelp_dataset\\yelp_business.csv')\n","tip_df = pd.read_csv('..\\..\\yelp_dataset\\yelp_tip.csv')\n","checkin_df = pd.read_csv('..\\..\\yelp_dataset\\yelp_checkin.csv')"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[{"data":{"text/html":["
| \n"," | _id | \n","business_id | \n","name | \n","address | \n","city | \n","state | \n","postal_code | \n","latitude | \n","longitude | \n","stars | \n","... | \n","attributes.AcceptsInsurance | \n","attributes.BestNights | \n","attributes.BYOB | \n","attributes.Corkage | \n","attributes.BYOBCorkage | \n","attributes.HairSpecializesIn | \n","attributes.Open24Hours | \n","attributes.RestaurantsCounterService | \n","attributes.AgesAllowed | \n","attributes.DietaryRestrictions | \n","
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n","66ea4800e59c7c5b6d879305 | \n","Pns2l4eNsfO8kk83dixA6A | \n","Abby Rappoport, LAC, CMQ | \n","1616 Chapala St, Ste 2 | \n","Santa Barbara | \n","CA | \n","93101 | \n","34.426679 | \n","-119.711197 | \n","5.0 | \n","... | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","
| 1 | \n","66ea4800e59c7c5b6d879306 | \n","mpf3x-BjTdTEA3yCZrAYPw | \n","The UPS Store | \n","87 Grasso Plaza Shopping Center | \n","Affton | \n","MO | \n","63123 | \n","38.551126 | \n","-90.335695 | \n","3.0 | \n","... | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","
| 2 | \n","66ea4800e59c7c5b6d879307 | \n","tUFrWirKiKi_TAnsVWINQQ | \n","Target | \n","5255 E Broadway Blvd | \n","Tucson | \n","AZ | \n","85711 | \n","32.223236 | \n","-110.880452 | \n","3.5 | \n","... | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","NaN | \n","
3 rows × 61 columns
\n","| \n"," | _id | \n","user_id | \n","business_id | \n","text | \n","date | \n","compliment_count | \n","
|---|---|---|---|---|---|---|
| 0 | \n","66ea489ae59c7c5b6d8be1ac | \n","AGNUgVwnZUey3gcPCJ76iw | \n","3uLgwr0qeCNMjKenHJwPGQ | \n","Avengers time with the ladies. | \n","2012-05-18 02:17:21 | \n","0 | \n","
| 1 | \n","66ea489ae59c7c5b6d8be1ad | \n","NBN4MgHP9D3cw--SnauTkA | \n","QoezRbYQncpRqyrLH6Iqjg | \n","They have lots of good deserts and tasty cuban... | \n","2013-02-05 18:35:10 | \n","0 | \n","
| 2 | \n","66ea489ae59c7c5b6d8be1ae | \n","-copOvldyKh1qr-vzkDEvw | \n","MYoRNLb5chwjQe3c_k37Gg | \n","It's open even when you think it isn't | \n","2013-08-18 00:56:08 | \n","0 | \n","
| \n"," | _id | \n","business_id | \n","date | \n","
|---|---|---|---|
| 0 | \n","66ea4866e59c7c5b6d89de50 | \n","---kPU91CF4Lq2-WlRu9Lw | \n","2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020... | \n","
| 1 | \n","66ea4866e59c7c5b6d89de51 | \n","--0iUa4sNDFiZFrAdIWhZQ | \n","2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011... | \n","
| 2 | \n","66ea4866e59c7c5b6d89de52 | \n","--30_8IhuyMHbSOcNWd6DQ | \n","2013-06-14 23:29:17, 2014-08-13 23:20:22 | \n","
!pip install pyspark
-Collecting pyspark - Downloading pyspark-3.5.3.tar.gz (317.3 MB) - ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.6 MB/s eta 0:00:00 - Preparing metadata (setup.py) ... done -Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7) -Building wheels for collected packages: pyspark - Building wheel for pyspark (setup.py) ... done - Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=35be9fc497795aef00179529df4ea4eab1614099ff6562e5b837a73b9a813a86 - Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab -Successfully built pyspark -Installing collected packages: pyspark -Successfully installed pyspark-3.5.3 --
!pip install findspark
-Collecting findspark - Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes) -Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB) -Installing collected packages: findspark -Successfully installed findspark-2.0.1 --
!pip install pyarrow
-Requirement already satisfied: pyarrow in /usr/local/lib/python3.10/dist-packages (16.1.0) -Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.26.4) --
import findspark
-findspark.init()
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col
-from pyspark.sql.functions import *
-import time
-# Initialize SparkSession
-spark = SparkSession.builder.appName("CSVReader").getOrCreate()
-# Start time
-start_time = time.time()
-print(f"Execution started at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
-
-# Read the CSV data
-df = spark.read.csv("/content/drive/MyDrive/CSV/yelp_business.csv", header=True, inferSchema=True)
-
-# Display the entire dataframe (use with caution for large datasets)
-df.show()
-
-# End time
-end_time = time.time()
-print(f"Execution ended at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}")
-
-# Calculate and print the execution time
-execution_time = end_time - start_time
-print(f"Execution time: {execution_time:.2f} seconds")
-Execution started at: 2024-10-11 23:42:41
-+--------------------+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+-----+------------+-------+----------------------------+--------------------+-----+-------------------------------------+------------+-------------+---------------+--------------+------------+--------------+----------------------+---------------------------------+--------------------+-----------------------------+------------------------------+-----------------+---------------+--------------------------+-------------------------------+--------------------+-------------------------+----------------+----------------------------------+----------------------+------------+------------------+----------------------+----------------------------+--------------------+----------------------------------+-----------------------------------+--------------------+----------+---------------------+----------------------+---------------------------------+------------------+----------------+-------------------------+---------------------------+---------------------+---------------+------------------+----------------------+----------------------------+----------------------+------------------------------------+----------------------+------------------------------+
-| _id| business_id| name| address| city|state|postal_code| latitude| longitude|stars|review_count|is_open|attributes.ByAppointmentOnly| categories|hours|attributes.BusinessAcceptsCreditCards|hours.Monday|hours.Tuesday|hours.Wednesday|hours.Thursday|hours.Friday|hours.Saturday|attributes.BikeParking|attributes.RestaurantsPriceRange2|attributes.CoatCheck|attributes.RestaurantsTakeOut|attributes.RestaurantsDelivery|attributes.Caters|attributes.WiFi|attributes.BusinessParking|attributes.WheelchairAccessible|attributes.HappyHour|attributes.OutdoorSeating|attributes.HasTV|attributes.RestaurantsReservations|attributes.DogsAllowed|hours.Sunday|attributes.Alcohol|attributes.GoodForKids|attributes.RestaurantsAttire| attributes.Ambience|attributes.RestaurantsTableService|attributes.RestaurantsGoodForGroups|attributes.DriveThru|attributes|attributes.NoiseLevel|attributes.GoodForMeal|attributes.BusinessAcceptsBitcoin|attributes.Smoking|attributes.Music|attributes.GoodForDancing|attributes.AcceptsInsurance|attributes.BestNights|attributes.BYOB|attributes.Corkage|attributes.BYOBCorkage|attributes.HairSpecializesIn|attributes.Open24Hours|attributes.RestaurantsCounterService|attributes.AgesAllowed|attributes.DietaryRestrictions|
-+--------------------+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+-----+------------+-------+----------------------------+--------------------+-----+-------------------------------------+------------+-------------+---------------+--------------+------------+--------------+----------------------+---------------------------------+--------------------+-----------------------------+------------------------------+-----------------+---------------+--------------------------+-------------------------------+--------------------+-------------------------+----------------+----------------------------------+----------------------+------------+------------------+----------------------+----------------------------+--------------------+----------------------------------+-----------------------------------+--------------------+----------+---------------------+----------------------+---------------------------------+------------------+----------------+-------------------------+---------------------------+---------------------+---------------+------------------+----------------------+----------------------------+----------------------+------------------------------------+----------------------+------------------------------+
-|66ea4800e59c7c5b6...|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...| Santa Barbara| CA| 93101| 34.4266787| -119.7111968| 5.0| 7| 0| True|Doctors, Traditio...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|mpf3x-BjTdTEA3yCZ...| The UPS Store|87 Grasso Plaza S...| Affton| MO| 63123| 38.551126| -90.335695| 3.0| 15| 1| NULL|Shipping Centers,...| NULL| True| 0:0-0:0| 8:0-18:30| 8:0-18:30| 8:0-18:30| 8:0-18:30| 8:0-14:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|tUFrWirKiKi_TAnsV...| Target|5255 E Broadway Blvd| Tucson| AZ| 85711| 32.223236| -110.880452| 3.5| 22| 0| False|Department Stores...| NULL| True| 8:0-22:0| 8:0-22:0| 8:0-22:0| 8:0-22:0| 8:0-23:0| 8:0-23:0| True| 2| False| False| False| False| u'no'| {'garage': False,...| True| False| False| False| False| False| 8:0-22:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|MTSW4McQd7CbVtyjq...| St Honore Pastries| 935 Race St| Philadelphia| PA| 19107| 39.9555052| -75.1555641| 4.0| 80| 1| False|Restaurants, Food...| NULL| False| 7:0-20:0| 7:0-20:0| 7:0-20:0| 7:0-20:0| 7:0-21:0| 7:0-21:0| True| 1| NULL| True| False| True| u'free'| {'garage': False,...| NULL| NULL| False| NULL| NULL| NULL| 7:0-21:0| u'none'| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|mWMc6_wTdE0EUBKIG...|Perkiomen Valley ...| 101 Walnut St| Green Lane| PA| 18054| 40.3381827| -75.4716585| 4.5| 13| 1| NULL|Brewpubs, Breweri...| NULL| True| NULL| NULL| 14:0-22:0| 16:0-22:0| 12:0-22:0| 12:0-22:0| True| NULL| NULL| True| NULL| False| NULL| {'garage': None, ...| True| NULL| NULL| NULL| NULL| NULL| 12:0-18:0| NULL| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|CF33F8-E6oudUQ46H...| Sonic Drive-In| 615 S Main St| Ashland City| TN| 37015| 36.269593| -87.058943| 2.0| 6| 1| False|Burgers, Fast Foo...| NULL| True| 0:0-0:0| 6:0-22:0| 6:0-22:0| 6:0-22:0| 9:0-0:0| 9:0-22:0| False| 1| False| True| True| False| u'no'| None| True| False| True| True| False| False| 8:0-22:0| u'none'| True| u'casual'| None| False| True| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|n_0UpQx1hsNbnPUSl...| Famous Footwear|8522 Eager Road, ...| Brentwood| MO| 63144| 38.627695| -90.340465| 2.5| 13| 1| NULL|Sporting Goods, F...| NULL| True| 0:0-0:0| 10:0-18:0| 10:0-18:0| 10:0-18:0| 10:0-18:0| 10:0-18:0| True| 2| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| NULL| NULL| NULL| NULL| NULL| NULL| 12:0-18:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|qkRM_2X51Yqxk3btl...| Temple Beth-El| 400 Pasadena Ave S|St. Petersburg| FL| 33707| 27.76659| -82.732983| 3.5| 5| 1| NULL|Synagogues, Relig...| NULL| NULL| 9:0-17:0| 9:0-17:0| 9:0-17:0| 9:0-17:0| 9:0-17:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|k0hlBqXX-Bt0vf1op...|Tsevi's Pub And G...| 8025 Mackenzie Rd| Affton| MO| 63123| 38.5651648| -90.3210868| 3.0| 19| 0| NULL|Pubs, Restaurants...| NULL| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 1| NULL| True| False| True| u'free'| {'garage': False,...| NULL| NULL| True| True| False| NULL| NULL| u'full_bar'| True| u'casual'|{'romantic': Fals...| NULL| True| NULL| NULL| u'average'| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|bBDDEgkFA1Otx9Lfe...| Sonic Drive-In| 2312 Dickerson Pike| Nashville| TN| 37207| 36.2081024| -86.7681696| 1.5| 10| 1| False|Ice Cream & Froze...| NULL| True| 0:0-0:0| 6:0-21:0| 6:0-21:0| 6:0-16:0| 6:0-16:0| 6:0-17:0| NULL| 1| False| True| True| False| u'no'| {'garage': False,...| True| False| True| True| False| False| 6:0-21:0| u'none'| True| 'casual'| NULL| False| False| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|UJsufbvfyfONHeWdv...| Marshalls|21705 Village Lak...| Land O' Lakes| FL| 34639|28.1904587953|-82.4573802199| 3.5| 6| 1| NULL|Department Stores...| NULL| True| 9:30-21:30| 9:30-21:30| 9:30-21:30| 9:30-21:30| 9:30-21:30| 9:30-21:30| True| 2| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| NULL| NULL| NULL| NULL| NULL| NULL| 10:0-20:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|eEOYSgkmpB90uNA7l...|Vietnamese Food T...| NULL| Tampa Bay| FL| 33602| 27.9552692| -82.4563199| 4.0| 10| 1| NULL|Vietnamese, Food,...| NULL| NULL| 11:0-14:0| 11:0-14:0| 11:0-14:0| 11:0-14:0| 11:0-14:0| 5:0-10:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| NULL| NULL| None| NULL| False| NULL| 15:0-18:0| 'none'| NULL| NULL|{'touristy': Fals...| NULL| NULL| NULL| NULL| NULL| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|il_Ro8jwPlHresjw9...| Denny's| 8901 US 31 S| Indianapolis| IN| 46227|39.6371332838| -86.127217412| 2.5| 28| 1| NULL|American (Traditi...| NULL| True| 6:0-22:0| 6:0-22:0| 6:0-22:0| 6:0-22:0| 6:0-22:0| 6:0-22:0| False| 1| NULL| True| True| NULL| u'no'| {'garage': None, ...| NULL| NULL| False| True| False| NULL| 6:0-22:0| 'none'| True| 'casual'|{'touristy': None...| NULL| True| NULL| NULL| NULL| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|jaxMSoInw8Poo3XeM...| Adams Dental| 15 N Missouri Ave| Clearwater| FL| 33755| 27.966235| -82.787412| 5.0| 10| 1| True|General Dentistry...| NULL| NULL| 7:30-15:30| 7:30-15:30| 7:30-15:30| 7:30-15:30| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|0bPLkL0QhhPO5kt1_...|Zio's Italian Market| 2575 E Bay Dr| Largo| FL| 33771| 27.9161159| -82.7604608| 4.5| 100| 0| NULL|Food, Delis, Ital...| NULL| True| 10:0-18:0| 10:0-20:0| 10:0-20:0| 10:0-20:0| 10:0-20:0| 10:0-20:0| True| 1| NULL| True| True| True| u'no'| {'garage': False,...| True| NULL| False| True| False| NULL| NULL| u'none'| True| u'casual'|{'romantic': Fals...| False| False| NULL| NULL| u'average'| {'dessert': False...| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|MUTTqe8uqyMdBl186...| Tuna Bar| 205 Race St| Philadelphia| PA| 19106| 39.953949| -75.1432262| 4.0| 245| 1| NULL|Sushi Bars, Resta...| NULL| True| NULL| 13:30-22:0| 13:30-22:0| 13:30-22:0| 13:30-23:0| 13:30-23:0| NULL| 2| NULL| True| True| NULL| 'free'| {u'valet': False,...| True| True| True| False| True| False| 13:30-22:0| 'full_bar'| False| 'casual'|{'touristy': Fals...| True| True| NULL| NULL| u'average'| {'dessert': True,...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|rBmpy_Y1UbBx8ggHl...|Arizona Truck Out...| 625 N Stone Ave| Tucson| AZ| 85705| 32.2298719| -110.9723419| 4.5| 10| 1| False|Automotive, Auto ...| NULL| True| 0:0-0:0| 8:0-17:0| 8:0-17:0| 8:0-17:0| 8:0-17:0| 8:0-14:0| NULL| NULL| NULL| NULL| NULL| NULL| u'free'| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 0:0-0:0| NULL| NULL| NULL| NULL| NULL| NULL| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|M0XSSHqrASOnhgbWD...| Herb Import Co| 712 Adams St| New Orleans| LA| 70118|29.9414679565| -90.129952757| 4.0| 5| 1| NULL|Vape Shops, Tobac...| NULL| True| 10:0-19:0| 10:0-19:0| 10:0-19:0| 10:0-19:0| 10:0-19:0| 10:0-19:0| True| 2| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| NULL| NULL| NULL| NULL| NULL| NULL| 10:0-19:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|8wGISYjYkE2tSqn3c...| Nifty Car Rental| 1241 Airline Dr| Kenner| LA| 70062| 29.981183| -90.2540123| 3.5| 14| 1| NULL|Automotive, Car R...| NULL| NULL| 8:0-17:0| 8:0-17:0| 8:0-17:0| 8:0-17:0| 8:0-17:0| 9:0-15:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 9:0-12:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|66ea4800e59c7c5b6...|ROeacJQwBeh05Rqg7...| BAP| 1224 South St| Philadelphia| PA| 19147| 39.943223| -75.162568| 4.5| 205| 1| NULL| Korean, Restaurants| NULL| True| 11:30-20:30| 11:30-20:30| 11:30-20:30| 11:30-20:30| 11:30-20:30| 11:30-20:30| True| 1| NULL| True| None| True| u'no'| {'garage': False,...| NULL| NULL| None| True| False| NULL| NULL| u'none'| True| u'casual'|{'touristy': Fals...| True| False| NULL| NULL| u'quiet'| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-+--------------------+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+-----+------------+-------+----------------------------+--------------------+-----+-------------------------------------+------------+-------------+---------------+--------------+------------+--------------+----------------------+---------------------------------+--------------------+-----------------------------+------------------------------+-----------------+---------------+--------------------------+-------------------------------+--------------------+-------------------------+----------------+----------------------------------+----------------------+------------+------------------+----------------------+----------------------------+--------------------+----------------------------------+-----------------------------------+--------------------+----------+---------------------+----------------------+---------------------------------+------------------+----------------+-------------------------+---------------------------+---------------------+---------------+------------------+----------------------+----------------------------+----------------------+------------------------------------+----------------------+------------------------------+
-only showing top 20 rows
-
-Execution ended at: 2024-10-11 23:42:58
-Execution time: 17.29 seconds
-
-# total records in the dataframe
-
-total_records = df.count()
-print(f"Total records in the dataframe: {total_records}")
-Total records in the dataframe: 150346 --
# dropping the column _id
-
-df_business = df.drop('_id')
-df_business.show()
-+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+-----+------------+-------+----------------------------+--------------------+-----+-------------------------------------+------------+-------------+---------------+--------------+------------+--------------+----------------------+---------------------------------+--------------------+-----------------------------+------------------------------+-----------------+---------------+--------------------------+-------------------------------+--------------------+-------------------------+----------------+----------------------------------+----------------------+------------+------------------+----------------------+----------------------------+--------------------+----------------------------------+-----------------------------------+--------------------+----------+---------------------+----------------------+---------------------------------+------------------+----------------+-------------------------+---------------------------+---------------------+---------------+------------------+----------------------+----------------------------+----------------------+------------------------------------+----------------------+------------------------------+
-| business_id| name| address| city|state|postal_code| latitude| longitude|stars|review_count|is_open|attributes.ByAppointmentOnly| categories|hours|attributes.BusinessAcceptsCreditCards|hours.Monday|hours.Tuesday|hours.Wednesday|hours.Thursday|hours.Friday|hours.Saturday|attributes.BikeParking|attributes.RestaurantsPriceRange2|attributes.CoatCheck|attributes.RestaurantsTakeOut|attributes.RestaurantsDelivery|attributes.Caters|attributes.WiFi|attributes.BusinessParking|attributes.WheelchairAccessible|attributes.HappyHour|attributes.OutdoorSeating|attributes.HasTV|attributes.RestaurantsReservations|attributes.DogsAllowed|hours.Sunday|attributes.Alcohol|attributes.GoodForKids|attributes.RestaurantsAttire| attributes.Ambience|attributes.RestaurantsTableService|attributes.RestaurantsGoodForGroups|attributes.DriveThru|attributes|attributes.NoiseLevel|attributes.GoodForMeal|attributes.BusinessAcceptsBitcoin|attributes.Smoking|attributes.Music|attributes.GoodForDancing|attributes.AcceptsInsurance|attributes.BestNights|attributes.BYOB|attributes.Corkage|attributes.BYOBCorkage|attributes.HairSpecializesIn|attributes.Open24Hours|attributes.RestaurantsCounterService|attributes.AgesAllowed|attributes.DietaryRestrictions|
-+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+-----+------------+-------+----------------------------+--------------------+-----+-------------------------------------+------------+-------------+---------------+--------------+------------+--------------+----------------------+---------------------------------+--------------------+-----------------------------+------------------------------+-----------------+---------------+--------------------------+-------------------------------+--------------------+-------------------------+----------------+----------------------------------+----------------------+------------+------------------+----------------------+----------------------------+--------------------+----------------------------------+-----------------------------------+--------------------+----------+---------------------+----------------------+---------------------------------+------------------+----------------+-------------------------+---------------------------+---------------------+---------------+------------------+----------------------+----------------------------+----------------------+------------------------------------+----------------------+------------------------------+
-|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...| Santa Barbara| CA| 93101| 34.4266787| -119.7111968| 5.0| 7| 0| True|Doctors, Traditio...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|mpf3x-BjTdTEA3yCZ...| The UPS Store|87 Grasso Plaza S...| Affton| MO| 63123| 38.551126| -90.335695| 3.0| 15| 1| NULL|Shipping Centers,...| NULL| True| 0:0-0:0| 8:0-18:30| 8:0-18:30| 8:0-18:30| 8:0-18:30| 8:0-14:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|tUFrWirKiKi_TAnsV...| Target|5255 E Broadway Blvd| Tucson| AZ| 85711| 32.223236| -110.880452| 3.5| 22| 0| False|Department Stores...| NULL| True| 8:0-22:0| 8:0-22:0| 8:0-22:0| 8:0-22:0| 8:0-23:0| 8:0-23:0| True| 2| False| False| False| False| u'no'| {'garage': False,...| True| False| False| False| False| False| 8:0-22:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|MTSW4McQd7CbVtyjq...| St Honore Pastries| 935 Race St| Philadelphia| PA| 19107| 39.9555052| -75.1555641| 4.0| 80| 1| False|Restaurants, Food...| NULL| False| 7:0-20:0| 7:0-20:0| 7:0-20:0| 7:0-20:0| 7:0-21:0| 7:0-21:0| True| 1| NULL| True| False| True| u'free'| {'garage': False,...| NULL| NULL| False| NULL| NULL| NULL| 7:0-21:0| u'none'| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|mWMc6_wTdE0EUBKIG...|Perkiomen Valley ...| 101 Walnut St| Green Lane| PA| 18054| 40.3381827| -75.4716585| 4.5| 13| 1| NULL|Brewpubs, Breweri...| NULL| True| NULL| NULL| 14:0-22:0| 16:0-22:0| 12:0-22:0| 12:0-22:0| True| NULL| NULL| True| NULL| False| NULL| {'garage': None, ...| True| NULL| NULL| NULL| NULL| NULL| 12:0-18:0| NULL| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|CF33F8-E6oudUQ46H...| Sonic Drive-In| 615 S Main St| Ashland City| TN| 37015| 36.269593| -87.058943| 2.0| 6| 1| False|Burgers, Fast Foo...| NULL| True| 0:0-0:0| 6:0-22:0| 6:0-22:0| 6:0-22:0| 9:0-0:0| 9:0-22:0| False| 1| False| True| True| False| u'no'| None| True| False| True| True| False| False| 8:0-22:0| u'none'| True| u'casual'| None| False| True| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|n_0UpQx1hsNbnPUSl...| Famous Footwear|8522 Eager Road, ...| Brentwood| MO| 63144| 38.627695| -90.340465| 2.5| 13| 1| NULL|Sporting Goods, F...| NULL| True| 0:0-0:0| 10:0-18:0| 10:0-18:0| 10:0-18:0| 10:0-18:0| 10:0-18:0| True| 2| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| NULL| NULL| NULL| NULL| NULL| NULL| 12:0-18:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|qkRM_2X51Yqxk3btl...| Temple Beth-El| 400 Pasadena Ave S|St. Petersburg| FL| 33707| 27.76659| -82.732983| 3.5| 5| 1| NULL|Synagogues, Relig...| NULL| NULL| 9:0-17:0| 9:0-17:0| 9:0-17:0| 9:0-17:0| 9:0-17:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|k0hlBqXX-Bt0vf1op...|Tsevi's Pub And G...| 8025 Mackenzie Rd| Affton| MO| 63123| 38.5651648| -90.3210868| 3.0| 19| 0| NULL|Pubs, Restaurants...| NULL| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 1| NULL| True| False| True| u'free'| {'garage': False,...| NULL| NULL| True| True| False| NULL| NULL| u'full_bar'| True| u'casual'|{'romantic': Fals...| NULL| True| NULL| NULL| u'average'| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|bBDDEgkFA1Otx9Lfe...| Sonic Drive-In| 2312 Dickerson Pike| Nashville| TN| 37207| 36.2081024| -86.7681696| 1.5| 10| 1| False|Ice Cream & Froze...| NULL| True| 0:0-0:0| 6:0-21:0| 6:0-21:0| 6:0-16:0| 6:0-16:0| 6:0-17:0| NULL| 1| False| True| True| False| u'no'| {'garage': False,...| True| False| True| True| False| False| 6:0-21:0| u'none'| True| 'casual'| NULL| False| False| True| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|UJsufbvfyfONHeWdv...| Marshalls|21705 Village Lak...| Land O' Lakes| FL| 34639|28.1904587953|-82.4573802199| 3.5| 6| 1| NULL|Department Stores...| NULL| True| 9:30-21:30| 9:30-21:30| 9:30-21:30| 9:30-21:30| 9:30-21:30| 9:30-21:30| True| 2| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| NULL| NULL| NULL| NULL| NULL| NULL| 10:0-20:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|eEOYSgkmpB90uNA7l...|Vietnamese Food T...| NULL| Tampa Bay| FL| 33602| 27.9552692| -82.4563199| 4.0| 10| 1| NULL|Vietnamese, Food,...| NULL| NULL| 11:0-14:0| 11:0-14:0| 11:0-14:0| 11:0-14:0| 11:0-14:0| 5:0-10:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| NULL| NULL| None| NULL| False| NULL| 15:0-18:0| 'none'| NULL| NULL|{'touristy': Fals...| NULL| NULL| NULL| NULL| NULL| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|il_Ro8jwPlHresjw9...| Denny's| 8901 US 31 S| Indianapolis| IN| 46227|39.6371332838| -86.127217412| 2.5| 28| 1| NULL|American (Traditi...| NULL| True| 6:0-22:0| 6:0-22:0| 6:0-22:0| 6:0-22:0| 6:0-22:0| 6:0-22:0| False| 1| NULL| True| True| NULL| u'no'| {'garage': None, ...| NULL| NULL| False| True| False| NULL| 6:0-22:0| 'none'| True| 'casual'|{'touristy': None...| NULL| True| NULL| NULL| NULL| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|jaxMSoInw8Poo3XeM...| Adams Dental| 15 N Missouri Ave| Clearwater| FL| 33755| 27.966235| -82.787412| 5.0| 10| 1| True|General Dentistry...| NULL| NULL| 7:30-15:30| 7:30-15:30| 7:30-15:30| 7:30-15:30| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|0bPLkL0QhhPO5kt1_...|Zio's Italian Market| 2575 E Bay Dr| Largo| FL| 33771| 27.9161159| -82.7604608| 4.5| 100| 0| NULL|Food, Delis, Ital...| NULL| True| 10:0-18:0| 10:0-20:0| 10:0-20:0| 10:0-20:0| 10:0-20:0| 10:0-20:0| True| 1| NULL| True| True| True| u'no'| {'garage': False,...| True| NULL| False| True| False| NULL| NULL| u'none'| True| u'casual'|{'romantic': Fals...| False| False| NULL| NULL| u'average'| {'dessert': False...| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|MUTTqe8uqyMdBl186...| Tuna Bar| 205 Race St| Philadelphia| PA| 19106| 39.953949| -75.1432262| 4.0| 245| 1| NULL|Sushi Bars, Resta...| NULL| True| NULL| 13:30-22:0| 13:30-22:0| 13:30-22:0| 13:30-23:0| 13:30-23:0| NULL| 2| NULL| True| True| NULL| 'free'| {u'valet': False,...| True| True| True| False| True| False| 13:30-22:0| 'full_bar'| False| 'casual'|{'touristy': Fals...| True| True| NULL| NULL| u'average'| {'dessert': True,...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|rBmpy_Y1UbBx8ggHl...|Arizona Truck Out...| 625 N Stone Ave| Tucson| AZ| 85705| 32.2298719| -110.9723419| 4.5| 10| 1| False|Automotive, Auto ...| NULL| True| 0:0-0:0| 8:0-17:0| 8:0-17:0| 8:0-17:0| 8:0-17:0| 8:0-14:0| NULL| NULL| NULL| NULL| NULL| NULL| u'free'| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 0:0-0:0| NULL| NULL| NULL| NULL| NULL| NULL| False| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|M0XSSHqrASOnhgbWD...| Herb Import Co| 712 Adams St| New Orleans| LA| 70118|29.9414679565| -90.129952757| 4.0| 5| 1| NULL|Vape Shops, Tobac...| NULL| True| 10:0-19:0| 10:0-19:0| 10:0-19:0| 10:0-19:0| 10:0-19:0| 10:0-19:0| True| 2| NULL| NULL| NULL| NULL| NULL| {'garage': False,...| NULL| NULL| NULL| NULL| NULL| NULL| 10:0-19:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|8wGISYjYkE2tSqn3c...| Nifty Car Rental| 1241 Airline Dr| Kenner| LA| 70062| 29.981183| -90.2540123| 3.5| 14| 1| NULL|Automotive, Car R...| NULL| NULL| 8:0-17:0| 8:0-17:0| 8:0-17:0| 8:0-17:0| 8:0-17:0| 9:0-15:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| 9:0-12:0| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-|ROeacJQwBeh05Rqg7...| BAP| 1224 South St| Philadelphia| PA| 19147| 39.943223| -75.162568| 4.5| 205| 1| NULL| Korean, Restaurants| NULL| True| 11:30-20:30| 11:30-20:30| 11:30-20:30| 11:30-20:30| 11:30-20:30| 11:30-20:30| True| 1| NULL| True| None| True| u'no'| {'garage': False,...| NULL| NULL| None| True| False| NULL| NULL| u'none'| True| u'casual'|{'touristy': Fals...| True| False| NULL| NULL| u'quiet'| {'dessert': False...| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL| NULL|
-+--------------------+--------------------+--------------------+--------------+-----+-----------+-------------+--------------+-----+------------+-------+----------------------------+--------------------+-----+-------------------------------------+------------+-------------+---------------+--------------+------------+--------------+----------------------+---------------------------------+--------------------+-----------------------------+------------------------------+-----------------+---------------+--------------------------+-------------------------------+--------------------+-------------------------+----------------+----------------------------------+----------------------+------------+------------------+----------------------+----------------------------+--------------------+----------------------------------+-----------------------------------+--------------------+----------+---------------------+----------------------+---------------------------------+------------------+----------------+-------------------------+---------------------------+---------------------+---------------+------------------+----------------------+----------------------------+----------------------+------------------------------------+----------------------+------------------------------+
-only showing top 20 rows
-
-
-# # prompt: print the values in the name field for all records; print them in a loop so that every value is shown
-
-# for row in df_business.select("name").collect():
-# print(row.name)
-# prompt: write this as a csv to /content/drive/MyDrive/ProcessedCSV named business.csv
-
-# Write the DataFrame to a CSV file in Google Drive
-df_business.write.csv("/content/drive/MyDrive/ProcessedCSV/business.csv", header=True)
-
-!pip install pyspark
-Collecting pyspark - Downloading pyspark-3.5.3.tar.gz (317.3 MB) - ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.7 MB/s eta 0:00:00 - Preparing metadata (setup.py) ... done -Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7) -Building wheels for collected packages: pyspark - Building wheel for pyspark (setup.py) ... done - Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=3dd61b0974866d4e607af1d47569042a0b2eec9f8ad7f1f009ed3c906afba77c - Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab -Successfully built pyspark -Installing collected packages: pyspark -Successfully installed pyspark-3.5.3 --
!pip install findspark
-Collecting findspark - Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes) -Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB) -Installing collected packages: findspark -Successfully installed findspark-2.0.1 --
!pip install pyarrow
-Requirement already satisfied: pyarrow in /usr/local/lib/python3.10/dist-packages (16.1.0) -Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.26.4) --
import findspark
-findspark.init()
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col
-from pyspark.sql.functions import *
-import time
-# Initialize SparkSession
-spark = SparkSession.builder.appName("CSVReader").getOrCreate()
-# Start time
-start_time = time.time()
-print(f"Execution started at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
-
-# Read the CSV data
-df = spark.read.csv("/content/drive/MyDrive/CSV/yelp_tip.csv", header=True, inferSchema=True)
-
-# Display the first 20 rows of the dataframe (show() prints only the top 20 rows by default)
-df.show()
-
-# End time
-end_time = time.time()
-print(f"Execution ended at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}")
-
-# Calculate and print the execution time
-execution_time = end_time - start_time
-print(f"Execution time: {execution_time:.2f} seconds")
-Execution started at: 2024-10-12 00:27:44 -+--------------------+--------------------+--------------------+--------------------+-------------------+----------------+ -| _id| user_id| business_id| text| date|compliment_count| -+--------------------+--------------------+--------------------+--------------------+-------------------+----------------+ -|66ea489ae59c7c5b6...|AGNUgVwnZUey3gcPC...|3uLgwr0qeCNMjKenH...|Avengers time wit...|2012-05-18 02:17:21| 0| -|66ea489ae59c7c5b6...|NBN4MgHP9D3cw--Sn...|QoezRbYQncpRqyrLH...|They have lots of...|2013-02-05 18:35:10| 0| -|66ea489ae59c7c5b6...|-copOvldyKh1qr-vz...|MYoRNLb5chwjQe3c_...|It's open even wh...|2013-08-18 00:56:08| 0| -|66ea489ae59c7c5b6...|FjMQVZjSqY8syIO-5...|hV-bABTK-glh5wj31...|Very decent fried...|2017-06-27 23:05:38| 0| -|66ea489ae59c7c5b6...|ld0AperBXk1h6Ubqm...|_uN0OudeJ3Zl_tf6n...|Appetizers.. plat...|2012-10-06 19:43:09| 0| -|66ea489ae59c7c5b6...|trf3Qcz8qvCDKXiTg...|7Rm9Ba50bw23KTA8R...|Chili Cup + Singl...|2012-03-13 04:00:52| 0| -|66ea489ae59c7c5b6...|SMGAlRjyfuYu-c-22...|kH-0iXqkL7b8UXNpg...|Saturday, Dec 7th...|2013-12-03 23:42:15| 0| -|66ea489ae59c7c5b6...|YVBB9g23nuVJ0u44z...|jtri188kuhe_AuEOJ...|This is probably ...|2016-11-22 22:14:58| 0| -|66ea489ae59c7c5b6...|VL12EhEdT4OWqGq0n...|xODBZmX4EmlVvbqtK...| Tacos|2012-07-27 01:48:24| 0| -|66ea489ae59c7c5b6...|4ay-fdVks5WMerYL_...|pICJRcyqW1cF96Q3X...|Starbucks substit...|2012-06-09 22:57:04| 0| -|66ea489ae59c7c5b6...|OttfcRxgRrYsTg9EV...|clwjLY7PdYJpe7IP9...|Order the Tortill...|2014-06-17 01:20:14| 0| -|66ea489ae59c7c5b6...|JsXhBw6MntzTJjH_U...|wLHodvVFLTgK3nl2X...|Very good will de...|2017-03-23 22:01:41| 0| -|66ea489ae59c7c5b6...|Y0JfJh4B-jrtGc_AH...|wUMuvdUeVZODZk7Tj...|If the Hotlight i...|2013-02-28 02:05:54| 0| -|66ea489ae59c7c5b6...|MlnuJ7T14CE0JDK2Z...|MDr7KLYSPkEonvGoj...| Let's go Yankees!|2011-07-20 21:52:57| 0| -|66ea489ae59c7c5b6...|ffWWVlmsrN5lZ6sjA...|aK6R2akvIK9ijw3Fv...|Basically same fo...|2014-06-12 17:34:20| 0| 
-|66ea489ae59c7c5b6...|j2sEA3hiUcwHfq9Ml...|EXYbKA1tocvOK_1tX...|Don't go for dinn...|2011-10-13 03:15:15| 0| -|66ea489ae59c7c5b6...|jsaN4TDygu76AGTiB...|H9fkf4Xkj_j7Zxs1F...|30 mins for take ...|2012-03-11 23:16:12| 0| -|66ea489ae59c7c5b6...|kjFgyrCvmVVGSlgWz...|ReX09lhufLTAx19kr...|Got the grilled c...|2013-06-10 20:18:41| 0| -|66ea489ae59c7c5b6...|I6aRZ4sE7ixv0_2r3...|c5nLy7YgXG-IIrOmq...|This is the bomb ...|2016-04-23 02:44:03| 0| -|66ea489ae59c7c5b6...|Ll5l4WTKPH7zWQWA6...|LJaR65ALpz261_dlV...|Helping Mona find...|2012-06-02 14:39:28| 0| -+--------------------+--------------------+--------------------+--------------------+-------------------+----------------+ -only showing top 20 rows - -Execution ended at: 2024-10-12 00:28:04 -Execution time: 20.11 seconds --
# total records in the dataframe
-
-total_records = df.count()
-print(f"Total records in the dataframe: {total_records}")
-Total records in the dataframe: 908915 --
# dropping the column _id
-
-df_tip = df.drop('_id')
-df_tip.show()
-+--------------------+--------------------+--------------------+-------------------+----------------+ -| user_id| business_id| text| date|compliment_count| -+--------------------+--------------------+--------------------+-------------------+----------------+ -|AGNUgVwnZUey3gcPC...|3uLgwr0qeCNMjKenH...|Avengers time wit...|2012-05-18 02:17:21| 0| -|NBN4MgHP9D3cw--Sn...|QoezRbYQncpRqyrLH...|They have lots of...|2013-02-05 18:35:10| 0| -|-copOvldyKh1qr-vz...|MYoRNLb5chwjQe3c_...|It's open even wh...|2013-08-18 00:56:08| 0| -|FjMQVZjSqY8syIO-5...|hV-bABTK-glh5wj31...|Very decent fried...|2017-06-27 23:05:38| 0| -|ld0AperBXk1h6Ubqm...|_uN0OudeJ3Zl_tf6n...|Appetizers.. plat...|2012-10-06 19:43:09| 0| -|trf3Qcz8qvCDKXiTg...|7Rm9Ba50bw23KTA8R...|Chili Cup + Singl...|2012-03-13 04:00:52| 0| -|SMGAlRjyfuYu-c-22...|kH-0iXqkL7b8UXNpg...|Saturday, Dec 7th...|2013-12-03 23:42:15| 0| -|YVBB9g23nuVJ0u44z...|jtri188kuhe_AuEOJ...|This is probably ...|2016-11-22 22:14:58| 0| -|VL12EhEdT4OWqGq0n...|xODBZmX4EmlVvbqtK...| Tacos|2012-07-27 01:48:24| 0| -|4ay-fdVks5WMerYL_...|pICJRcyqW1cF96Q3X...|Starbucks substit...|2012-06-09 22:57:04| 0| -|OttfcRxgRrYsTg9EV...|clwjLY7PdYJpe7IP9...|Order the Tortill...|2014-06-17 01:20:14| 0| -|JsXhBw6MntzTJjH_U...|wLHodvVFLTgK3nl2X...|Very good will de...|2017-03-23 22:01:41| 0| -|Y0JfJh4B-jrtGc_AH...|wUMuvdUeVZODZk7Tj...|If the Hotlight i...|2013-02-28 02:05:54| 0| -|MlnuJ7T14CE0JDK2Z...|MDr7KLYSPkEonvGoj...| Let's go Yankees!|2011-07-20 21:52:57| 0| -|ffWWVlmsrN5lZ6sjA...|aK6R2akvIK9ijw3Fv...|Basically same fo...|2014-06-12 17:34:20| 0| -|j2sEA3hiUcwHfq9Ml...|EXYbKA1tocvOK_1tX...|Don't go for dinn...|2011-10-13 03:15:15| 0| -|jsaN4TDygu76AGTiB...|H9fkf4Xkj_j7Zxs1F...|30 mins for take ...|2012-03-11 23:16:12| 0| -|kjFgyrCvmVVGSlgWz...|ReX09lhufLTAx19kr...|Got the grilled c...|2013-06-10 20:18:41| 0| -|I6aRZ4sE7ixv0_2r3...|c5nLy7YgXG-IIrOmq...|This is the bomb ...|2016-04-23 02:44:03| 0| -|Ll5l4WTKPH7zWQWA6...|LJaR65ALpz261_dlV...|Helping Mona 
find...|2012-06-02 14:39:28| 0| -+--------------------+--------------------+--------------------+-------------------+----------------+ -only showing top 20 rows - --
# Split the date column into separate date and time columns; trim the date column first to avoid errors, then split.
-
-from pyspark.sql.functions import split, to_timestamp
-
-# Trim the date column to avoid errors
-df_tip = df_tip.withColumn("date_trimmed", trim(df_tip["date"]))
-
-# Split the date column into date and time columns
-df_tip = df_tip.withColumn("date_trimmed", to_timestamp("date_trimmed")) \
- .withColumn("date", date_format("date_trimmed", "yyyy-MM-dd")) \
- .withColumn("time", date_format("date_trimmed", "HH:mm:ss"))
-
-df_tip = df_tip.drop("date_trimmed")
-df_tip.show()
-+--------------------+--------------------+--------------------+----------+----------------+--------+ -| user_id| business_id| text| date|compliment_count| time| -+--------------------+--------------------+--------------------+----------+----------------+--------+ -|AGNUgVwnZUey3gcPC...|3uLgwr0qeCNMjKenH...|Avengers time wit...|2012-05-18| 0|02:17:21| -|NBN4MgHP9D3cw--Sn...|QoezRbYQncpRqyrLH...|They have lots of...|2013-02-05| 0|18:35:10| -|-copOvldyKh1qr-vz...|MYoRNLb5chwjQe3c_...|It's open even wh...|2013-08-18| 0|00:56:08| -|FjMQVZjSqY8syIO-5...|hV-bABTK-glh5wj31...|Very decent fried...|2017-06-27| 0|23:05:38| -|ld0AperBXk1h6Ubqm...|_uN0OudeJ3Zl_tf6n...|Appetizers.. plat...|2012-10-06| 0|19:43:09| -|trf3Qcz8qvCDKXiTg...|7Rm9Ba50bw23KTA8R...|Chili Cup + Singl...|2012-03-13| 0|04:00:52| -|SMGAlRjyfuYu-c-22...|kH-0iXqkL7b8UXNpg...|Saturday, Dec 7th...|2013-12-03| 0|23:42:15| -|YVBB9g23nuVJ0u44z...|jtri188kuhe_AuEOJ...|This is probably ...|2016-11-22| 0|22:14:58| -|VL12EhEdT4OWqGq0n...|xODBZmX4EmlVvbqtK...| Tacos|2012-07-27| 0|01:48:24| -|4ay-fdVks5WMerYL_...|pICJRcyqW1cF96Q3X...|Starbucks substit...|2012-06-09| 0|22:57:04| -|OttfcRxgRrYsTg9EV...|clwjLY7PdYJpe7IP9...|Order the Tortill...|2014-06-17| 0|01:20:14| -|JsXhBw6MntzTJjH_U...|wLHodvVFLTgK3nl2X...|Very good will de...|2017-03-23| 0|22:01:41| -|Y0JfJh4B-jrtGc_AH...|wUMuvdUeVZODZk7Tj...|If the Hotlight i...|2013-02-28| 0|02:05:54| -|MlnuJ7T14CE0JDK2Z...|MDr7KLYSPkEonvGoj...| Let's go Yankees!|2011-07-20| 0|21:52:57| -|ffWWVlmsrN5lZ6sjA...|aK6R2akvIK9ijw3Fv...|Basically same fo...|2014-06-12| 0|17:34:20| -|j2sEA3hiUcwHfq9Ml...|EXYbKA1tocvOK_1tX...|Don't go for dinn...|2011-10-13| 0|03:15:15| -|jsaN4TDygu76AGTiB...|H9fkf4Xkj_j7Zxs1F...|30 mins for take ...|2012-03-11| 0|23:16:12| -|kjFgyrCvmVVGSlgWz...|ReX09lhufLTAx19kr...|Got the grilled c...|2013-06-10| 0|20:18:41| -|I6aRZ4sE7ixv0_2r3...|c5nLy7YgXG-IIrOmq...|This is the bomb ...|2016-04-23| 0|02:44:03| -|Ll5l4WTKPH7zWQWA6...|LJaR65ALpz261_dlV...|Helping Mona 
find...|2012-06-02| 0|14:39:28| -+--------------------+--------------------+--------------------+----------+----------------+--------+ -only showing top 20 rows - --
# Write this as a CSV to /content/drive/MyDrive/ProcessedCSV and name it tip.csv
-
-# Write the DataFrame to a CSV file in your Google Drive
-df_tip.write.mode('overwrite').csv("/content/drive/MyDrive/ProcessedCSV/tip.csv", header=True)
-# read from /content/drive/MyDrive/ProcessedCSV/tip.csv
-
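-# Read the CSV data back (note: inferSchema=True parses the HH:mm:ss "time" column as a timestamp, so it is displayed with the current date prepended)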
-df_tip_read = spark.read.csv("/content/drive/MyDrive/ProcessedCSV/tip.csv", header=True, inferSchema=True)
-
-# Display the first 20 rows of the dataframe (show() prints only the top 20 rows by default)
-df_tip_read.show()
-+--------------------+--------------------+--------------------+----------+----------------+-------------------+ -| user_id| business_id| text| date|compliment_count| time| -+--------------------+--------------------+--------------------+----------+----------------+-------------------+ -|AGNUgVwnZUey3gcPC...|3uLgwr0qeCNMjKenH...|Avengers time wit...|2012-05-18| 0|2024-10-12 02:17:21| -|NBN4MgHP9D3cw--Sn...|QoezRbYQncpRqyrLH...|They have lots of...|2013-02-05| 0|2024-10-12 18:35:10| -|-copOvldyKh1qr-vz...|MYoRNLb5chwjQe3c_...|It's open even wh...|2013-08-18| 0|2024-10-12 00:56:08| -|FjMQVZjSqY8syIO-5...|hV-bABTK-glh5wj31...|Very decent fried...|2017-06-27| 0|2024-10-12 23:05:38| -|ld0AperBXk1h6Ubqm...|_uN0OudeJ3Zl_tf6n...|Appetizers.. plat...|2012-10-06| 0|2024-10-12 19:43:09| -|trf3Qcz8qvCDKXiTg...|7Rm9Ba50bw23KTA8R...|Chili Cup + Singl...|2012-03-13| 0|2024-10-12 04:00:52| -|SMGAlRjyfuYu-c-22...|kH-0iXqkL7b8UXNpg...|Saturday, Dec 7th...|2013-12-03| 0|2024-10-12 23:42:15| -|YVBB9g23nuVJ0u44z...|jtri188kuhe_AuEOJ...|This is probably ...|2016-11-22| 0|2024-10-12 22:14:58| -|VL12EhEdT4OWqGq0n...|xODBZmX4EmlVvbqtK...| Tacos|2012-07-27| 0|2024-10-12 01:48:24| -|4ay-fdVks5WMerYL_...|pICJRcyqW1cF96Q3X...|Starbucks substit...|2012-06-09| 0|2024-10-12 22:57:04| -|OttfcRxgRrYsTg9EV...|clwjLY7PdYJpe7IP9...|Order the Tortill...|2014-06-17| 0|2024-10-12 01:20:14| -|JsXhBw6MntzTJjH_U...|wLHodvVFLTgK3nl2X...|Very good will de...|2017-03-23| 0|2024-10-12 22:01:41| -|Y0JfJh4B-jrtGc_AH...|wUMuvdUeVZODZk7Tj...|If the Hotlight i...|2013-02-28| 0|2024-10-12 02:05:54| -|MlnuJ7T14CE0JDK2Z...|MDr7KLYSPkEonvGoj...| Let's go Yankees!|2011-07-20| 0|2024-10-12 21:52:57| -|ffWWVlmsrN5lZ6sjA...|aK6R2akvIK9ijw3Fv...|Basically same fo...|2014-06-12| 0|2024-10-12 17:34:20| -|j2sEA3hiUcwHfq9Ml...|EXYbKA1tocvOK_1tX...|Don't go for dinn...|2011-10-13| 0|2024-10-12 03:15:15| -|jsaN4TDygu76AGTiB...|H9fkf4Xkj_j7Zxs1F...|30 mins for take ...|2012-03-11| 0|2024-10-12 23:16:12| 
-|kjFgyrCvmVVGSlgWz...|ReX09lhufLTAx19kr...|Got the grilled c...|2013-06-10| 0|2024-10-12 20:18:41| -|I6aRZ4sE7ixv0_2r3...|c5nLy7YgXG-IIrOmq...|This is the bomb ...|2016-04-23| 0|2024-10-12 02:44:03| -|Ll5l4WTKPH7zWQWA6...|LJaR65ALpz261_dlV...|Helping Mona find...|2012-06-02| 0|2024-10-12 14:39:28| -+--------------------+--------------------+--------------------+----------+----------------+-------------------+ -only showing top 20 rows - --
-| \n", + " | _id | \n", + "user_id | \n", + "business_id | \n", + "text | \n", + "date | \n", + "compliment_count | \n", + "
|---|---|---|---|---|---|---|
| 0 | \n", + "66ea489ae59c7c5b6d8be1ac | \n", + "AGNUgVwnZUey3gcPCJ76iw | \n", + "3uLgwr0qeCNMjKenHJwPGQ | \n", + "Avengers time with the ladies. | \n", + "2012-05-18 02:17:21 | \n", + "0 | \n", + "
| 1 | \n", + "66ea489ae59c7c5b6d8be1ad | \n", + "NBN4MgHP9D3cw--SnauTkA | \n", + "QoezRbYQncpRqyrLH6Iqjg | \n", + "They have lots of good deserts and tasty cuban... | \n", + "2013-02-05 18:35:10 | \n", + "0 | \n", + "
| 2 | \n", + "66ea489ae59c7c5b6d8be1ae | \n", + "-copOvldyKh1qr-vzkDEvw | \n", + "MYoRNLb5chwjQe3c_k37Gg | \n", + "It's open even when you think it isn't | \n", + "2013-08-18 00:56:08 | \n", + "0 | \n", + "
| 3 | \n", + "66ea489ae59c7c5b6d8be1af | \n", + "FjMQVZjSqY8syIO-53KFKw | \n", + "hV-bABTK-glh5wj31ps_Jw | \n", + "Very decent fried chicken | \n", + "2017-06-27 23:05:38 | \n", + "0 | \n", + "
| 4 | \n", + "66ea489ae59c7c5b6d8be1b0 | \n", + "ld0AperBXk1h6UbqmM80zw | \n", + "_uN0OudeJ3Zl_tf6nxg5ww | \n", + "Appetizers.. platter special for lunch | \n", + "2012-10-06 19:43:09 | \n", + "0 | \n", + "
| \n", + " | text | \n", + "cleaned_text | \n", + "
|---|---|---|
| 0 | \n", + "Example sentence for preprocessing. | \n", + "example sentence preprocessing | \n", + "
| 1 | \n", + "\n", + " | \n", + " |
| 2 | \n", + "Another example text. | \n", + "another example text | \n", + "
| 3 | \n", + "123.45 | \n", + "\n", + " |
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()