# Linux / Debian Installation Instructions

# Create a separate Python environment with PySpark installed
# Spark SQL
pip install pyspark[sql]
# Install the Pandas API on Spark and Plotly
pip install pyspark[pandas_on_spark] plotly
# Optional: create a separate Jupyter kernel for this environment
pip install ipykernel
python -m ipykernel install --user --name your_env --display-name "Py3.9 (pyspark_dev)"
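
Optional quick check (a sketch, assuming the installs above succeeded): run the snippet below with the new environment's python, or inside a notebook using the kernel registered above, to confirm which interpreter is active and which package versions it sees.

import sys
import plotly
import pyspark

print("interpreter:", sys.executable)        # should point inside the new environment
print("pyspark version:", pyspark.__version__)
print("plotly version:", plotly.__version__)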

# Install Java 8, Scala, and Git
sudo apt install openjdk-8-jdk scala git

# Download the prebuilt Spark 3.2.0 package (Hadoop 3.2 build)
wget https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz

# Extract Spark to /opt/spark
sudo mkdir /opt/spark
sudo tar -xf spark*.tgz -C /opt/spark --strip-components 1

# Change permissions on /opt/spark so that Spark can write inside it
sudo chmod -R 777 /opt/spark

# Edit .profile or .bashrc to add the SPARK_HOME environment variable
export SPARK_HOME=/opt/spark

# Add spark/bin and spark/sbin to the PATH environment variable
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

# Then set the PYSPARK_PYTHON environment variable to the Python interpreter that has PySpark installed
export PYSPARK_PYTHON=<path_to_python_env_with_pyspark_python>

# Refresh .profile or .bashrc
source ~/.profile
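
Optional end-to-end check (a minimal sketch, not part of the setup itself; the app name and sample data are arbitrary): open a new shell so the exports take effect, then run the snippet below with the interpreter you pointed PYSPARK_PYTHON at. It confirms the variables are visible, starts a local Spark session, and exercises the Pandas API on Spark with its default Plotly plotting backend.

import os
from pyspark.sql import SparkSession
import pyspark.pandas as ps

print("SPARK_HOME     =", os.environ.get("SPARK_HOME"))
print("PYSPARK_PYTHON =", os.environ.get("PYSPARK_PYTHON"))

# Start a local session and report the version
spark = SparkSession.builder.master("local[*]").appName("install-check").getOrCreate()
print("Spark version:", spark.version)

# Pandas API on Spark reuses the same session; .plot returns a Plotly figure by default
psdf = ps.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})
print(psdf.groupby("group").sum())
fig = psdf["value"].plot.bar()
fig.show()

spark.stop()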


# Windows 10 Installation Instructions
Install Java 8 from the old Sun Microsystems site, not Oracle: https://www.java.com/download/ie_manual.jsp

# Create a separate Python virtual environment containing PySpark, the Pandas API on Spark, and Plotly
pip install pyspark[sql]
pip install pyspark[pandas_on_spark] plotly
# Optional: create a separate Jupyter kernel for this environment
pip install ipykernel
python -m ipykernel install --user --name your_env --display-name "Py3.9 (pyspark_dev)"

# Set the SPARK_HOME and PYSPARK_PYTHON environment variables
set SPARK_HOME=<path_to_site_packages/pyspark_folder>
set PYSPARK_PYTHON=<path_to_python.exe>

# Trick Spark into thinking that Hadoop is installed
Download winutils.exe from https://github.com/cdarlint/winutils, save it locally to a "hadoop/bin" folder, and then
set HADOOP_HOME=[path_to_hadoop_folder]
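
Optional per-session alternative (a sketch; the Windows paths below are placeholders, not real locations): instead of setting machine-wide variables, the same three variables can be set inside Python before the Spark session is created, since the Spark launcher and the JVM it spawns inherit the process environment.

import os

os.environ.setdefault("SPARK_HOME", r"C:\path\to\site-packages\pyspark")  # placeholder path
os.environ.setdefault("PYSPARK_PYTHON", r"C:\path\to\python.exe")         # placeholder path
os.environ.setdefault("HADOOP_HOME", r"C:\path\to\hadoop")                # folder containing bin\winutils.exe

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("win-check").getOrCreate()
print("Spark version:", spark.version)
spark.stop()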