Commit ff063b5
Added instructions on installing PySpark 3.2.0

pyspark/installing_pyspark_320.txt

# Linux / Debian Installation Instructions

# Create a separate python environment with PySpark installed
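# One way to create and activate that environment (a minimal sketch; the path
# ~/envs/pyspark_dev and Python 3.9 are example choices, not requirements):
python3 -m venv ~/envs/pyspark_dev
source ~/envs/pyspark_dev/bin/activate
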
# Spark SQL
pip install pyspark[sql]
# Install Pandas API on Spark and plotly
pip install pyspark[pandas_on_spark] plotly
# Optional if you want to create a separate jupyter kernel
pip install ipykernel
python -m ipykernel install --user --name your_env --display-name "Py3.9 (pyspark_dev)"
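
# Optional quick check that the packages import from this environment
# (this does not start Spark; it only verifies the pip installs above):
python -c "import pyspark, pyspark.pandas, plotly; print(pyspark.__version__)"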

# Install Java 8, scala, and git
sudo apt install openjdk-8-jdk scala git
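# Sanity check: `java -version` should report a 1.8.x (Java 8) runtime; if another
# JDK is picked up first, `sudo update-alternatives --config java` lets you switch
java -version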

# Download the prebuilt Spark 3.2.0 package (pre-built for Hadoop 3.2)
wget https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
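# If the dlcdn.apache.org link has gone stale (Apache moves older releases to its
# archive), the same package should be available from the archive instead:
wget https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz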

# Extract Spark to /opt/spark
sudo mkdir /opt/spark
sudo tar -xf spark*.tgz -C /opt/spark --strip-components 1

# Change permissions on /opt/spark so that Spark can write inside it
sudo chmod -R 777 /opt/spark

# Edit .profile or .bashrc to add environment variables: SPARK_HOME
export SPARK_HOME=/opt/spark

# Add spark/bin and spark/sbin to the PATH environment variable
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

# Then create a PYSPARK_PYTHON environment variable that points to the Python interpreter that has PySpark installed
export PYSPARK_PYTHON=<path_to_python_env_with_pyspark_python>
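# For example, with the venv sketched earlier (the path is an example, adjust to yours):
# export PYSPARK_PYTHON="$HOME/envs/pyspark_dev/bin/python"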

# Refresh .profile or .bashrc
source ~/.profile
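
# Optional smoke test: the version banner should print, and the bundled Pi example
# (shipped under $SPARK_HOME/examples in the binary distribution) should run locally
spark-submit --version
spark-submit $SPARK_HOME/examples/src/main/python/pi.py 10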


# For Windows 10
Install Java 8 from the old Sun Microsystems site (java.com), not Oracle's JDK downloads page: https://www.java.com/download/ie_manual.jsp

# Create a separate Python virtual environment containing PySpark, Pandas API on Spark, and Plotly
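# A minimal sketch of creating and activating the environment before the pip installs
# below (the path C:\envs\pyspark_dev is an example choice):
python -m venv C:\envs\pyspark_dev
C:\envs\pyspark_dev\Scripts\activate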
pip install pyspark[sql]
pip install pyspark[pandas_on_spark] plotly
# Optional if you want to create a separate jupyter kernel
pip install ipykernel
python -m ipykernel install --user --name your_env --display-name "Py3.9 (pyspark_dev)"

# Set SPARK_HOME and PYSPARK_PYTHON environment variables
set SPARK_HOME=<path_to_site_packages/pyspark_folder>
set PYSPARK_PYTHON=<path_to_python.exe>
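# For example, with the env sketched above (paths are example assumptions, adjust to your machine):
set SPARK_HOME=C:\envs\pyspark_dev\Lib\site-packages\pyspark
set PYSPARK_PYTHON=C:\envs\pyspark_dev\Scripts\python.exe
# Use setx instead of set if the variables should persist beyond the current console session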

# Trick Spark into thinking that you have Hadoop installed
Download winutils.exe from https://github.com/cdarlint/winutils, save it locally to a "hadoop/bin" folder, and then
set HADOOP_HOME=[path_to_hadoop_folder]
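# It usually also helps to put the winutils folder on PATH so the native Hadoop binaries are found
set PATH=%PATH%;%HADOOP_HOME%\bin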