|
41 | 41 | "id": "1991eb8b-1a17-4281-babc-3221525b626b", |
42 | 42 | "metadata": {}, |
43 | 43 | "source": [ |
44 | | - "#### Read csv file without infering schema" |
| 44 | + "#### Read csv file by inferring or guessing the schema"
45 | 45 | ] |
46 | 46 | }, |
47 | 47 | { |
|
1319 | 1319 | "## Connecting to Relational Databases (JDBC)" |
1320 | 1320 | ] |
1321 | 1321 | }, |
| 1322 | + { |
| 1323 | + "cell_type": "markdown", |
| 1324 | + "id": "7aaf3812-5b6b-47dc-826d-58a83ede3810", |
| 1325 | + "metadata": {}, |
| 1326 | + "source": [ |
| 1327 | + "[Link](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/from_to_dbms.html) to their documentation" |
| 1328 | + ] |
| 1329 | + }, |
1322 | 1330 | { |
1323 | 1331 | "cell_type": "markdown", |
1324 | 1332 | "id": "77b439ff-808b-47f1-b64f-dc60e4d0a60a", |
|
1753 | 1761 | "source": [ |
1754 | 1762 | "Unfortunately, the IBM Java JRE does not work with Spark 3.x and thus, we will not be able to connect to mainframe DB2 z/OS platform." |
1755 | 1763 | ] |
| 1764 | + }, |
| 1765 | + { |
| 1766 | + "cell_type": "markdown", |
| 1767 | + "id": "93e3132a-bc44-4ec4-ab60-845423e92e23", |
| 1768 | + "metadata": {}, |
| 1769 | + "source": [ |
| 1770 | + "#### Connecting to Snowflake" |
| 1771 | + ] |
| 1772 | + }, |
| 1773 | + { |
| 1774 | + "cell_type": "markdown", |
| 1775 | + "id": "1cb85857-726f-44b0-96e0-7118fb9addaa", |
| 1776 | + "metadata": {}, |
| 1777 | + "source": [ |
| 1778 | + "[Link](https://docs.snowflake.com/en/user-guide/spark-connector-install.html) to Snowflake's documentation on working with the PySpark connector" |
| 1779 | + ] |
| 1780 | + }, |
| 1781 | + { |
| 1782 | + "cell_type": "markdown", |
| 1783 | + "id": "02ed47d5-db4a-4e64-8fcf-7ead1f22e796", |
| 1784 | + "metadata": {}, |
| 1785 | + "source": [ |
| 1786 | + "We currently have PySpark 3.2.0 installed, but Snowflake does not yet support 3.2.0. Therefore, the code below has not yet been tested and is currently being used as a placeholder."
| 1787 | + ] |
| 1788 | + }, |
| 1789 | + { |
| 1790 | + "cell_type": "code", |
| 1791 | + "execution_count": null, |
| 1792 | + "id": "50d70a07-ce07-4ae1-afc0-464db3f876f7", |
| 1793 | + "metadata": {}, |
| 1794 | + "outputs": [], |
| 1795 | + "source": [ |
| 1796 | + "from pyspark import SparkConf, SparkContext\n", |
| 1797 | + "from pyspark.sql import SparkSession, SQLContext"
| 1798 | + ] |
| 1799 | + }, |
| 1800 | + { |
| 1801 | + "cell_type": "code", |
| 1802 | + "execution_count": null, |
| 1803 | + "id": "b33039ad-a2a4-4f5f-93f8-b352022d1850", |
| 1804 | + "metadata": {}, |
| 1805 | + "outputs": [], |
| 1806 | + "source": [ |
| 1807 | + "spark = SparkSession.builder.master(\"local[*]\").appName(\"Snowflake_JDBC\")\\\n", |
| 1808 | + "        .config(\"spark.jars\", r\"C:\\Path_To_Snowflake_JDBC.jar\")\\\n",
| 1809 | + " .getOrCreate()" |
| 1810 | + ] |
| 1811 | + }, |
| 1812 | + { |
| 1813 | + "cell_type": "code", |
| 1814 | + "execution_count": null, |
| 1815 | + "id": "61c85a64-3281-4eb2-b92a-f7b1430ab961", |
| 1816 | + "metadata": {}, |
| 1817 | + "outputs": [], |
| 1818 | + "source": [ |
| 1819 | + "# Snowflake connection parameters\n", |
| 1820 | + "sfparams = {\n", |
| 1821 | + " \"sfURL\" : \"<account_identifier>.snowflakecomputing.com\",\n", |
| 1822 | + " \"sfUser\" : \"<user_name>\",\n", |
| 1823 | + " \"sfPassword\" : \"<password>\",\n", |
| 1824 | + " \"sfDatabase\" : \"<database>\",\n", |
| 1825 | + " \"sfSchema\" : \"<schema>\",\n", |
| 1826 | + " \"sfWarehouse\" : \"<warehouse>\"\n", |
| 1827 | + "}" |
| 1828 | + ] |
| 1829 | + }, |
| 1830 | + { |
| 1831 | + "cell_type": "code", |
| 1832 | + "execution_count": null, |
| 1833 | + "id": "61fe30c6-c039-4c77-baf1-09e3a5af219c", |
| 1834 | + "metadata": {}, |
| 1835 | + "outputs": [], |
| 1836 | + "source": [ |
| 1837 | + "#read full table\n", |
| 1838 | + "df = spark.read.format(\"snowflake\") \\\n",
| 1839 | + " .options(**sfparams) \\\n", |
| 1840 | + " .option(\"dbtable\", \"Employee\") \\\n", |
| 1841 | + " .load()\n", |
| 1842 | + "\n", |
| 1843 | + "#run custom query\n", |
| 1844 | + "df = spark.read.format(\"snowflake\") \\\n",
| 1845 | + " .options(**sfparams) \\\n", |
| 1846 | + " .option(\"query\", \"SELECT * FROM Employee\") \\\n", |
| 1847 | + " .load()" |
| 1848 | + ] |
1756 | 1849 | } |
1757 | 1850 | ], |
1758 | 1851 | "metadata": { |
1759 | 1852 | "kernelspec": { |
1760 | | - "display_name": "Py3.8 (pyspark_dev)", |
| 1853 | + "display_name": "Py3.9 (pyspark_dev)", |
1761 | 1854 | "language": "python", |
1762 | 1855 | "name": "pyspark_dev" |
1763 | 1856 | }, |
|
1771 | 1864 | "name": "python", |
1772 | 1865 | "nbconvert_exporter": "python", |
1773 | 1866 | "pygments_lexer": "ipython3", |
1774 | | - "version": "3.8.10" |
| 1867 | + "version": "3.9.8" |
1775 | 1868 | } |
1776 | 1869 | }, |
1777 | 1870 | "nbformat": 4, |
|
0 commit comments