diff --git a/.gitignore b/.gitignore index e567857..4a61ab7 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .jupyter_cache jupyter_execute uv.lock +nyc_yellow_taxi_2025-01.parquet diff --git a/content/01_table_dataframe.svg b/content/01_table_dataframe.svg new file mode 100644 index 0000000..9bd1c21 --- /dev/null +++ b/content/01_table_dataframe.svg @@ -0,0 +1,262 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + column + DataFrame + + + row + + + diff --git a/content/benchmarking.md b/content/benchmarking.md new file mode 100644 index 0000000..1dad901 --- /dev/null +++ b/content/benchmarking.md @@ -0,0 +1,80 @@ +# Benchmarking + +:::{questions} + +- What syntax is used to make a lesson? +- How do you structure a lesson effectively for teaching? +- `questions` are at the top of a lesson and provide a starting + point for what you might learn. It is usually a bulleted list. +::: + +:::{objectives} + +- Show a complete lesson page with all of the most common + structures. +- ... + +This is also a holdover from the carpentries-style. It could +usually be left off. +::: + +The introduction should be a high level overview of what is on the +page and why it is interesting. + +The lines below (only in the source) will set the default highlighting +language for the entire page. + +:::{highlight} python +::: + +## Section + +A section. + +:::{discussion} +Discuss the following. + +- A discussion section +- Another discussion topic +::: + +## Section + +``` +print("hello world") +# This uses the default highlighting language +``` + +```python +print("hello world) +``` + +## Exercises: description + +:::{exercise} Exercise Topic-1: imperative description of exercise +Exercise text here. +::: + +:::{solution} +Solution text here +::: + +## Summary + +A Summary of what you learned and why it might be useful. Maybe a +hint of what comes next. 
+ +## See also + +- Other relevant links +- Other link + +:::{keypoints} + +- What the learner should take away +- point 2 +- ... + +This is another holdover from the carpentries style. This perhaps +is better done in a "summary" section. +::: diff --git a/content/conf.py b/content/conf.py index 7c33d39..bd76f25 100644 --- a/content/conf.py +++ b/content/conf.py @@ -14,14 +14,14 @@ # -- Project information ----------------------------------------------------- # FIXME: choose title -project = "Your lesson name" +project = "Python for High Performance Data Analytics" # FIXME: insert correct author -author = "The contributors" +author = "Francesco Fiusco, Qiang Li, Ashwin Mohanan, Juan Triviño, Yonglei Wang" copyright = f"2025, ENCCS, {author}" # FIXME: github organization / user that the repository belongs to github_user = "ENCCS" -github_repo_name = "" # auto-detected from dirname if blank +github_repo_name = "python-for-hpda" # auto-detected from dirname if blank github_version = "main" conf_py_path = "/content/" # with leading and trailing slash diff --git a/content/dask.md b/content/dask.md new file mode 100644 index 0000000..222131e --- /dev/null +++ b/content/dask.md @@ -0,0 +1,93 @@ +# Dask + +:::{questions} +- What syntax is used to make a lesson? +- How do you structure a lesson effectively for teaching? +- `questions` are at the top of a lesson and provide a starting + point for what you might learn. It is usually a bulleted list. +::: + +:::{objectives} +- Show a complete lesson page with all of the most common + structures. +- ... + +This is also a holdover from the carpentries-style. It could +usually be left off. +::: + + + + +The introduction should be a high level overview of what is on the +page and why it is interesting. + + +The lines below (only in the source) will set the default highlighting +language for the entire page. + +:::{highlight} python +::: + + + +## Section + +A section. + +:::{discussion} +Discuss the following. 
+ +- A discussion section +- Another discussion topic +::: + + + +## Section + +``` +print("hello world") +# This uses the default highlighting language +``` + +```python +print("hello world) +``` + + + +## Exercises: description + +:::{exercise} Exercise Topic-1: imperative description of exercise +Exercise text here. +::: + +:::{solution} +Solution text here +::: + + + +## Summary + +A Summary of what you learned and why it might be useful. Maybe a +hint of what comes next. + + + +## See also + +- Other relevant links +- Other link + + + +:::{keypoints} +- What the learner should take away +- point 2 +- ... + +This is another holdover from the carpentries style. This perhaps +is better done in a "summary" section. +::: diff --git a/content/groupby.png b/content/groupby.png new file mode 100644 index 0000000..4786855 Binary files /dev/null and b/content/groupby.png differ diff --git a/content/index.md b/content/index.md index 2e8332c..7e6dd4e 100644 --- a/content/index.md +++ b/content/index.md @@ -1,12 +1,11 @@ -# LESSON NAME - -Intro +# Python for High Performance Data Analytics :::{prereq} -- FIXME -- ... -- ... +- Basic proficiency in Python (variables, flow control, functions) +- Basic grasp of descriptive statistics (such as minimum, maximum, median, arithmetic mean...) +- Basic knowledge of NumPy +- Basic knowledge of some plotting package (Matplotlib, Seaborn, Holoviz...) ::: ```{csv-table} @@ -20,7 +19,13 @@ Intro :caption: The lesson :maxdepth: 1 -episode.md +tabular-data +interfacing-with-storage +visualisation +benchmarking +multithreading +dask + ``` ```{toctree} @@ -31,25 +36,59 @@ quick-reference guide ``` -## Learning outcomes +## What to expect from this course + +:::{discussion} + +How large are the datasets you are working with? -FIXME +::: + +Both for classical machine/deep learning and (generative) AI, the amount of +data needed to train ever-growing models is becoming bigger and bigger. 
+Moreover, great strides in both hardware and software development for high +performance computing (HPC) applications allow for large scale computations +that were not possible before. +This course focuses on high performance data analytics (HPDA). The data +can come from simulations or experiments (or just generally available +datasets), and the goal is to pre-process, analyse and visualise it. +The lesson introduces some of the modern Python stack for data analytics, +dealing with packages such as Pandas, Polars, multithreading +and Dask, as well as Streamlit for large-scale data visualisations. -This material is for ... +## Learning outcomes -By the end of this module, learners should: +This lesson provides a broad overview of methods to work with large datasets +using tools and libraries from the Python ecosystem. Since this field is +fairly extensive, we will try to expose just enough details on each topic +for you to get a good idea of the picture and an understanding of what +combination of tools and libraries will work well for your particular use +case. -- ... -- ... +Specifically, this lesson covers: + +- Tools for efficiently storing data and writing/reading it to/from disk +- Interfacing with databases and object storage solutions +- Main libraries to work with arrays and tabular data +- Performance monitoring and benchmarking +- Workload parallelisation: threads and Dask ## See also :::{admonition} Credit :class: warning -FIXME +Don't forget to check out additional course materials from the +[Data carpentry](https://datacarpentry.org/lessons/), such as: + +- [Data Analysis and Visualization in Python for Ecologists](https://datacarpentry.github.io/python-ecology-lesson/) +- [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/version/1.3/user_guide/10min.html#minutes-to-pandas) +- [Modern Pandas (blog series by Tom Augspurger)](https://tomaugspurger.net/posts/modern-1-intro/) -Don't forget to check out additional course materials from ... 
+Moreover, the Polars [documentation](https://docs.pola.rs/) and +[Awesome data science with Python](https://github.com/r0f1/datascience) +are valuable resources, as well as +[PythonSpeed](https://pythonspeed.com/datascience/#pandas). ::: diff --git a/content/interfacing-with-storage.md b/content/interfacing-with-storage.md new file mode 100644 index 0000000..5031f59 --- /dev/null +++ b/content/interfacing-with-storage.md @@ -0,0 +1,80 @@ +# Storage & serialisation backends + +:::{questions} + +- What syntax is used to make a lesson? +- How do you structure a lesson effectively for teaching? +- `questions` are at the top of a lesson and provide a starting + point for what you might learn. It is usually a bulleted list. +::: + +:::{objectives} + +- Show a complete lesson page with all of the most common + structures. +- ... + +This is also a holdover from the carpentries-style. It could +usually be left off. +::: + +The introduction should be a high level overview of what is on the +page and why it is interesting. + +The lines below (only in the source) will set the default highlighting +language for the entire page. + +:::{highlight} python +::: + +## Section + +A section. + +:::{discussion} +Discuss the following. + +- A discussion section +- Another discussion topic +::: + +## Section + +``` +print("hello world") +# This uses the default highlighting language +``` + +```python +print("hello world) +``` + +## Exercises: description + +:::{exercise} Exercise Topic-1: imperative description of exercise +Exercise text here. +::: + +:::{solution} +Solution text here +::: + +## Summary + +A Summary of what you learned and why it might be useful. Maybe a +hint of what comes next. + +## See also + +- Other relevant links +- Other link + +:::{keypoints} + +- What the learner should take away +- point 2 +- ... + +This is another holdover from the carpentries style. This perhaps +is better done in a "summary" section. 
+::: diff --git a/content/multithreading.md b/content/multithreading.md new file mode 100644 index 0000000..b835806 --- /dev/null +++ b/content/multithreading.md @@ -0,0 +1,80 @@ +# Multithreading + +:::{questions} + +- What syntax is used to make a lesson? +- How do you structure a lesson effectively for teaching? +- `questions` are at the top of a lesson and provide a starting + point for what you might learn. It is usually a bulleted list. +::: + +:::{objectives} + +- Show a complete lesson page with all of the most common + structures. +- ... + +This is also a holdover from the carpentries-style. It could +usually be left off. +::: + +The introduction should be a high level overview of what is on the +page and why it is interesting. + +The lines below (only in the source) will set the default highlighting +language for the entire page. + +:::{highlight} python +::: + +## Section + +A section. + +:::{discussion} +Discuss the following. + +- A discussion section +- Another discussion topic +::: + +## Section + +``` +print("hello world") +# This uses the default highlighting language +``` + +```python +print("hello world) +``` + +## Exercises: description + +:::{exercise} Exercise Topic-1: imperative description of exercise +Exercise text here. +::: + +:::{solution} +Solution text here +::: + +## Summary + +A Summary of what you learned and why it might be useful. Maybe a +hint of what comes next. + +## See also + +- Other relevant links +- Other link + +:::{keypoints} + +- What the learner should take away +- point 2 +- ... + +This is another holdover from the carpentries style. This perhaps +is better done in a "summary" section. +::: diff --git a/content/tabular-data.md b/content/tabular-data.md new file mode 100644 index 0000000..f81be14 --- /dev/null +++ b/content/tabular-data.md @@ -0,0 +1,999 @@ +# Tabular data (aka Dataframes) + +:::{questions} + +- What are series and dataframes? +- What do we mean by tidy and untidy data? 
+- What packages are available in Python to handle dataframes?
+
+:::
+
+:::{objectives}
+
+- Learn how to manipulate dataframes in Pandas
+- Lazy and eager dataframes in Polars
+
+:::
+
+:::{highlight} python
+:::
+
+This episode will give an introduction to the concepts of *Series* and *DataFrame*
+and how they can be manipulated using different Python packages.
+
+## Series and dataframes
+
+A collection of observations (e.g. a time series or simply a set of observations
+of a feature of a phenomenon) can be represented by a homogeneous vector, i.e.
+an array where all the elements are of the same type. This is known as a *Series*
+in many frameworks. Several series (of different types) can be used as columns of
+a tabular structure called a *DataFrame*, as depicted in the figure below.
+
+![Structure of dataframe](01_table_dataframe.svg)
+
+### Tidy vs untidy dataframes
+
+Let us look at the following two dataframes:
+
+::::{tabs}
+:::{group-tab} Untidy format
+
+```{csv-table}
+:delim: ;
+# ; Runner ; 400 ; 800 ; 1200 ; 1500
+0 ; Runner 1 ; 64 ; 128 ; 192 ; 240
+1 ; Runner 2 ; 80 ; 160 ; 240 ; 300
+2 ; Runner 3 ; 96 ; 192 ; 288 ; 360
+```
+
+:::
+
+:::{group-tab} Tidy format
+
+```{csv-table}
+:delim: ,
+
+#, Runner, distance, time
+0, Runner 1, 400, 64
+1, Runner 2, 400, 80
+2, Runner 3, 400, 96
+3, Runner 1, 800, 128
+4, Runner 2, 800, 160
+5, Runner 3, 800, 192
+6, Runner 1, 1200, 192
+7, Runner 2, 1200, 240
+8, Runner 3, 1200, 288
+9, Runner 1, 1500, 240
+10, Runner 2, 1500, 300
+11, Runner 3, 1500, 360
+```
+
+:::
+
+::::
+
+Most tabular data is either in a tidy format or an untidy format (some people
+refer to them as the long format and the wide format). The main differences are
+summarised below:
+
+- In untidy (wide) format, each row represents an observation consisting of
+multiple variables and each variable has its own column. This is intuitive and
+easy for us to understand and make comparisons across different variables,
+calculate statistics, etc.
+- In tidy (long) format, i.e. column-oriented format, each row represents only
+one variable of the observation, and can be considered “computer readable”.
+When it comes to data analysis using Pandas, the tidy format is recommended:
+- Each column can be stored as a vector and this not only saves memory but also
+allows for vectorized calculations which are much faster.
+- It’s easier to filter, group, join and aggregate the data.
+
+Imagine, for example, that you would like to compute the speed as
+`speed=distance/time`. The untidy format would make this much clunkier, as:
+
+- The distances are encoded as column names, not as data points (rows)
+- The speed would have to be stored in a new dataframe since it would not
+fit in that data structure.
+
+In comparison, in a tidy dataframe, this computation would be a simple
+operation between two columns.
+
+:::{tip}
+
+Recovering a wide dataframe from a tidy one is commonly referred to as
+*pivoting*. Most dataframe libraries provide a `pivot()` or
+`pivot_table()` function.
+
+:::
+
+## Pandas & Polars
+
+Historically, [Pandas](https://pandas.pydata.org/) has been the go-to package
+to handle dataframes in Python. It is based on NumPy (each column is a NumPy
+vector) and has been the traditional workhorse for tabular data, with a stable
+API and a large ecosystem built around it, including the [Seaborn](https://seaborn.pydata.org/)
+statistical plotting framework.
+More recently, [Polars](https://docs.pola.rs/) was introduced as a more modern
+and faster alternative to handle dataframes. It is written in Rust and supports
+out-of-core evaluation out of the box (i.e. it does not need to load the whole
+dataset in memory), lazy evaluation of queries, and automatically uses multiple
+threads. Moreover, experimental GPU support is available through
+[cuDF](https://docs.rapids.ai/api/cudf/stable/).
In the remainder of this +episode, the [NYC taxi](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) +will be used to showcase how datasets can be accessed, summarised and manipulated +in both Pandas and Polars. The dataset can be download in [Parquet](https://parquet.apache.org/) +format from the link above (the file for the month of January was used in this +case). The dataset contains information about taxi trips performed in New York, +such as the ID of the vendor, the total fare, pickup and drop-off time and +location (expressed as an ID), type of payment, whether additional fees were +charged and more. + +### Opening a dataset + +Assuming the file is called `yellow_tripdata_2025-01.parquet`, the dataset can be +opened as: + +::::{tabs} + +:::{group-tab} Pandas + +```python +import pandas as pd +df = pd.read_parquet("yellow_tripdata_2025-01.parquet") +``` + +::: + +:::{group-tab} Polars + +```python +import polars as pl +df = pl.read_parquet("yellow_tripdata_2025-01.parquet") +``` + +::: +:::: + +### Description and summarisation + +We can get a first understanding of the contents of a dataframe by printing +the first few lines, the "schema" (i.e. the number and type of each column) +and summary statistics as follows: + +:::::{tabs} + +::::{group-tab} Pandas + +```python +df.head() +``` + +:::{exercise} Output +:class: dropdown + +``` + VendorID tpep_pickup_datetime tpep_dropoff_datetime ... congestion_surcharge Airport_fee cbd_congestion_fee +0 1 2025-01-01 00:18:38 2025-01-01 00:26:59 ... 2.5 0.0 0.0 +1 1 2025-01-01 00:32:40 2025-01-01 00:35:13 ... 2.5 0.0 0.0 +2 1 2025-01-01 00:44:04 2025-01-01 00:46:01 ... 2.5 0.0 0.0 +3 2 2025-01-01 00:14:27 2025-01-01 00:20:01 ... 0.0 0.0 0.0 +4 2 2025-01-01 00:21:34 2025-01-01 00:25:06 ... 
0.0 0.0 0.0 +``` + +::: + +```python +df.info() +``` + +:::{exercise} Output +:class: dropdown + +``` +RangeIndex: 3475226 entries, 0 to 3475225 +Data columns (total 20 columns): + # Column Dtype +--- ------ ----- + 0 VendorID int32 + 1 tpep_pickup_datetime datetime64[us] + 2 tpep_dropoff_datetime datetime64[us] + 3 passenger_count float64 + 4 trip_distance float64 + 5 RatecodeID float64 + 6 store_and_fwd_flag object + 7 PULocationID int32 + 8 DOLocationID int32 + 9 payment_type int64 + 10 fare_amount float64 + 11 extra float64 + 12 mta_tax float64 + 13 tip_amount float64 + 14 tolls_amount float64 + 15 improvement_surcharge float64 + 16 total_amount float64 + 17 congestion_surcharge float64 + 18 Airport_fee float64 + 19 cbd_congestion_fee float64 +dtypes: datetime64[us](2), float64(13), int32(3), int64(1), object(1) +memory usage: 490.5+ MB +``` + +::: + +```python +df.describe() +``` + +:::{exercise} Output +:class: dropdown + +``` + VendorID tpep_pickup_datetime tpep_dropoff_datetime ... congestion_surcharge Airport_fee cbd_congestion_fee +count 3.475226e+06 3475226 3475226 ... 2.935077e+06 2.935077e+06 3.475226e+06 +mean 1.785428e+00 2025-01-17 11:02:55.910964 2025-01-17 11:17:56.997901 ... 2.225237e+00 1.239111e-01 4.834093e-01 +min 1.000000e+00 2024-12-31 20:47:55 2024-12-18 07:52:40 ... -2.500000e+00 -1.750000e+00 -7.500000e-01 +25% 2.000000e+00 2025-01-10 07:59:01 2025-01-10 08:15:29.500000 ... 2.500000e+00 0.000000e+00 0.000000e+00 +50% 2.000000e+00 2025-01-17 15:41:33 2025-01-17 15:59:34 ... 2.500000e+00 0.000000e+00 7.500000e-01 +75% 2.000000e+00 2025-01-24 19:34:06 2025-01-24 19:48:31 ... 2.500000e+00 0.000000e+00 7.500000e-01 +max 7.000000e+00 2025-02-01 00:00:44 2025-02-01 23:44:11 ... 2.500000e+00 6.750000e+00 7.500000e-01 +std 4.263282e-01 NaN NaN ... 
9.039932e-01 4.725090e-01 3.619307e-01 + +[8 rows x 19 columns] +``` + +::: +:::: + +::::{group-tab} Polars + +```python +df.head() +``` + +:::{exercise} Output +:class: dropdown + +```python +shape: (5, 20) +┌──────────┬──────────────────┬──────────────────┬─────────────────┬───┬──────────────┬──────────────────┬─────────────┬─────────────────┐ +│ VendorID ┆ tpep_pickup_date ┆ tpep_dropoff_dat ┆ passenger_count ┆ … ┆ total_amount ┆ congestion_surch ┆ Airport_fee ┆ cbd_congestion_ │ +│ --- ┆ time ┆ etime ┆ --- ┆ ┆ --- ┆ arge ┆ --- ┆ fee │ +│ i32 ┆ --- ┆ --- ┆ i64 ┆ ┆ f64 ┆ --- ┆ f64 ┆ --- │ +│ ┆ datetime[μs] ┆ datetime[μs] ┆ ┆ ┆ ┆ f64 ┆ ┆ f64 │ +╞══════════╪══════════════════╪══════════════════╪═════════════════╪═══╪══════════════╪══════════════════╪═════════════╪═════════════════╡ +│ 1 ┆ 2025-01-01 ┆ 2025-01-01 ┆ 1 ┆ … ┆ 18.0 ┆ 2.5 ┆ 0.0 ┆ 0.0 │ +│ ┆ 00:18:38 ┆ 00:26:59 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 1 ┆ 2025-01-01 ┆ 2025-01-01 ┆ 1 ┆ … ┆ 12.12 ┆ 2.5 ┆ 0.0 ┆ 0.0 │ +│ ┆ 00:32:40 ┆ 00:35:13 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 1 ┆ 2025-01-01 ┆ 2025-01-01 ┆ 1 ┆ … ┆ 12.1 ┆ 2.5 ┆ 0.0 ┆ 0.0 │ +│ ┆ 00:44:04 ┆ 00:46:01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01-01 ┆ 2025-01-01 ┆ 3 ┆ … ┆ 9.7 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ +│ ┆ 00:14:27 ┆ 00:20:01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01-01 ┆ 2025-01-01 ┆ 3 ┆ … ┆ 8.3 ┆ 0.0 ┆ 0.0 ┆ 0.0 │ +│ ┆ 00:21:34 ┆ 00:25:06 ┆ ┆ ┆ ┆ ┆ ┆ │ +└──────────┴──────────────────┴──────────────────┴─────────────────┴───┴──────────────┴──────────────────┴─────────────┴─────────────────┘ +``` + +::: + +```python +df.describe() +``` + +:::{exercise} Output +:class: dropdown + +``` +shape: (9, 21) +┌────────────┬────────────┬──────────────────┬─────────────────┬───┬──────────────┬─────────────────┬─────────────┬─────────────────┐ +│ statistic ┆ VendorID ┆ tpep_pickup_date ┆ tpep_dropoff_da ┆ … ┆ total_amount ┆ congestion_surc ┆ Airport_fee ┆ cbd_congestion_ │ +│ --- ┆ --- ┆ time ┆ tetime ┆ ┆ --- ┆ harge ┆ --- ┆ fee │ +│ str ┆ f64 ┆ --- ┆ --- ┆ ┆ f64 ┆ --- ┆ f64 ┆ --- │ +│ ┆ ┆ str ┆ str ┆ ┆ ┆ f64 ┆ ┆ f64 │ 
+╞════════════╪════════════╪══════════════════╪═════════════════╪═══╪══════════════╪═════════════════╪═════════════╪═════════════════╡ +│ count ┆ 3.475226e6 ┆ 3475226 ┆ 3475226 ┆ … ┆ 3.475226e6 ┆ 2.935077e6 ┆ 2.935077e6 ┆ 3.475226e6 │ +│ null_count ┆ 0.0 ┆ 0 ┆ 0 ┆ … ┆ 0.0 ┆ 540149.0 ┆ 540149.0 ┆ 0.0 │ +│ mean ┆ 1.785428 ┆ 2025-01-17 ┆ 2025-01-17 ┆ … ┆ 25.611292 ┆ 2.225237 ┆ 0.123911 ┆ 0.483409 │ +│ ┆ ┆ 11:02:55.910964 ┆ 11:17:56.997901 ┆ ┆ ┆ ┆ ┆ │ +│ std ┆ 0.426328 ┆ null ┆ null ┆ … ┆ 463.658478 ┆ 0.903993 ┆ 0.472509 ┆ 0.361931 │ +│ min ┆ 1.0 ┆ 2024-12-31 ┆ 2024-12-18 ┆ … ┆ -901.0 ┆ -2.5 ┆ -1.75 ┆ -0.75 │ +│ ┆ ┆ 20:47:55 ┆ 07:52:40 ┆ ┆ ┆ ┆ ┆ │ +│ 25% ┆ 2.0 ┆ 2025-01-10 ┆ 2025-01-10 ┆ … ┆ 15.2 ┆ 2.5 ┆ 0.0 ┆ 0.0 │ +│ ┆ ┆ 07:59:01 ┆ 08:15:29 ┆ ┆ ┆ ┆ ┆ │ +│ 50% ┆ 2.0 ┆ 2025-01-17 ┆ 2025-01-17 ┆ … ┆ 19.95 ┆ 2.5 ┆ 0.0 ┆ 0.75 │ +│ ┆ ┆ 15:41:34 ┆ 15:59:34 ┆ ┆ ┆ ┆ ┆ │ +│ 75% ┆ 2.0 ┆ 2025-01-24 ┆ 2025-01-24 ┆ … ┆ 27.78 ┆ 2.5 ┆ 0.0 ┆ 0.75 │ +│ ┆ ┆ 19:34:06 ┆ 19:48:31 ┆ ┆ ┆ ┆ ┆ │ +│ max ┆ 7.0 ┆ 2025-02-01 ┆ 2025-02-01 ┆ … ┆ 863380.37 ┆ 2.5 ┆ 6.75 ┆ 0.75 │ +│ ┆ ┆ 00:00:44 ┆ 23:44:11 ┆ ┆ ┆ ┆ ┆ │ +└────────────┴────────────┴──────────────────┴─────────────────┴───┴──────────────┴─────────────────┴─────────────┴─────────────────┘ +``` + +::: + +:::: + +::::: + +### Indexing + +We can index data in the dataframe as follows: + +:::::{tabs} + +::::{group-tab} Pandas + +```python +# With this we can select a column +df['VendorID'] # Could also be df.VendorID +``` + +:::{exercise} Output +:class: dropdown + +``` +0 1 +1 1 +2 1 +3 2 +4 2 + .. 
+3475221 2 +3475222 2 +3475223 2 +3475224 2 +3475225 2 +``` + +::: + +```python +# Get a row +df.iloc[1000,:] +``` + +:::{exercise} Output +:class: dropdown + +``` +VendorID 2 +tpep_pickup_datetime 2025-01-01 00:08:06 +tpep_dropoff_datetime 2025-01-01 00:16:20 +passenger_count 4.0 +trip_distance 1.53 +RatecodeID 1.0 +store_and_fwd_flag N +PULocationID 114 +DOLocationID 90 +payment_type 1 +fare_amount 10.0 +extra 1.0 +mta_tax 0.5 +tip_amount 2.25 +tolls_amount 0.0 +improvement_surcharge 1.0 +total_amount 17.25 +congestion_surcharge 2.5 +Airport_fee 0.0 +cbd_congestion_fee 0.0 +Name: 1000, dtype: object +>>> df.iloc[1000,:] +VendorID 2 +tpep_pickup_datetime 2025-01-01 00:08:06 +tpep_dropoff_datetime 2025-01-01 00:16:20 +passenger_count 4.0 +trip_distance 1.53 +RatecodeID 1.0 +store_and_fwd_flag N +PULocationID 114 +DOLocationID 90 +payment_type 1 +fare_amount 10.0 +extra 1.0 +mta_tax 0.5 +tip_amount 2.25 +tolls_amount 0.0 +improvement_surcharge 1.0 +total_amount 17.25 +congestion_surcharge 2.5 +Airport_fee 0.0 +cbd_congestion_fee 0.0 +``` + +::: + +:::: + +::::{group-tab} Polars + +```python +df["VendorID"] # A more Polars-y idiom is to use df.select(["VendorID"]) +``` + +:::{exercise} Output +:class: dropdown + +``` +shape: (3_475_226,) +Series: 'VendorID' [i32] +[ + 1 + 1 + 1 + 2 + 2 + … + 2 + 2 + 2 + 2 + 2 +] +``` + +::: + +```python +df[1000][:] +``` + +:::{exercise} Output +:class: dropdown + +``` +┌──────────┬─────────────┬─────────────┬────────────┬───┬────────────┬────────────┬────────────┬────────────┐ +│ VendorID ┆ tpep_pickup ┆ tpep_dropof ┆ passenger_ ┆ … ┆ total_amou ┆ congestion ┆ Airport_fe ┆ cbd_conges │ +│ --- ┆ _datetime ┆ f_datetime ┆ count ┆ ┆ nt ┆ _surcharge ┆ e ┆ tion_fee │ +│ i32 ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ +│ ┆ datetime[μs ┆ datetime[μs ┆ i64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ +│ ┆ ] ┆ ] ┆ ┆ ┆ ┆ ┆ ┆ │ +╞══════════╪═════════════╪═════════════╪════════════╪═══╪════════════╪════════════╪════════════╪════════════╡ +│ 2 ┆ 2025-01-01 ┆ 
2025-01-01 ┆ 4 ┆ … ┆ 17.25 ┆ 2.5 ┆ 0.0 ┆ 0.0 │ +│ ┆ 00:08:06 ┆ 00:16:20 ┆ ┆ ┆ ┆ ┆ ┆ │ +└──────────┴─────────────┴─────────────┴────────────┴───┴────────────┴────────────┴────────────┴────────────┘ +``` + +::: + +:::: +::::: + +In both cases, a similar syntax can be used to do in-place modification (e.g. `df[row][column]=...`). +Please note that this kind of replacement carries a big performance penalty, +which is designed to do column-wide operations with minimal overhead. This is +commonly achieved through the [expression API](https://docs.pola.rs/user-guide/concepts/expressions-and-contexts/), as detailed in the next section. + +### Common workflows + +It is quite common to stratify (i.e. divide the samples into a number of groups +based on categorical variables) to produce descriptive statistics (i.e. +statistics that provide a summary of the samples and do not aim to predict +anything regarding the population it comes from). This is commonly achieved +through a `group-by` workflow, where the following happens: + +- Splitting: data is partitioned into different groups based on some criterion +- Applying: applying a function/performing a calculation to each group +- Combining: assembling a dataframe (of potentially any size) with the results. + +This is type of workflow is represented below. + +:::{figure-md} +![split-apply-combine](groupby.png) + +Source: [Earth and environmental data science](https://earth-env-data-science.github.io/intro.html) +::: + +As an example, let us try to to compute the total fare for each hour, split +by payment type. 
+ +:::::{tabs} + +::::{group-tab} Pandas + +```python +#First let us extract the hour from the tpep_pickup_datetime column +df["hour"] = df['tpep_pickup_datetime'].dt.hour + +hourly_fare = ( + df.groupby(['hour', 'payment_type'], observed=False)['fare_amount'] + .sum() + .reset_index() + .sort_values(['hour', 'payment_type']) +) +``` + +:::{exercise} Output +:class: dropdown + +``` + hour payment_type fare_amount +0 0 0 352227.86 +1 0 1 1088201.12 +2 0 2 156546.07 +3 0 3 3537.91 +4 0 4 3941.24 +.. ... ... ... +116 23 0 534063.76 +117 23 1 1618143.37 +118 23 2 219991.22 +119 23 3 4765.54 +120 23 4 3293.61 +``` + +::: + +The `groupby` statement is used to stratify the `fare_amount` column by +hour and payment type. Then the amounts per hour and type get summed and +sorted according to time and payment type. + +:::: + +::::{group-tab} Polars + +```python +#First, let us extract the hour from the tpep_pickup_datetime column +df = df.with_columns(pl.col('tpep_pickup_datetime').dt.hour().alias('hour')) + +hourly_fare = ( + df.group_by(['hour', 'payment_type']) + .agg(pl.sum('fare_amount').alias('total_fare')) + .sort(['hour', 'payment_type']) +) +``` + +:::{exercise} Output +:class: dropdown + +``` +┌──────┬──────────────┬────────────┐ +│ hour ┆ payment_type ┆ total_fare │ +│ --- ┆ --- ┆ --- │ +│ i8 ┆ i64 ┆ f64 │ +╞══════╪══════════════╪════════════╡ +│ 0 ┆ 0 ┆ 352227.86 │ +│ 0 ┆ 1 ┆ 1.0882e6 │ +│ 0 ┆ 2 ┆ 156546.07 │ +│ 0 ┆ 3 ┆ 3537.91 │ +│ 0 ┆ 4 ┆ 3941.24 │ +│ … ┆ … ┆ … │ +│ 23 ┆ 0 ┆ 534063.76 │ +│ 23 ┆ 1 ┆ 1.6181e6 │ +│ 23 ┆ 2 ┆ 219991.22 │ +│ 23 ┆ 3 ┆ 4765.54 │ +│ 23 ┆ 4 ┆ 3293.61 │ +└──────┴──────────────┴────────────┘ +``` + +::: + +The `group_by` statement is used to stratify by hour and payment type, +followed by an aggregated sum of the `fare_amount` column and a sort. +Notice how the syntax has more of a functional programming flavour to it +(`pl.col`, `pl.sum` as pure functions). This will be clarified further in +the next section. 
Also note that Polars by default spreads the workload
+over multiple threads.
+
+::::
+
+:::::
+
+## Idiomatic Polars
+
+Polars introduces a few variations to dataset operations compared to the
+traditional Pandas approach. In particular, a domain-specific language
+(DSL) was developed, where *expressions* are written to represent dataset
+operations and *contexts* provide the environment where they produce a
+result.
+
+### Expressions
+
+Let's say that we created a `trip_duration_sec` column in our NYC cab database
+and, given the `trip_distance` column, we want to compute the average speed.
+In Polars, this can be achieved with:
+
+```python
+pl.col('trip_distance') / pl.col('trip_duration_sec')
+```
+
+This is a lazy representation of an operation we want to perform, which can
+be further manipulated or just printed. For it to actually produce data, a
+*context* is needed.
+
+### Contexts
+
+The same Polars expression can produce different results depending on the
+context where it is used. Four common contexts include:
+
+- `select`
+- `with_columns`
+- `filter`
+- `group_by`
+
+Both `select` and `with_columns` can produce new columns, which may be
+aggregations, combinations of other columns, or literals. The difference
+between the two is that `select` only includes the columns contained in its
+input expression, whereas `with_columns` returns a new dataframe which
+contains all the columns from the original dataframe and the new ones created
+by the expression.
To exemplify, using our earlier example of computing the +average speed during a trip, using `select` would yield a single column, +whereas `with_columns` would return the original dataframe with an additional +column called `trip distance`: + +```python +df.select(pl.col('trip_distance')/pl.col('trip_duration_sec')*3600) +shape: (3_475_226, 1) +┌───────────────┐ +│ trip_distance │ +│ --- │ +│ f64 │ +╞═══════════════╡ +│ 11.497006 │ +│ 11.764706 │ +│ 18.461538 │ +│ 5.60479 │ +│ 11.207547 │ +│ … │ +│ 13.68899 │ +│ 19.42398 │ +│ 9.879418 │ +│ 9.339901 │ +│ 12.781395 │ +└───────────────┘ +``` + +```python +df.with_columns((pl.col('trip_distance')/pl.col('trip_duration_sec')*3600).alias("avg_sp +eed_mph")) +shape: (3_475_226, 22) +┌──────────┬──────────┬──────────┬──────────┬───┬──────────┬──────────┬──────────┬──────────┐ +│ VendorID ┆ tpep_pic ┆ tpep_dro ┆ passenge ┆ … ┆ Airport_ ┆ cbd_cong ┆ trip_dur ┆ avg_spee │ +│ --- ┆ kup_date ┆ poff_dat ┆ r_count ┆ ┆ fee ┆ estion_f ┆ ation_se ┆ d_mph │ +│ i32 ┆ time ┆ etime ┆ --- ┆ ┆ --- ┆ ee ┆ c ┆ --- │ +│ ┆ --- ┆ --- ┆ i64 ┆ ┆ f64 ┆ --- ┆ --- ┆ f64 │ +│ ┆ datetime ┆ datetime ┆ ┆ ┆ ┆ f64 ┆ i64 ┆ │ +│ ┆ [μs] ┆ [μs] ┆ ┆ ┆ ┆ ┆ ┆ │ +╞══════════╪══════════╪══════════╪══════════╪═══╪══════════╪══════════╪══════════╪══════════╡ +│ 1 ┆ 2025-01- ┆ 2025-01- ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 501 ┆ 11.49700 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ 6 │ +│ ┆ 00:18:38 ┆ 00:26:59 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 1 ┆ 2025-01- ┆ 2025-01- ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 153 ┆ 11.76470 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ 6 │ +│ ┆ 00:32:40 ┆ 00:35:13 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 1 ┆ 2025-01- ┆ 2025-01- ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 117 ┆ 18.46153 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ 8 │ +│ ┆ 00:44:04 ┆ 00:46:01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01- ┆ 2025-01- ┆ 3 ┆ … ┆ 0.0 ┆ 0.0 ┆ 334 ┆ 5.60479 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 00:14:27 ┆ 00:20:01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01- ┆ 2025-01- ┆ 3 ┆ … ┆ 0.0 ┆ 0.0 ┆ 212 ┆ 11.20754 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ 7 │ +│ ┆ 00:21:34 ┆ 00:25:06 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ +│ 2 ┆ 
2025-01- ┆ 2025-01- ┆ null ┆ … ┆ null ┆ 0.75 ┆ 881 ┆ 13.68899 │ +│ ┆ 31 ┆ 31 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 23:01:48 ┆ 23:16:29 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01- ┆ 2025-02- ┆ null ┆ … ┆ null ┆ 0.75 ┆ 1618 ┆ 19.42398 │ +│ ┆ 31 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 23:50:29 ┆ 00:17:27 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01- ┆ 2025-01- ┆ null ┆ … ┆ null ┆ 0.75 ┆ 962 ┆ 9.879418 │ +│ ┆ 31 ┆ 31 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 23:26:59 ┆ 23:43:01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01- ┆ 2025-01- ┆ null ┆ … ┆ null ┆ 0.75 ┆ 1218 ┆ 9.339901 │ +│ ┆ 31 ┆ 31 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 23:14:34 ┆ 23:34:52 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01- ┆ 2025-02- ┆ null ┆ … ┆ null ┆ 0.0 ┆ 645 ┆ 12.78139 │ +│ ┆ 31 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ 5 │ +│ ┆ 23:56:42 ┆ 00:07:27 ┆ ┆ ┆ ┆ ┆ ┆ │ +└──────────┴──────────┴──────────┴──────────┴───┴──────────┴──────────┴──────────┴──────────┘ +``` + +The `filter` context filters the rows of a dataframe based on one (or more) +expressions which evaluate to a Boolean, e.g. + +```python +df.filter(pl.col('avg_speed_mph') < 1) +shape: (104_410, 22) +┌──────────┬──────────┬──────────┬──────────┬───┬──────────┬──────────┬──────────┬──────────┐ +│ VendorID ┆ tpep_pic ┆ tpep_dro ┆ passenge ┆ … ┆ Airport_ ┆ cbd_cong ┆ trip_dur ┆ avg_spee │ +│ --- ┆ kup_date ┆ poff_dat ┆ r_count ┆ ┆ fee ┆ estion_f ┆ ation_se ┆ d_mph │ +│ i32 ┆ time ┆ etime ┆ --- ┆ ┆ --- ┆ ee ┆ c ┆ --- │ +│ ┆ --- ┆ --- ┆ i64 ┆ ┆ f64 ┆ --- ┆ --- ┆ f64 │ +│ ┆ datetime ┆ datetime ┆ ┆ ┆ ┆ f64 ┆ i64 ┆ │ +│ ┆ [μs] ┆ [μs] ┆ ┆ ┆ ┆ ┆ ┆ │ +╞══════════╪══════════╪══════════╪══════════╪═══╪══════════╪══════════╪══════════╪══════════╡ +│ 2 ┆ 2025-01- ┆ 2025-01- ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 10 ┆ 0.0 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 00:37:43 ┆ 00:37:53 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01- ┆ 2025-01- ┆ 3 ┆ … ┆ 0.0 ┆ 0.0 ┆ 8 ┆ 0.0 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 00:57:08 ┆ 00:57:16 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 1 ┆ 2025-01- ┆ 2025-01- ┆ 1 ┆ … ┆ 0.0 ┆ 0.0 ┆ 1910 ┆ 0.0 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 00:27:40 ┆ 00:59:30 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01- ┆ 2025-01- ┆ 4 ┆ … ┆ 0.0 ┆ 0.0 ┆ 5 ┆ 0.0 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 
00:56:49 ┆ 00:56:54 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 1 ┆ 2025-01- ┆ 2025-01- ┆ 0 ┆ … ┆ 0.0 ┆ 0.0 ┆ 2 ┆ 0.0 │ +│ ┆ 01 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 00:42:42 ┆ 00:42:44 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ +│ 1 ┆ 2025-01- ┆ 2025-02- ┆ null ┆ … ┆ null ┆ 0.75 ┆ 266 ┆ 0.0 │ +│ ┆ 31 ┆ 01 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 23:59:17 ┆ 00:03:43 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 1 ┆ 2025-01- ┆ 2025-01- ┆ null ┆ … ┆ null ┆ 0.75 ┆ 1100 ┆ 0.0 │ +│ ┆ 31 ┆ 31 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 23:17:38 ┆ 23:35:58 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 2 ┆ 2025-01- ┆ 2025-01- ┆ null ┆ … ┆ null ┆ 0.0 ┆ 161 ┆ 0.0 │ +│ ┆ 31 ┆ 31 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 23:39:25 ┆ 23:42:06 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 1 ┆ 2025-01- ┆ 2025-01- ┆ null ┆ … ┆ null ┆ 0.75 ┆ 24 ┆ 0.0 │ +│ ┆ 31 ┆ 31 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 23:30:42 ┆ 23:31:06 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ 1 ┆ 2025-01- ┆ 2025-01- ┆ null ┆ … ┆ null ┆ 0.75 ┆ 1556 ┆ 0.0 │ +│ ┆ 31 ┆ 31 ┆ ┆ ┆ ┆ ┆ ┆ │ +│ ┆ 23:10:25 ┆ 23:36:21 ┆ ┆ ┆ ┆ ┆ ┆ │ +└──────────┴──────────┴──────────┴──────────┴───┴──────────┴──────────┴──────────┴──────────┘ +``` + +The `group_by` context behaves like its Pandas counterpart. + +### Transformations + +A `join` operation combines columns from one or more dataframes into a new +dataframe. There are different joining strategies, which influence how columns +are combined and what rows are included in the final set. A common type is the +*equi* join, where rows are matched by a key expression. Let us clarify this +with an example. The `df` dataframe does not include specific coordinates for +each pickup and drop-off, rather only a `PULocationID` and a `DOLocationID`. 
+
+There is a `taxi_zones_xy.csv` file that contains, for each `LocationID`, the
+longitude (X) and latitude (Y) of each location, as well as the name of zone
+and borough:
+
+```python
+
+lookup_df = pl.read_csv('taxi_zones_xy.csv', has_header=True)
+lookup_df.head()
+┌────────────┬────────────┬───────────┬─────────────────────────┬───────────────┐
+│ LocationID ┆ X          ┆ Y         ┆ zone                    ┆ borough       │
+│ ---        ┆ ---        ┆ ---       ┆ ---                     ┆ ---           │
+│ i64        ┆ f64        ┆ f64       ┆ str                     ┆ str           │
+╞════════════╪════════════╪═══════════╪═════════════════════════╪═══════════════╡
+│ 1          ┆ -74.176786 ┆ 40.689516 ┆ Newark Airport          ┆ EWR           │
+│ 2          ┆ -73.826126 ┆ 40.625724 ┆ Jamaica Bay             ┆ Queens        │
+│ 3          ┆ -73.849479 ┆ 40.865888 ┆ Allerton/Pelham Gardens ┆ Bronx         │
+│ 4          ┆ -73.977023 ┆ 40.724152 ┆ Alphabet City           ┆ Manhattan     │
+│ 5          ┆ -74.18993  ┆ 40.55034  ┆ Arden Heights           ┆ Staten Island │
+└────────────┴────────────┴───────────┴─────────────────────────┴───────────────┘
+```
+
+This can be used to append these columns to the original df to have some form
+of geographical data as follows (e.g. for the `PULocationID`):
+
+```python
+df = df.join(lookup_df, left_on='PULocationID', right_on='LocationID', how='left'
+, suffix='_pickup')
+```
+
+In the line above, `left_on` is used to indicate the *key* in the original
+dataframe, `right_on` is used to specify the *key* in the `lookup_df` dataframe,
+`how='left'` means that the columns from the second dataframe will be added to
+the first (and not the other way around) and `suffix` is what will be added to
+the names of the joined columns (i.e., df will contain columns called `X_pickup`,
+`Y_pickup`, `zone_pickup` and `borough_pickup`). More information on join
+operations can be found [here](https://docs.pola.rs/user-guide/transformations/joins/).
+
+## Exercises
+
+::::{exercise} Joining geographical data
+We have already seen how to add actual latitude and longitude for the pickups.
+Now do the same for the drop-offs! 
+
+
+:::{solution}
+
+```python
+df = df.join(lookup_df, left_on='DOLocationID', right_on='LocationID', how='left'
+, suffix='_dropoff')
+```
+
+:::
+
+::::
+
+::::{exercise} Feature engineering: enriching the dataset
+We want to understand a bit more of the traffic in the city by creating
+new features (i.e. columns), in particular:
+
+- Split the pickup datetime into hour, minute, day of the week and month
+to identify daily, weekly and monthly trends
+- Compute the average speed as an indicator of congestion (low speed ->
+traffic jam)
+- Stratify the trip distance and fare by zone to identify how expensive
+different zones are.
+
+Below is a skeleton of the code, where some lines have been blanked out
+for you to fill (marked with `TODO:...`)
+
+```python
+import polars as pl
+raw_df = pl.read_parquet('yellow_tripdata_2025-01.parquet')
+df = raw_df.with_columns([
+    pl.col("tpep_pickup_datetime").dt.hour().alias("pickup_hour"),
+    #TODO: do this for the minute
+    pl.col("tpep_pickup_datetime").dt.weekday().alias("pickup_dow"), # Mon=1 … Sun=7
+    pl.col("tpep_pickup_datetime").dt.month().alias("pickup_month"),
+    # Trip duration in seconds
+    (pl.col("tpep_dropoff_datetime") - pl.col("tpep_pickup_datetime"))
+    .dt.total_seconds()
+    .alias("trip_duration_sec"),
+])
+
+df = df.with_columns(
+    #TODO: add expression for average velocity here
+    .fill_nan(None) # protect against div‑by‑zero
+    .alias("avg_speed_mph")
+)
+
+# Compute per‑pickup‑zone statistics once
+zone_stats = (
+    df.group_by("PULocationID")
+    .agg([
+        pl.mean("fare_amount").alias("zone_avg_fare"),
+        #TODO: do the same for the trip distance here
+        pl.count().alias("zone_trip_cnt"),
+    ])
+    .rename({"PULocationID": "pickup_zone_id"}) # avoid name clash later
+)
+
+# Join those stats back onto the original rows
+df = df.join(zone_stats, left_on="PULocationID", right_on="pickup_zone_id", how="left")
+```
+
+As we saw in the *Transformations* section above, the main role of `join` here
+is to "spread" the `zone_stats` 
over all the rides in the original dataframe
+(i.e. write the `zone_avg_fare` on each ride in `df`). `join` has its roots
+in relational databases, where different tables can be merged based on a
+common column.
+
+:::{solution}
+
+```python
+import polars as pl
+raw_df = pl.read_parquet('yellow_tripdata_2025-01.parquet')
+df = raw_df.with_columns([
+    pl.col("tpep_pickup_datetime").dt.hour().alias("pickup_hour"),
+    pl.col("tpep_pickup_datetime").dt.minute().alias("pickup_minute"),
+    pl.col("tpep_pickup_datetime").dt.weekday().alias("pickup_dow"), # Mon=1 … Sun=7
+    pl.col("tpep_pickup_datetime").dt.month().alias("pickup_month"),
+    # Trip duration in seconds
+    (pl.col("tpep_dropoff_datetime") - pl.col("tpep_pickup_datetime"))
+    .dt.total_seconds()
+    .alias("trip_duration_sec"),
+])
+
+df = df.with_columns(
+    (
+        pl.col("trip_distance") /
+        (pl.col("trip_duration_sec") / 3600) # seconds → hours
+    )
+    .fill_nan(None) # protect against div‑by‑zero
+    .alias("avg_speed_mph")
+)
+
+# Compute per‑pickup‑zone statistics once
+zone_stats = (
+    df.group_by("PULocationID")
+    .agg([
+        pl.mean("fare_amount").alias("zone_avg_fare"),
+        pl.mean("trip_distance").alias("zone_avg_dist"),
+        pl.count().alias("zone_trip_cnt"),
+    ])
+    .rename({"PULocationID": "pickup_zone_id"}) # avoid name clash later
+)
+
+# Join those stats back onto the original rows
+df = df.join(zone_stats, left_on="PULocationID", right_on="pickup_zone_id", how="left")
+```
+
+:::
+::::
+
+::::{exercise} More feature engineering!
+Similarly to the exercise above, define the following features in the data:
+
+- `pickup_hour` extracted from `tpep_pickup_datetime`
+- `is_weekend`, a Boolean value for each trip
+- `avg_speed_mph`, exactly as before
+- `tip_to_fare_ratio`, dividing the tip amount by the total fare. 
Be careful
+with division by 0
+- `fare_per_mile`, dividing the total fare by the distance
+- `dist_per_passenger`, the distance travelled per passenger on each trip
+(trip distance divided by the number of passengers)
+- `speed_per_pickup_area`, the average velocity stratified by pickup location
+- `dropoff_trip_count`, count of trips stratified per dropoff location
+
+:::{solution}
+
+```python
+import polars as pl
+raw_df = pl.read_parquet("yellow_tripdata_2025-01.parquet")
+df = raw_df.with_columns([
+    # 1. pickup_hour
+    pl.col("tpep_pickup_datetime").dt.hour().alias("pickup_hour"),
+
+    # 2. is_weekend (ISO weekday numbering: Sat=6, Sun=7)
+    pl.col("tpep_pickup_datetime")
+    .dt.weekday()
+    .is_in([6, 7])
+    .alias("is_weekend"),
+
+    # 3. trip_duration_sec
+    (pl.col("tpep_dropoff_datetime") - pl.col("tpep_pickup_datetime"))
+    .dt.total_seconds()
+    .alias("trip_duration_sec"),
+])
+
+# The remaining features reuse trip_duration_sec, so they need a second
+# with_columns call: expressions within a single call run in parallel and
+# cannot see each other's output columns.
+df = df.with_columns([
+    # 4. avg_speed_mph (null when the duration is 0)
+    pl.when(pl.col("trip_duration_sec") > 0)
+    .then(pl.col("trip_distance") / (pl.col("trip_duration_sec") / 3600))
+    .otherwise(None)
+    .alias("avg_speed_mph"),
+
+    # 5. tip_to_fare_ratio (null when the fare is 0)
+    pl.when(pl.col("fare_amount") != 0)
+    .then(pl.col("tip_amount") / pl.col("fare_amount"))
+    .otherwise(None)
+    .alias("tip_to_fare_ratio"),
+
+    # 6. fare_per_mile (null when the distance is 0)
+    pl.when(pl.col("trip_distance") != 0)
+    .then(pl.col("fare_amount") / pl.col("trip_distance"))
+    .otherwise(None)
+    .alias("fare_per_mile"),
+
+    # 7. 
dist_per_passenger (null when passenger_count is 0)
+    pl.when(pl.col("passenger_count") > 0)
+    .then(pl.col("trip_distance") / pl.col("passenger_count"))
+    .otherwise(None)
+    .alias("dist_per_passenger"),
+])
+
+# speed_per_pickup_area: average speed stratified by pickup location
+pickup_stats = (
+    df.group_by("PULocationID")
+    .agg(pl.mean("avg_speed_mph").alias("speed_per_pickup_area"))
+    .rename({"PULocationID": "pickup_zone_id"}) # avoid name clash later
+)
+df = df.join(pickup_stats, left_on="PULocationID", right_on="pickup_zone_id", how="left")
+
+dropoff_stats = (
+    df.group_by("DOLocationID")
+    .agg([
+        pl.mean("avg_speed_mph").alias("dropoff_avg_speed"),
+        pl.count().alias("dropoff_trip_count"),
+    ])
+    .rename({"DOLocationID": "dropoff_zone_id"}) # avoid name clash later
+)
+
+# Join the per‑zone stats back onto every row
+df = df.join(dropoff_stats, left_on="DOLocationID", right_on="dropoff_zone_id", how="left")
+
+```
+
+:::
+::::
+
+## Summary
+
+We have seen how to deal with common workflows in both Pandas and Polars,
+starting from basic tasks like opening a dataset and inspecting it to performing
+split-apply-combine pipelines. We have seen how to use Polars to manipulate
+datasets and perform some basic feature engineering.
+
+:::{keypoints}
+
+- Dataframes are combinations of series
+- Both Pandas and Polars can be used to manipulate them
+- The expression API in Polars allows you to perform advanced operations with a
+simple DSL.
+
+:::
+
+## See also
+
+There is a lot more to Polars than what we covered in this short introduction.
+For example, queries like the ones we introduced can be performed lazily, i.e.
+just declared and then run all together, giving the backend a chance to
+optimise them. This can dramatically improve performance in the case of complex
+queries. For this and a lot more, we refer you to the official
+[documentation](https://docs.pola.rs/).
diff --git a/content/taxi_zones_xy.csv b/content/taxi_zones_xy.csv new file mode 100644 index 0000000..5d66bfc --- /dev/null +++ b/content/taxi_zones_xy.csv @@ -0,0 +1,264 @@ +LocationID,X,Y,zone,borough +1,-74.1767857452143,40.6895156480431,Newark Airport,EWR +2,-73.8261257703202,40.6257242377511,Jamaica Bay,Queens +3,-73.8494789238597,40.8658875419774,Allerton/Pelham Gardens,Bronx +4,-73.9770229219339,40.7241521436714,Alphabet City,Manhattan +5,-74.1899296712375,40.550340123832,Arden Heights,Staten Island +6,-74.0677744607421,40.5990621740821,Arrochar/Fort Wadsworth,Staten Island +7,-73.9214905669465,40.761084729151,Astoria,Queens +8,-73.9232024092836,40.7786069617704,Astoria Park,Queens +9,-73.7880202487407,40.7544109271114,Auburndale,Queens +10,-73.7916654578906,40.6781247031195,Baisley Park,Queens +11,-74.0106156305362,40.6039777088098,Bath Beach,Brooklyn +12,-74.0154903292143,40.7024884135418,Battery Park,Manhattan +13,-74.0161196692833,40.7116120831165,Battery Park City,Manhattan +14,-74.0304470508297,40.6235842793089,Bay Ridge,Brooklyn +15,-73.7879710847436,40.7852195006457,Bay Terrace/Fort Totten,Queens +16,-73.7716678221165,40.7612088345005,Bayside,Queens +17,-73.9491813010382,40.6919940962748,Bedford,Brooklyn +18,-73.8869219948557,40.8687628819908,Bedford Park,Bronx +19,-73.7278693961567,40.7364724391387,Bellerose,Queens +20,-73.8860348405346,40.8577731142544,Belmont,Bronx +21,-73.9813007259727,40.6020489840097,Bensonhurst East,Brooklyn +22,-73.9942970150052,40.6096273400171,Bensonhurst West,Brooklyn +23,-74.1594432875648,40.607504219542,Bloomfield/Emerson Hill,Staten Island +24,-73.9655685290937,40.8020327729942,Bloomingdale,Manhattan +25,-73.9864589876713,40.685614589464,Boerum Hill,Brooklyn +26,-73.9895604065528,40.6286122405859,Borough Park,Brooklyn +27,-73.9097811978144,40.5589500919216,Breezy Point/Fort Tilden/Riis Beach,Queens +28,-73.80732908405,40.710852781755,Briarwood/Jamaica Hills,Queens +29,-73.9605798481175,40.582195546571,Brighton 
Beach,Brooklyn +30,-73.8200975464803,40.6048721837899,Broad Channel,Queens +31,-73.875722045295,40.85992052114,Bronx Park,Bronx +32,-73.8646241408313,40.8644517064252,Bronxdale,Bronx +33,-73.995328797204,40.6962383270628,Brooklyn Heights,Brooklyn +34,-73.9676989686791,40.7025590381029,Brooklyn Navy Yard,Brooklyn +35,-73.9124814882782,40.6638098226151,Brownsville,Brooklyn +36,-73.9165639802734,40.6990847380764,Bushwick North,Brooklyn +37,-73.9259483634522,40.6962674524983,Bushwick South,Brooklyn +38,-73.7355495585459,40.6932955704325,Cambria Heights,Queens +39,-73.899773871952,40.6388789299243,Canarsie,Brooklyn +40,-73.9958180819094,40.6785042230919,Carroll Gardens,Brooklyn +41,-73.9520653308923,40.8042048286288,Central Harlem,Manhattan +42,-73.9395164693167,40.8210462140976,Central Harlem North,Manhattan +43,-73.9655721799594,40.7824597386361,Central Park,Manhattan +44,-74.2295465457844,40.527298175003,Charleston/Tottenville,Staten Island +45,-73.9982526213797,40.7130578275841,Chinatown,Manhattan +46,-73.7864863118305,40.8474998984855,City Island,Bronx +47,-73.8969291492323,40.8457549453718,Claremont/Bathgate,Bronx +48,-73.9898566508315,40.7622367556144,Clinton East,Manhattan +49,-73.9649337316477,40.6885158752504,Clinton Hill,Brooklyn +50,-73.9938994033218,40.7666923746668,Clinton West,Manhattan +51,-73.8304236318969,40.874061550909,Co-Op City,Bronx +52,-73.9969224713871,40.6866045417646,Cobble Hill,Brooklyn +53,-73.8440703458857,40.7819879584294,College Point,Queens +54,-74.003092790563,40.6869911475205,Columbia Street,Brooklyn +55,-73.9904741395563,40.5768996198541,Coney Island,Brooklyn +56,-73.8590533534623,40.7415986155017,Corona,Queens +57,-73.853384474855,40.7523160392058,Corona,Queens +58,-73.8207053822972,40.8414754987567,Country Club,Bronx +59,-73.8930744338338,40.8388599905968,Crotona Park,Bronx +60,-73.8897822026553,40.8312941187883,Crotona Park East,Bronx +61,-73.9412820292214,40.6738428461316,Crown Heights North,Brooklyn 
+62,-73.9492699670747,40.6670895607261,Crown Heights South,Brooklyn +63,-73.8776764323789,40.6848101947076,Cypress Hills,Brooklyn +64,-73.7313921625056,40.760631276033,Douglaston,Queens +65,-73.9855710635353,40.6953726093294,Downtown Brooklyn/MetroTech,Brooklyn +66,-73.9863827475546,40.701732471687,DUMBO/Vinegar Hill,Brooklyn +67,-74.0147358711929,40.6184544072203,Dyker Heights,Brooklyn +68,-73.9999401565074,40.7483972248318,East Chelsea,Manhattan +69,-73.9153662202336,40.8306076776139,East Concourse/Concourse Village,Bronx +70,-73.8684029143644,40.7639478203254,East Elmhurst,Queens +71,-73.9377345201578,40.6439285192804,East Flatbush/Farragut,Brooklyn +72,-73.9202100548833,40.6524606311874,East Flatbush/Remsen Village,Brooklyn +73,-73.8065843331469,40.7536974869459,East Flushing,Queens +74,-73.9383104968462,40.8055655654775,East Harlem North,Manhattan +75,-73.9449566676846,40.7906501597646,East Harlem South,Manhattan +76,-73.8784173669943,40.6586863044662,East New York,Brooklyn +77,-73.8957171569753,40.6677018744258,East New York/Pennsylvania Avenue,Brooklyn +78,-73.8866455940689,40.8460302771125,East Tremont,Bronx +79,-73.9852141243977,40.7279442789046,East Village,Manhattan +80,-73.9423293071901,40.7144695745871,East Williamsburg,Brooklyn +81,-73.8455325663154,40.8773588281637,Eastchester,Bronx +82,-73.8723440095934,40.7384639203673,Elmhurst,Queens +83,-73.889221829475,40.7401456031773,Elmhurst/Maspeth,Queens +84,-74.1739373269384,40.5320172010873,Eltingville/Annadale/Prince's Bay,Staten Island +85,-73.9522088368464,40.6472549665241,Erasmus,Brooklyn +86,-73.7541867292922,40.6025539265615,Far Rockaway,Queens +87,-74.0078121163865,40.706659617356,Financial District North,Manhattan +88,-74.0113077706368,40.7033938017209,Financial District South,Manhattan +89,-73.9626937337469,40.64098273381,Flatbush/Ditmas Park,Brooklyn +90,-73.9967775123964,40.7425461606612,Flatiron,Manhattan +91,-73.932543590688,40.6273899924426,Flatlands,Brooklyn 
+92,-73.8304471275476,40.7641272834135,Flushing,Queens +93,-73.8418927550086,40.7392346574674,Flushing Meadows-Corona Park,Queens +94,-73.900591010865,40.8582607643596,Fordham South,Bronx +95,-73.8482199362705,40.7234652723597,Forest Hills,Queens +96,-73.8760122181239,40.6957609903045,Forest Park/Highland Park,Queens +97,-73.975576580035,40.6906156556941,Fort Greene,Brooklyn +98,-73.7795502346253,40.7338412186621,Fresh Meadows,Queens +99,-74.187702737221,40.5796179453647,Freshkills Park,Staten Island +100,-73.9887858928492,40.7535140917746,Garment District,Manhattan +101,-73.7090705910379,40.7437217835813,Glen Oaks,Queens +102,-73.8819987296701,40.7026793599143,Glendale,Queens +103,-74.0451814058337,40.6898605712245,Governor's Island/Ellis Island/Liberty Island,Manhattan +103,-74.0391308720565,40.6986732840584,Governor's Island/Ellis Island/Liberty Island,Manhattan +103,-74.0187943689955,40.6881378249245,Governor's Island/Ellis Island/Liberty Island,Manhattan +106,-73.9917635756064,40.6733611676171,Gowanus,Brooklyn +107,-73.9833103532463,40.7373483763208,Gramercy,Manhattan +108,-73.9803761510937,40.5891103441533,Gravesend,Brooklyn +109,-74.1527146857649,40.5488307159092,Great Kills,Staten Island +110,-74.1258464065771,40.5432675249983,Great Kills Park,Staten Island +111,-73.990988902762,40.6521158025647,Green-Wood Cemetery,Brooklyn +112,-73.9484721972696,40.7288303471399,Greenpoint,Brooklyn +113,-73.9946282171513,40.7324859518256,Greenwich Village North,Manhattan +114,-73.9986779466048,40.7286117315904,Greenwich Village South,Manhattan +115,-74.0924861023209,40.6201275450588,Grymes Hill/Clifton,Staten Island +116,-73.9473675432227,40.827535259806,Hamilton Heights,Manhattan +117,-73.7760841015511,40.5960558298222,Hammels/Arverne,Queens +118,-74.1370701313217,40.5856310621159,Heartland Village/Todt Hill,Staten Island +119,-73.9269662109924,40.8367300313346,Highbridge,Bronx +120,-73.9308068517413,40.8460040720504,Highbridge Park,Manhattan 
+121,-73.7996883075716,40.7273471948519,Hillcrest/Pomonok,Queens +122,-73.7615641851515,40.7110684671189,Hollis,Queens +123,-73.9651983047602,40.6002048027388,Homecrest,Brooklyn +124,-73.8500050428708,40.6595667686077,Howard Beach,Queens +125,-74.0071756920052,40.7253763405368,Hudson Sq,Manhattan +126,-73.8849772729363,40.813918192187,Hunts Point,Bronx +127,-73.9203254718565,40.8650626832916,Inwood,Manhattan +128,-73.9254320855109,40.8721876093241,Inwood Hill Park,Manhattan +129,-73.8874063794919,40.7590574739636,Jackson Heights,Queens +130,-73.8003475246798,40.7032738833012,Jamaica,Queens +131,-73.7713744950727,40.720412328134,Jamaica Estates,Queens +132,-73.778263658902,40.6426045217178,JFK Airport,Queens +133,-73.9743361977043,40.6393872038878,Kensington,Brooklyn +134,-73.8299927030517,40.7087501798075,Kew Gardens,Queens +135,-73.8239855138759,40.7294099643617,Kew Gardens Hills,Queens +136,-73.9057199662352,40.8648027310875,Kingsbridge Heights,Bronx +137,-73.9771193707357,40.7403592357115,Kips Bay,Manhattan +138,-73.8728037146871,40.7748673895178,LaGuardia Airport,Queens +139,-73.7433230842567,40.6777493674309,Laurelton,Queens +140,-73.9545680421348,40.7655068182316,Lenox Hill East,Manhattan +141,-73.9597126787805,40.7668388165846,Lenox Hill West,Manhattan +142,-73.9813524137399,40.7739059853886,Lincoln Square East,Manhattan +143,-73.9879729664182,40.7757702095713,Lincoln Square West,Manhattan +144,-73.9974066841975,40.7205814515076,Little Italy/NoLiTa,Manhattan +145,-73.9486991918172,40.7465899679036,Long Island City/Hunters Point,Queens +146,-73.9336340945451,40.7545288941149,Long Island City/Queens Plaza,Queens +147,-73.8981929347315,40.8191985303455,Longwood,Bronx +148,-73.9907183626618,40.7192116742044,Lower East Side,Manhattan +149,-73.9484742088121,40.6065579859268,Madison,Brooklyn +150,-73.9428842620734,40.5802623241253,Manhattan Beach,Brooklyn +151,-73.9678083909732,40.797866268004,Manhattan Valley,Manhattan 
+152,-73.9543248746271,40.8175772001908,Manhattanville,Manhattan +153,-73.9110625766578,40.8756004059432,Marble Hill,Manhattan +154,-73.8961233069501,40.593118666611,Marine Park/Floyd Bennett Field,Brooklyn +155,-73.9067693423191,40.6177885388735,Marine Park/Mill Basin,Brooklyn +156,-74.1648593047613,40.6287483754853,Mariners Harbor,Staten Island +157,-73.9019256908442,40.7240393566008,Maspeth,Queens +158,-74.0083857570117,40.735248066679,Meatpacking/West Village West,Manhattan +159,-73.9135829897644,40.8182594605072,Melrose South,Bronx +160,-73.8807122478637,40.7185038197548,Middle Village,Queens +161,-73.9774318381095,40.7582264812217,Midtown Center,Manhattan +162,-73.9721454850812,40.7568161553126,Midtown East,Manhattan +163,-73.9783669880204,40.7644254534051,Midtown North,Manhattan +164,-73.9859288064872,40.7488076668144,Midtown South,Manhattan +165,-73.9546029482619,40.6209587000862,Midwood,Brooklyn +166,-73.9618152643826,40.8095702282723,Morningside Heights,Manhattan +167,-73.9044425941057,40.8279880356749,Morrisania/Melrose,Bronx +168,-73.9170580317151,40.8074395308644,Mott Haven/Port Morris,Bronx +169,-73.9050214600049,40.8491148219147,Mount Hope,Bronx +170,-73.9769419924356,40.7476542811031,Murray Hill,Manhattan +171,-73.8088807171629,40.7689436687832,Murray Hill-Queens,Queens +172,-74.1039265024817,40.5724654201423,New Dorp/Midland Beach,Staten Island +173,-73.8630837007824,40.7517792439212,North Corona,Queens +174,-73.8777605618466,40.8768525921671,Norwood,Bronx +175,-73.7573567716067,40.743273849336,Oakland Gardens,Queens +176,-74.1196124324174,40.5620606070131,Oakwood,Staten Island +177,-73.9111057913062,40.6770986129152,Ocean Hill,Brooklyn +178,-73.9706887790417,40.6176331814343,Ocean Parkway South,Brooklyn +179,-73.9268123582238,40.7714253418156,Old Astoria,Queens +180,-73.8493643543423,40.6751638515945,Ozone Park,Queens +181,-73.979044896845,40.672019140546,Park Slope,Brooklyn +182,-73.8579416206911,40.8374043586994,Parkchester,Bronx 
+183,-73.8318540529646,40.8495686468422,Pelham Bay,Bronx +184,-73.8046108382118,40.8647854171785,Pelham Bay Park,Bronx +185,-73.8551804331804,40.8534716586784,Pelham Parkway,Bronx +186,-73.9924553277106,40.7484763617618,Penn Station/Madison Sq West,Manhattan +187,-74.1411524820629,40.6254537199379,Port Richmond,Staten Island +188,-73.94520015759,40.6575600630782,Prospect-Lefferts Gardens,Brooklyn +189,-73.968269632125,40.6771824866237,Prospect Heights,Brooklyn +190,-73.9709500738325,40.6606075676682,Prospect Park,Brooklyn +191,-73.7411204207497,40.7142783033807,Queens Village,Queens +192,-73.8151925356493,40.7444109985,Queensboro Hill,Queens +193,-73.9402863033051,40.7617244307638,Queensbridge/Ravenswood,Queens +194,-73.9210290324994,40.7914329582306,Randalls Island,Manhattan +195,-74.009549313284,40.6754616838619,Red Hook,Brooklyn +196,-73.8640102120344,40.7236478589889,Rego Park,Queens +197,-73.8290435569064,40.6930004742218,Richmond Hill,Queens +198,-73.9019937512221,40.7045215336003,Ridgewood,Queens +199,-73.8835364013293,40.7920459432226,Rikers Island,Bronx +200,-73.9064606078974,40.8998596211143,Riverdale/North Riverdale/Fieldston,Bronx +201,-73.8472464785559,40.577468718337,Rockaway Park,Queens +202,-73.9504104879777,40.7611679139077,Roosevelt Island,Manhattan +203,-73.7367188089441,40.6595017770463,Rosedale,Queens +204,-74.2069729508607,40.5407426890056,Rossville/Woodrow,Staten Island +205,-73.7626156564313,40.6922189718589,Saint Albans,Queens +206,-74.1231637564731,40.6359593979133,Saint George/New Brighton,Staten Island +207,-73.8993250159726,40.7635113549352,Saint Michaels Cemetery/Woodside,Queens +208,-73.824886342133,40.8246861384134,Schuylerville/Edgewater Park,Bronx +209,-74.0023597244525,40.7084896914032,Seaport,Manhattan +210,-73.9443360454416,40.5937616085004,Sheepshead Bay,Brooklyn +211,-74.0013747299582,40.7238990714701,SoHo,Manhattan +212,-73.8699019343016,40.8282769856365,Soundview/Bruckner,Bronx 
+213,-73.8607910370492,40.816490762555,Soundview/Castle Hill,Bronx +214,-74.0858854521269,40.5866190655298,South Beach/Dongan Hills,Staten Island +215,-73.7903675378715,40.6941903013512,South Jamaica,Queens +216,-73.8205132705286,40.6770712325623,South Ozone Park,Queens +217,-73.9568457688564,40.703249610232,South Williamsburg,Brooklyn +218,-73.7721162803473,40.6736200279408,Springfield Gardens North,Queens +219,-73.7610079604205,40.6602284956109,Springfield Gardens South,Queens +220,-73.9118772631728,40.8819009385792,Spuyten Duyvil/Kingsbridge,Bronx +221,-74.0813347613662,40.6202239775881,Stapleton,Staten Island +222,-73.8821375169092,40.6468719097136,Starrett City,Brooklyn +223,-73.906671830047,40.7781933282005,Steinway,Queens +224,-73.9778420751297,40.7317281591558,Stuy Town/Peter Cooper Village,Manhattan +225,-73.9314448522357,40.6887850672001,Stuyvesant Heights,Brooklyn +226,-73.9293212827563,40.7352734102809,Sunnyside,Queens +227,-74.0067945517929,40.6415976370168,Sunset Park East,Brooklyn +228,-74.009960471569,40.6536182069019,Sunset Park West,Brooklyn +229,-73.9651741989212,40.7565892984893,Sutton Place/Turtle Bay North,Manhattan +230,-73.9841761608222,40.7598447400293,Times Sq/Theatre District,Manhattan +231,-74.0067116243899,40.7186956410077,TriBeCa/Civic Center,Manhattan +232,-73.9823067049464,40.7153860841071,Two Bridges/Seward Park,Manhattan +233,-73.9712559771296,40.7491720982236,UN/Turtle Bay South,Manhattan +234,-73.9904776189691,40.7403134645878,Union Sq,Manhattan +235,-73.9149683294755,40.8535029301211,University Heights/Morris Heights,Bronx +236,-73.9569723853064,40.7804914757347,Upper East Side North,Manhattan +237,-73.9656914919459,40.7685418846646,Upper East Side South,Manhattan +238,-73.9728145174503,40.7917662467824,Upper West Side North,Manhattan +239,-73.9782733418486,40.7841073645538,Upper West Side South,Manhattan +240,-73.8790671945698,40.8947464191253,Van Cortlandt Park,Bronx +241,-73.8964501437395,40.8759758476571,Van Cortlandt 
Village,Bronx +242,-73.8398789614734,40.8495965844009,Van Nest/Morris Park,Bronx +243,-73.9328243182356,40.8586702866313,Washington Heights North,Manhattan +244,-73.9416668140575,40.8412186698664,Washington Heights South,Manhattan +245,-74.1031706067521,40.628478475102,West Brighton,Staten Island +246,-74.004512572075,40.7524372519647,West Chelsea/Hudson Yards,Manhattan +247,-73.9250591826957,40.8292114745072,West Concourse,Bronx +248,-73.8710104534108,40.8346771999308,West Farms/Bronx River,Bronx +249,-74.0024969086272,40.7346115559814,West Village,Manhattan +250,-73.8494786753898,40.8324906886295,Westchester Village/Unionport,Bronx +251,-74.1232368409925,40.6196055161537,Westerleigh,Staten Island +252,-73.8153942986072,40.788360648021,Whitestone,Queens +253,-73.8414745485909,40.7600863368444,Willets Point,Queens +254,-73.8582696518177,40.8832233292992,Williamsbridge/Olinville,Bronx +255,-73.9571337756889,40.7188341922346,Williamsburg (North Side),Brooklyn +256,-73.9591078838642,40.7109771296904,Williamsburg (South Side),Brooklyn +257,-73.9772393111852,40.6536644952118,Windsor Terrace,Brooklyn +258,-73.8566390530717,40.6901263678129,Woodhaven,Queens +259,-73.8563511172889,40.8991027731978,Woodlawn/Wakefield,Bronx +260,-73.9037132789432,40.7467977944692,Woodside,Queens +261,-74.0129193755126,40.708975618892,World Trade Center,Manhattan +262,-73.9458298180079,40.7765342289951,Yorkville East,Manhattan +263,-73.9512079916544,40.7784958687768,Yorkville West,Manhattan \ No newline at end of file diff --git a/content/visualisation.md b/content/visualisation.md new file mode 100644 index 0000000..74a0103 --- /dev/null +++ b/content/visualisation.md @@ -0,0 +1,93 @@ +# Visualisations and dashboards + +:::{questions} +- What syntax is used to make a lesson? +- How do you structure a lesson effectively for teaching? +- `questions` are at the top of a lesson and provide a starting + point for what you might learn. It is usually a bulleted list. 
+::: + +:::{objectives} +- Show a complete lesson page with all of the most common + structures. +- ... + +This is also a holdover from the carpentries-style. It could +usually be left off. +::: + + + + +The introduction should be a high level overview of what is on the +page and why it is interesting. + + +The lines below (only in the source) will set the default highlighting +language for the entire page. + +:::{highlight} python +::: + + + +## Section + +A section. + +:::{discussion} +Discuss the following. + +- A discussion section +- Another discussion topic +::: + + + +## Section + +``` +print("hello world") +# This uses the default highlighting language +``` + +```python +print("hello world) +``` + + + +## Exercises: description + +:::{exercise} Exercise Topic-1: imperative description of exercise +Exercise text here. +::: + +:::{solution} +Solution text here +::: + + + +## Summary + +A Summary of what you learned and why it might be useful. Maybe a +hint of what comes next. + + + +## See also + +- Other relevant links +- Other link + + + +:::{keypoints} +- What the learner should take away +- point 2 +- ... + +This is another holdover from the carpentries style. This perhaps +is better done in a "summary" section. +::: diff --git a/content/yellow_tripdata_2025-01.parquet b/content/yellow_tripdata_2025-01.parquet new file mode 100644 index 0000000..f89625d Binary files /dev/null and b/content/yellow_tripdata_2025-01.parquet differ