From cd86aa3ecb1189863635fad9dabffb5124ef87ff Mon Sep 17 00:00:00 2001
From: Alifia Rahmah
Date: Mon, 25 Nov 2024 00:37:37 +0700
Subject: [PATCH 1/4] Change pd.set_option('display.mpl_style', 'default') to
 use plt.style.use('default')

pd.set_option('display.mpl_style', 'default') is deprecated. Replaced it
with plt.style.use('default'). Chapter 7 never imported matplotlib, so the
missing import is added there as well.
---
 content/pandas cookbook/chapter1.md |  2 +-
 content/pandas cookbook/chapter2.md |  6 +++---
 content/pandas cookbook/chapter3.md |  6 +++---
 content/pandas cookbook/chapter4.md | 14 +++++++-------
 content/pandas cookbook/chapter5.md | 12 ++++++------
 content/pandas cookbook/chapter6.md |  4 ++--
 content/pandas cookbook/chapter7.md | 21 +++++++++++----------
 7 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/content/pandas cookbook/chapter1.md b/content/pandas cookbook/chapter1.md
index 54a2186..600af1c 100644
--- a/content/pandas cookbook/chapter1.md
+++ b/content/pandas cookbook/chapter1.md
@@ -26,7 +26,7 @@ keywords:
 import pandas as pd
 import matplotlib.pyplot as plt
 
-pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
+plt.style.use('default') # Make the graphs a bit prettier
 plt.rcParams['figure.figsize'] = (15, 5)
 ```
diff --git a/content/pandas cookbook/chapter2.md b/content/pandas cookbook/chapter2.md
index 2b3961e..93c1af9 100644
--- a/content/pandas cookbook/chapter2.md
+++ b/content/pandas cookbook/chapter2.md
@@ -26,11 +26,11 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 
 # Make the graphs a bit prettier, and bigger
-pd.set_option('display.mpl_style', 'default')
+plt.style.use('default')
 
-# This is necessary to show lots of columns in pandas 0.12. 
+# This is necessary to show lots of columns in pandas 0.12.
 # Not necessary in pandas 0.13.
-pd.set_option('display.width', 5000) 
+pd.set_option('display.width', 5000)
 pd.set_option('display.max_columns', 60)
 
 plt.rcParams['figure.figsize'] = (15, 5)
diff --git a/content/pandas cookbook/chapter3.md b/content/pandas cookbook/chapter3.md
index e6098c7..3c3cc05 100644
--- a/content/pandas cookbook/chapter3.md
+++ b/content/pandas cookbook/chapter3.md
@@ -27,13 +27,13 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
 
 # Make the graphs a bit prettier, and bigger
-pd.set_option('display.mpl_style', 'default')
+plt.style.use('default')
 
 plt.rcParams['figure.figsize'] = (15, 5)
 
-# This is necessary to show lots of columns in pandas 0.12. 
+# This is necessary to show lots of columns in pandas 0.12.
 # Not necessary in pandas 0.13.
-pd.set_option('display.width', 5000) 
+pd.set_option('display.width', 5000)
 pd.set_option('display.max_columns', 60)
 ```
diff --git a/content/pandas cookbook/chapter4.md b/content/pandas cookbook/chapter4.md
index bb4c51f..b0e7c47 100644
--- a/content/pandas cookbook/chapter4.md
+++ b/content/pandas cookbook/chapter4.md
@@ -24,13 +24,13 @@ keywords:
 import pandas as pd
 import matplotlib.pyplot as plt
 
-pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
+plt.style.use('default') # Make the graphs a bit prettier
 plt.rcParams['figure.figsize'] = (15, 5)
 plt.rcParams['font.family'] = 'sans-serif'
 
-# This is necessary to show lots of columns in pandas 0.12. 
+# This is necessary to show lots of columns in pandas 0.12.
 # Not necessary in pandas 0.13.
-pd.set_option('display.width', 5000) 
+pd.set_option('display.width', 5000)
 pd.set_option('display.max_columns', 60)
 ```
 
@@ -232,7 +232,7 @@ Output:
 This turns out to be really easy!
 Dataframes have a [.groupby()](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) method that is similar to [SQL groupby](https://docs.microsoft.com/en-us/sql/t-sql/queries/select-group-by-transact-sql), if you're familiar with that. I'm not going to explain more about it right now -- if you want to know more, [the documentation](http://pandas.pydata.org/pandas-docs/stable/groupby.html) is really good.
 
-In this case, `berri_bikes.groupby('weekday').aggregate(sum)` means 
+In this case, `berri_bikes.groupby('weekday').aggregate(sum)` means
 
 > "Group the rows by weekday and then add up all the values with the same weekday."
 
@@ -360,9 +360,9 @@ Let's put all that together, to prove how easy it is. 6 lines of magical pandas!
 If you want to play around, try changing sum to max, numpy.median, or any other function you like.
 
 ```python
-bikes = pd.read_csv('../data/bikes.csv', 
-                    sep=';', encoding='latin1', 
-                    parse_dates=['Date'], dayfirst=True, 
+bikes = pd.read_csv('../data/bikes.csv',
+                    sep=';', encoding='latin1',
+                    parse_dates=['Date'], dayfirst=True,
                     index_col='Date')
 # Add the weekday column
 berri_bikes = bikes[['Berri 1']].copy()
diff --git a/content/pandas cookbook/chapter5.md b/content/pandas cookbook/chapter5.md
index 350b412..8a06e4d 100644
--- a/content/pandas cookbook/chapter5.md
+++ b/content/pandas cookbook/chapter5.md
@@ -24,7 +24,7 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
 
-pd.set_option('display.mpl_style', 'default')
+plt.style.use('default')
 plt.rcParams['figure.figsize'] = (15, 3)
 plt.rcParams['font.family'] = 'sans-serif'
@@ -1621,11 +1621,11 @@ I had to write '\xb0' for that degree character °. Let's fix up the columns. We
 
 ```python
 weather_mar2012.columns = [
-    u'Year', u'Month', u'Day', u'Time', u'Data Quality', u'Temp (C)', 
-    u'Temp Flag', u'Dew Point Temp (C)', u'Dew Point Temp Flag', 
-    u'Rel Hum (%)', u'Rel Hum Flag', u'Wind Dir (10s deg)', u'Wind Dir Flag', 
+    u'Year', u'Month', u'Day', u'Time', u'Data Quality', u'Temp (C)',
+    u'Temp Flag', u'Dew Point Temp (C)', u'Dew Point Temp Flag',
+    u'Rel Hum (%)', u'Rel Hum Flag', u'Wind Dir (10s deg)', u'Wind Dir Flag',
     u'Wind Spd (km/h)', u'Wind Spd Flag', u'Visibility (km)', u'Visibility Flag',
-    u'Stn Press (kPa)', u'Stn Press Flag', u'Hmdx', u'Hmdx Flag', u'Wind Chill', 
+    u'Stn Press (kPa)', u'Stn Press Flag', u'Hmdx', u'Hmdx Flag', u'Wind Chill',
     u'Wind Chill Flag', u'Weather']
 ```
@@ -1866,7 +1866,7 @@ temperatures.groupby('Hour').aggregate(np.median).plot()
 Output:
 
 ```bash
-Date/Time 
+Date/Time
 2012-03-01 00:00:00    -5.5
 2012-03-01 01:00:00    -5.7
 2012-03-01 02:00:00    -5.4
diff --git a/content/pandas cookbook/chapter6.md b/content/pandas cookbook/chapter6.md
index ea3404d..855b008 100644
--- a/content/pandas cookbook/chapter6.md
+++ b/content/pandas cookbook/chapter6.md
@@ -9,7 +9,7 @@ prev: /pandas-cookbook/chapter5
 title: Chapter 6 - String Operations
 weight: 35
 url: /pandas-cookbook/chapter6
-description: String Operations in pandas. Using resampling and plotting temperature. 
+description: String Operations in pandas. Using resampling and plotting temperature.
 keywords:
 - pandas
 - string
@@ -23,7 +23,7 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
 
-pd.set_option('display.mpl_style', 'default')
+plt.style.use('default')
 plt.rcParams['figure.figsize'] = (15, 3)
 plt.rcParams['font.family'] = 'sans-serif'
 ```
diff --git a/content/pandas cookbook/chapter7.md b/content/pandas cookbook/chapter7.md
index 2038876..2e5210b 100644
--- a/content/pandas cookbook/chapter7.md
+++ b/content/pandas cookbook/chapter7.md
@@ -21,12 +21,13 @@ keywords:
 import pandas as pd
+import matplotlib.pyplot as plt
 
 # Make the graphs a bit prettier, and bigger
-pd.set_option('display.mpl_style', 'default')
+plt.style.use('default')
 figsize(15, 5)
 
 # Always display all the columns
-pd.set_option('display.line_width', 5000) 
-pd.set_option('display.max_columns', 60) 
+pd.set_option('display.line_width', 5000)
+pd.set_option('display.max_columns', 60)
 ```
 
 One of the main problems with messy data is: how do you know if it's messy or not?
@@ -62,7 +62,7 @@ Some of the problems:
 - There are nans
 - Some of the zip codes are 29616-0759 or 83
 - There are some N/A values that pandas didn't recognize, like 'N/A' and 'NO CLUE'
- 
+
 What we can do:
 - Normalize 'N/A' and 'NO CLUE' into regular nan values
@@ -1003,20 +1003,20 @@ Here's what we ended up doing to clean up our zip codes, all together:
 
 ```python
 na_values = ['NO CLUE', 'N/A', '0']
-requests = pd.read_csv('311-service-requests.csv', 
-                       na_values=na_values, 
+requests = pd.read_csv('311-service-requests.csv',
+                       na_values=na_values,
                        dtype={'Incident Zip': str})
 
 def fix_zip_codes(zips):
-    # Truncate everything to length 5 
+    # Truncate everything to length 5
     zips = zips.str.slice(0, 5)
-    
+
     # Set 00000 zip codes to nan
     zero_zips = zips == '00000'
     zips[zero_zips] = np.nan
-    
+
     return zips
-    
+
 requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])
 requests['Incident Zip'].unique()
 ```

From 4c083be5403ee4c27cca82095a1e582f55098945 Mon Sep 17 00:00:00 2001
From: Alifia Rahmah
Date: Mon, 25 Nov 2024 00:43:36 +0700
Subject: [PATCH 2/4] Update chapter5.md

The dataset at the URL seems to have been updated, so the code in the
guide is modified to stay relevant to the updated dataset.
---
 content/pandas cookbook/chapter5.md | 31 ++++++++---------------------
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/content/pandas cookbook/chapter5.md b/content/pandas cookbook/chapter5.md
index 8a06e4d..2de5302 100644
--- a/content/pandas cookbook/chapter5.md
+++ b/content/pandas cookbook/chapter5.md
@@ -64,7 +64,7 @@
 To get the data for March 2012, we need to format it with month=3, year=2012.
 
 ```python
 url = url_template.format(month=3, year=2012)
-weather_mar2012 = pd.read_csv(url, index_col='Date/Time', parse_dates=True)
+weather_mar2012 = pd.read_csv(url, index_col='Date/Time (LST)', parse_dates=True)
 ```
 
 This is super great! We can just use the same read_csv function as before, and just give it a URL as a filename. Awesome.
@@ -1604,7 +1604,7 @@ Output:
 
 Let's plot it!
 
 ```python
-weather_mar2012[u"Temp (\xc2\xb0C)"].plot(figsize=(15, 5))
+weather_mar2012[u"Temp (°C)"].plot(figsize=(15, 5))
 ```
 
 Output:
@@ -1617,18 +1617,6 @@ Notice how it goes up to 25° C in the middle there? That was a big deal. It was
 
 And I was out of town and I missed it. Still sad, humans.
 
-I had to write '\xb0' for that degree character °. Let's fix up the columns. We're going to just print them out, copy, and fix them up by hand. 
-
-```python
-weather_mar2012.columns = [
-    u'Year', u'Month', u'Day', u'Time', u'Data Quality', u'Temp (C)',
-    u'Temp Flag', u'Dew Point Temp (C)', u'Dew Point Temp Flag',
-    u'Rel Hum (%)', u'Rel Hum Flag', u'Wind Dir (10s deg)', u'Wind Dir Flag',
-    u'Wind Spd (km/h)', u'Wind Spd Flag', u'Visibility (km)', u'Visibility Flag',
-    u'Stn Press (kPa)', u'Stn Press Flag', u'Hmdx', u'Hmdx Flag', u'Wind Chill',
-    u'Wind Chill Flag', u'Weather']
-```
-
 You'll notice in the summary above that there are a few columns which are either entirely empty or only have a few values in them. Let's get rid of all of those with dropna.
 
 The argument `axis=1` to `dropna` means "drop columns, not rows", and `how='any'` means "drop the column if any value is null".
 
@@ -1758,12 +1746,12 @@ Output:
 
-The Year/Month/Day/Time columns are redundant, though, and the Data Quality column doesn't look too useful. Let's get rid of those.
+The Year/Month/Day/Time columns are redundant, though. Let's get rid of those.
 
 The `axis=1` argument means "Drop columns", like before. The default for operations like `dropna` and `drop` is always to operate on rows.
 
 ```python
-weather_mar2012 = weather_mar2012.drop(['Year', 'Month', 'Day', 'Time', 'Data Quality'], axis=1)
+weather_mar2012 = weather_mar2012.drop(['Year', 'Month', 'Day', 'Time (LST)'], axis=1)
 weather_mar2012[:5]
 ```
 
 Output:
@@ -1857,7 +1845,7 @@ Awesome! We now only have the relevant columns, and it's much more manageable.
 This one's just for fun -- we've already done this before, using groupby and aggregate! We will learn whether or not it gets colder at night. Well, obviously. But let's do it anyway.
 
 ```python
-temperatures = weather_mar2012[[u'Temp (C)']].copy()
+temperatures = weather_mar2012[[u'Temp (°C)']].copy()
 print(temperatures.head())
 temperatures.loc[:,'Hour'] = weather_mar2012.index.hour
 temperatures.groupby('Hour').aggregate(np.median).plot()
 ```
 
@@ -1948,13 +1936,10 @@ I noticed that there's an irritating bug where when I ask for January, it gives
 
 ```python
 def download_weather_month(year, month):
-    if month == 1:
-        year += 1
     url = url_template.format(year=year, month=month)
-    weather_data = pd.read_csv(url, skiprows=15, index_col='Date/Time', parse_dates=True, header=True)
+    weather_data = pd.read_csv(url, index_col='Date/Time (LST)', parse_dates=True)
     weather_data = weather_data.dropna(axis=1)
-    weather_data.columns = [col.replace('\xb0', '') for col in weather_data.columns]
-    weather_data = weather_data.drop(['Year', 'Day', 'Month', 'Time', 'Data Quality'], axis=1)
+    weather_data = weather_data.drop(['Year', 'Month', 'Day', 'Time (LST)'], axis=1)
     return weather_data
 ```
 
@@ -2050,7 +2035,7 @@ Output:
 Now we can get all the months at once. This will take a little while to run.
 
 ```python
-data_by_month = [download_weather_month(2012, i) for i in range(1, 13)]
+data_by_month = [download_weather_month(2012, i) for i in range(1, 13)]  # all 12 months
 ```
 
 Once we have this, it's easy to concatenate all the dataframes together into one big dataframe using [pd.concat](http://pandas.pydata.org/pandas-docs/version/0.20/generated/pandas.concat.html). And now we have the whole year's data!
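A side note on this chapter5.md update before the next patch: the rename from 'Date/Time' to 'Date/Time (LST)' is exactly the kind of change that could happen again, so a more defensive `download_weather_month` could detect the timestamp column instead of hard-coding its name. The sketch below is only an illustration of that idea, not part of the patch; it assumes the `url_template` defined earlier in the chapter, and the column detection is a hypothetical addition.

```python
import pandas as pd

def download_weather_month(year, month):
    # Rename-tolerant sketch (assumption: url_template is defined as in
    # chapter 5). The dataset has called its timestamp column both
    # 'Date/Time' and 'Date/Time (LST)', so find whichever is present
    # instead of hard-coding the name.
    url = url_template.format(year=year, month=month)
    weather_data = pd.read_csv(url)
    date_col = next(col for col in weather_data.columns
                    if col.startswith('Date/Time'))
    weather_data[date_col] = pd.to_datetime(weather_data[date_col])
    weather_data = weather_data.set_index(date_col)
    # Same cleanup as the patched version: drop columns with missing
    # values, then whichever redundant date-part columns exist under
    # either naming scheme.
    weather_data = weather_data.dropna(axis=1, how='any')
    redundant = [col for col in ('Year', 'Month', 'Day', 'Time', 'Time (LST)')
                 if col in weather_data.columns]
    return weather_data.drop(redundant, axis=1)
```

Called the same way as in the chapter, e.g. `download_weather_month(2012, 3)`, this variant should keep working whether or not the '(LST)' suffix is present.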
From 193282e2df7c73f6c942eb8b2f328acd75d045c3 Mon Sep 17 00:00:00 2001 From: Alifia Rahmah Date: Mon, 25 Nov 2024 00:51:00 +0700 Subject: [PATCH 3/4] Update chapter6.md - Update the code to match updated datasets - Changed resample from .resample('M').apply(np.median) to .resample('ME').apply('median') (same as means) for deprecation fixes --- content/pandas cookbook/chapter6.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/content/pandas cookbook/chapter6.md b/content/pandas cookbook/chapter6.md index 855b008..1a02adb 100644 --- a/content/pandas cookbook/chapter6.md +++ b/content/pandas cookbook/chapter6.md @@ -165,7 +165,7 @@ Output: If we wanted the median temperature each month, we could use the [resample()](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.resample.html) method like this: ```python -weather_2012['Temp (C)'].resample('M').apply(np.median).plot(kind='bar') +weather_2012['Temp (°C)'].resample('ME').apply('median').plot(kind='bar') ``` Output: @@ -202,7 +202,7 @@ Name: Weather, dtype: float64 and then use resample to find the percentage of time it was snowing each month ```python -is_snowing.astype(float).resample('M').apply(np.mean) +is_snowing.astype(float).resample('ME').apply('mean') ``` Output: @@ -225,7 +225,7 @@ Freq: M, Name: Weather, dtype: float64 ``` ```python -is_snowing.astype(float).resample('M').apply(np.mean).plot(kind='bar') +is_snowing.astype(float).resample('ME').apply('mean').plot(kind='bar') ``` Output: @@ -242,9 +242,9 @@ So now we know! In 2012, December was the snowiest month. Also, this graph sugge We can also combine these two statistics (temperature, and snowiness) into one dataframe and plot them together: ```python -temperature = weather_2012['Temp (C)'].resample('M').apply(np.median) +temperature = weather_2012['Temp (°C)'].resample('ME').apply('median') is_snowing = weather_2012['Weather'].str.contains('Snow') -snowiness = is_snowing.astype(float).resample('M').apply(np.mean) +snowiness = is_snowing.astype(float).resample('ME').apply('mean') # Name the columns temperature.name = "Temperature" From 603846b433b97bb84610d9495e46c8a7017aedcd Mon Sep 17 00:00:00 2001 From: Alifia Rahmah Date: Mon, 25 Nov 2024 00:59:39 +0700 Subject: [PATCH 4/4] Update chapter7.md - Change pd.set_option('display.width', 5000) to pd.set_option('display.width', 5000) for deprecation fix - Use requests.loc to replace requests['Incident Zip'] with value '00000' with np.nan for addressing pandas copy warnings - Delete unique_zips.sort() because of error sorting array of strings and nan - Add .dropna() - Change .sort() to sort_values() --- content/pandas cookbook/chapter7.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/content/pandas cookbook/chapter7.md b/content/pandas cookbook/chapter7.md index 2e5210b..35e7a50 100644 --- a/content/pandas cookbook/chapter7.md +++ b/content/pandas cookbook/chapter7.md @@ -25,7 +25,7 @@ plt.style.use('default') figsize(15, 5) # Always display all the columns -pd.set_option('display.line_width', 5000) +pd.set_option('display.width', 5000) pd.set_option('display.max_columns', 60) ``` @@ -767,15 +767,13 @@ Output: This looks bad to me. Let's set these to nan. ```python -zero_zips = requests['Incident Zip'] == '00000' -requests['Incident Zip'][zero_zips] = np.nan +requests.loc[requests['Incident Zip'] == '00000', 'Incident Zip'] = np.nan ``` Great. 
 Let's see where we are now:
 
 ```python
 unique_zips = requests['Incident Zip'].unique()
-unique_zips.sort()
 unique_zips
 ```
 
 Output:
@@ -830,7 +828,7 @@ zips = requests['Incident Zip']
 is_close = zips.str.startswith('0') | zips.str.startswith('1')
 # There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're True
 is_far = ~(is_close.fillna(True).astype(bool))
-zips[is_far]
+zips.loc[is_far].dropna()
 ```
 
 Output:
@@ -956,7 +954,7 @@ Output:
 Okay, there really are requests coming from LA and Houston! Good to know. Filtering by zip code is probably a bad way to handle this -- we should really be looking at the city instead.
 
 ```python
-requests['City'].str.upper().value_counts()
+requests[is_far][['Incident Zip', 'Descriptor', 'City']].dropna().sort_values('Incident Zip')
 ```
 
 Output: