|
| 1 | +import pandas as pd |
| 2 | +import numpy as np |
| 3 | +import os |
# Show every column when DataFrames are printed.
pd.set_option('display.max_columns', None)
# Work from the parent of this script's folder so the relative data paths
# resolve. abspath() guards against a relative __file__, for which the nested
# dirname() would be '' and os.chdir('') would raise.
os.chdir(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
| 6 | + |
# %%
# Directories (relative to the working directory)
data_dir = 'data'
raw_dir, interim_dir, processed_dir = (
    f'{data_dir}/{stage}' for stage in ('raw', 'interim', 'processed')
)
| 13 | + |
# %%
# Load data: household, person, unlinked-trip and linked-trip tables,
# read in that order from the raw directory.
# day = pd.read_csv(f'{raw_dir}/ex_day.csv')
hh, person, trips, ltrips = (
    pd.read_csv(f'{raw_dir}/{table}.csv')
    for table in ('ex_hh', 'ex_person', 'ex_trip_unlinked', 'ex_trip_linked')
)
| 21 | + |
# %% Rename variables
# Give the linked-trip weight column a shorter name.
ltrips = ltrips.rename(columns={'linked_trip_weight': 'ltrip_weight'})
| 28 | + |
# Re-map values to labels
# One code -> label lookup per person variable. Consecutive codes starting
# at 1 are generated with enumerate(); special codes are listed explicitly.
person_mapping = {
    'gender': {
        1: 'Female',
        2: 'Male',
        3: 'Non-binary',
        995: 'Missing',
        997: 'Other',
        999: 'PNTA',
    },
    'age': dict(enumerate(
        ('Under5', '5-10', '11-15', '16-17', '18-24', '25-34', '35-44',
         '45-54', '55-64', '65-74', '75-84', '85plus'),
        start=1,
    )),
    'bike_freq': {
        **dict(enumerate(
            ('6-7days', '5days', '4days', '3days', '2days', '1day',
             '1-3month', '<1month'),
            start=1,
        )),
        996: 'Never',
        995: 'Missing',
    },
    'bike_attitude': {
        **dict(enumerate(
            ('PhysicallyUnable', 'DoesNotKnowHow', 'NotInterested',
             'WantBikeLess', 'HappyWithCurrent', 'WantBikeMore'),
            start=1,
        )),
        995: 'Missing',
    },
}
| 36 | + |
# Re-map to broader categories
# Bike-frequency label -> coarse frequency group.
bike_freq_map = {
    **dict.fromkeys(('6-7days', '5days', '4days'), 'VeryFrequent'),
    **dict.fromkeys(('3days', '2days', '1day'), 'Occasional'),
    **dict.fromkeys(('1-3month', '<1month'), 'Infrequent'),
    'Never': 'Never',
    'Missing': 'Missing',
}

# Bike-attitude label -> group; the first three labels are pooled, the
# remaining labels map to themselves.
bike_att_bin = {
    **dict.fromkeys(
        ('PhysicallyUnable', 'DoesNotKnowHow', 'NotInterested'),
        'NotInterestedCant',
    ),
    **{
        label: label
        for label in ('WantBikeLess', 'HappyWithCurrent', 'WantBikeMore', 'Missing')
    },
}
| 60 | + |
# Income code -> Low / Mid / High group; codes 995 and 999 count as Missing.
income_bin = {
    **dict.fromkeys((1, 2), 'Low'),
    **dict.fromkeys((3, 4), 'Mid'),
    **dict.fromkeys((5, 6), 'High'),
    **dict.fromkeys((995, 999), 'Missing'),
}

# Income code -> numeric value; codes 995 and 999 become NaN.
income_num = dict(zip(
    (1, 2, 3, 4, 5, 6, 995, 999),
    (25000, 37500, 62500, 87500, 150000, 200000, np.nan, np.nan),
))
| 82 | + |
# Age label -> broader age band.
age_bin = {
    **dict.fromkeys(('Under5', '5-10'), 'Under10'),
    **dict.fromkeys(('11-15', '16-17'), '10-17'),
    **dict.fromkeys(('18-24', '25-34'), '18-34'),
    **dict.fromkeys(('35-44', '45-54', '55-64'), '35-64'),
    **dict.fromkeys(('65-74', '75-84', '85plus'), '65+'),
}
| 97 | + |
# Age label -> single numeric age used for that group.
# Keys must match the age labels exactly: the second key was previously
# spelled '5_10', which never matched the '5-10' label and left age_num
# missing for that whole group.
age_num = {
    'Under5': 2.5,
    '5-10': 8,
    '11-15': 13,
    '16-17': 16,
    '18-24': 21,
    '25-34': 30,
    '35-44': 40,
    '45-54': 50,
    '55-64': 60,
    '65-74': 70,
    '75-84': 80,
    '85plus': 85,
}
| 112 | + |
# Gender: collapse the survey codes into three numeric groups.
gender_code = person['gender']
person['gender_num'] = np.select(
    [gender_code.isin([1, 3, 997]), gender_code == 2],
    [1, 2],      # 1 = Female & Others, 2 = Male
    default=3,   # every other value is treated as Missing
)
person['gender_lab'] = gender_code.map(person_mapping['gender'])
gender_groups = {1: 'Female & Others', 2: 'Male', 3: 'Missing'}
person['gender_bin'] = person['gender_num'].map(gender_groups)

# Age, bike frequency and bike attitude: label the raw code first, then
# derive the grouped / numeric columns from that label. Order matters because
# later rows read columns created by earlier ones.
for new_col, src_col, lookup in (
    ('age_lab', 'age', person_mapping['age']),
    ('age_bin', 'age_lab', age_bin),
    ('age_num', 'age_lab', age_num),
    ('bike_freq_lab', 'bike_freq', person_mapping['bike_freq']),
    ('bike_freq_bin', 'bike_freq_lab', bike_freq_map),
    ('bike_att', 'bike_attitude', person_mapping['bike_attitude']),
    ('bike_att_bin', 'bike_att', bike_att_bin),
):
    person[new_col] = person[src_col].map(lookup)

# Household income: grouped label and numeric value from the same code.
for new_col, lookup in (('income_bin', income_bin), ('income_num', income_num)):
    hh[new_col] = hh['income_broad'].map(lookup)
| 132 | + |
# %%
# Define comfort variable
# Every person column whose name contains 'bike_comfort'.
comfort_cols = [col for col in person.columns if 'bike_comfort' in col]
likert_scale = {1: 4, 2: 3, 3: 2, 4: 1}  # Reverse scoring for comfort
person[comfort_cols] = person[comfort_cols].replace(likert_scale)

# Row-wise mean of the comfort items, ignoring code 995 (presumably the
# survey's missing-response code - confirm against the codebook).
# np.nan is used instead of pd.NA so the columns stay numeric (float) rather
# than being upcast to object dtype before mean()/fillna().
# Rows with no usable item get 0, which acts as the "no answer" sentinel below.
person['avg_comfort'] = (
    person[comfort_cols]
    .replace(995, np.nan)
    .mean(axis=1, skipna=True)
    .fillna(0)
)
comfort_labels = ['Uncomfortable', 'Comfortable']
# avg >= 3 -> Comfortable; any other non-zero avg -> Uncomfortable;
# the 0 sentinel -> missing.
person['comfort_bin'] = np.where(
    person['avg_comfort'] >= 3, 'Comfortable',
    np.where(person['avg_comfort'] != 0, 'Uncomfortable', pd.NA)
)
| 144 | + |
# %%
# Identify complete rmove households (for comfort classification - 04_classify.py)
print(f"Total households: {hh['hh_weight'].sum():,.2f}")

# Households whose diary platform is 'rmove'.
uses_rmove = hh['diary_platform'] == 'rmove'
hh_rmove = hh.loc[uses_rmove]
print(f"Total households with rmove: {hh_rmove['hh_weight'].sum():,.2f}")

# Of those, households with exactly 7 complete days.
has_full_week = hh_rmove['num_days_complete'] == 7
hh_rmove_complete = hh_rmove.loc[has_full_week]
print(f"Total number of households with 7 complete days: {hh_rmove_complete['hh_weight'].sum():,.2f}")
print(f"Total sample size of households with 7 complete days: {hh_rmove_complete.shape[0]:,}")

bike_complete_hh_ids = hh_rmove_complete['hh_id'].drop_duplicates().tolist()

# Add flag to identify complete records: 1 when the person's household is in
# the list above, otherwise 0.
person['bike_complete_flag'] = (
    person['hh_id'].isin(bike_complete_hh_ids).astype('int64')
)
| 161 | + |
# %% Merge bike trips with person table
# Count bike trips: linked trips with mode 11, counted per person (unweighted).
bike_trip_mask = (ltrips['linked_trip_mode'] == 11)
bike_trips = ltrips[bike_trip_mask].copy()
bike_counts = bike_trips.groupby('person_id').size().reset_index(name='bike_trips')

# Keep relevant cols
per_cols = ['hh_id', 'person_id', 'age_lab', 'age_bin', 'age_num',
            'gender_bin', 'num_days_complete', 'bike_complete_flag',
            'avg_comfort', 'comfort_bin',
            'student', 'person_weight']
# All remaining person columns with 'bike' in the name. Columns already listed
# in per_cols (e.g. 'bike_complete_flag') are skipped; otherwise
# per_cols + bike_cols_per selects them twice and the exported table carries a
# duplicated column.
bike_cols_per = [col for col in person.columns
                 if 'bike' in col and col not in per_cols]

# Merge bike trip info with person data; people without a bike trip get 0.
person_btrips = pd.merge(
    person[per_cols + bike_cols_per],
    bike_counts,
    on='person_id',
    how='left',
    validate='1:1'
).fillna({'bike_trips': 0})

# Merge with hh data
hh_cols = ['hh_id', 'num_bicycle_adult', 'num_bicycle_child', 'income_broad', 'income_bin', 'income_num']
person_btrips = pd.merge(
    person_btrips,
    hh[hh_cols],
    on='hh_id',
    how='left'
)

# Add dummy if person reported bike trip
person_btrips['recorded_btrip'] = np.where(person_btrips['bike_trips'] > 0, 1, 0)
| 195 | + |
# %% Merge bike trips with person info
# One row per bike trip: attach the person columns first, then add hh data
# through the hh_id that comes from the person table.
btrips_person = (
    bike_trips.drop(columns=['hh_id'])
    .merge(person[per_cols + bike_cols_per], how='left', on='person_id')
    .merge(hh[hh_cols], how='left', on='hh_id')
)

# Add bike type to bike trip info
# (bike type is in the unlinked trip table, so keep only the unlinked trips
# that belong to a bike linked trip)
ltrip_bike_ids = bike_trips['linked_trip_id'].to_list()
in_bike_ltrip = trips['linked_trip_id'].isin(ltrip_bike_ids)
subset = trips.loc[in_bike_ltrip]
| 215 | + |
bike_modes = [2, 3, 4, 5, 69, 70, 82, 103, 300]  # bike modes and hierarchy

# Rank of each bike mode = its position in bike_modes (later = higher).
# This makes the hierarchy follow the declared order instead of relying on the
# numeric size of the codes; for the list above both give the same result.
_bike_mode_rank = {mode: rank for rank, mode in enumerate(bike_modes)}


def get_highest_bike_mode(modes):
    """Return the highest-ranked bike mode found in *modes*.

    Parameters
    ----------
    modes : iterable
        Mode codes to search (here: the ``mode_1`` values of one linked trip).

    Returns
    -------
    The element of *modes* that ranks highest in ``bike_modes``, or ``None``
    when *modes* contains no bike mode (including when it is empty).
    """
    found_bikes = [m for m in modes if m in _bike_mode_rank]
    if not found_bikes:
        return None
    return max(found_bikes, key=_bike_mode_rank.__getitem__)
| 224 | + |
# Collect the mode_1 values of every unlinked trip per linked trip, then keep
# the highest bike mode among them.
modes = (
    subset.groupby('linked_trip_id')['mode_1']
    .agg(list)
    .reset_index()
    .rename(columns={'mode_1': 'mode_list'})
)
modes['bike_mode'] = modes['mode_list'].apply(get_highest_bike_mode)

# Add bike type
btrips_person = btrips_person.merge(
    modes[['linked_trip_id', 'bike_mode']],
    how='left',
    on='linked_trip_id',
)

ebikes = [70, 82]  # ebike from bikeshare, ebike in household
# Any trip whose bike_mode is not an ebike code is labelled 'standard'.
is_ebike = btrips_person['bike_mode'].isin(ebikes)
btrips_person['bike_type'] = np.where(is_ebike, 'ebike', 'standard')
| 238 | + |
# %% Export
# Create the output folder if needed so to_csv does not fail when
# data/interim is absent (e.g. on a fresh checkout).
os.makedirs(interim_dir, exist_ok=True)
# Person-level table (one row per person, with bike trip counts).
person_btrips.to_csv(f'{interim_dir}/person_btrips.csv', index=False)
# Trip-level table (one row per bike trip, with person and household info).
btrips_person.to_csv(f'{interim_dir}/btrips_person.csv', index=False)
0 commit comments