Water_Quality_Analysis/USGS_functions.py at master · sdehm/Water_Quality_Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 18 16:36:55 2017

@author: shaffer
"""
import pytz
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

def drop_constant_cols(df_dropping,n=1):
    ''' This function takes in a dataframe name and outputs a copy of the
        dataframe with columns removed that only have 'n' unique values.
        The function prints the name of each column deleted along with
        the unique value.
    '''
    for name in df_dropping.columns:
        if len(df_dropping[name].unique())<=n:
            print(name+ ' deleted...')
            print(df_dropping[name].unique())
            df_dropping = df_dropping.drop(name, axis=1)
    print('Finished----------------------------------------------------------')
    return df_dropping

def fix_timezones(df,dateTime,timezone):
    ''' This function takes a pandas dataframe, column name, and timezone column
        as an input. The column name must be in a datetime format. The funcion
        outputs a copy of the dataframe with the specified column converted to
        UTC.
        The timezone column currently must be in the format used by the USGS for
        water quality data, ```EST, EDT, CST, CDT```. Other formats may need to
        be added when data from other state is used.
    '''
    est = pytz.timezone('Etc/GMT+5')
    edt = pytz.timezone('Etc/GMT+4')
    cst = pytz.timezone('Etc/GMT+6')
    cdt = pytz.timezone('Etc/GMT+5')

    df.loc[df[timezone]=='EST',[dateTime]] = \
          df.loc[df[timezone]=='EST',['dateTime']].apply(
                  lambda x: x.dt.tz_localize(est).dt.tz_convert('UTC'))

    df.loc[df[timezone]=='EDT',[dateTime]] = \
          df.loc[df[timezone]=='EDT',['dateTime']].apply(
                  lambda x: x.dt.tz_localize(edt).dt.tz_convert('UTC'))

    df.loc[df[timezone]=='CST',[dateTime]] = \
          df.loc[df[timezone]=='CST',['dateTime']].apply(
                  lambda x: x.dt.tz_localize(cst).dt.tz_convert('UTC'))

    df.loc[df[timezone]=='CDT',[dateTime]] = \
          df.loc[df[timezone]=='CDT',['dateTime']].apply(
                  lambda x: x.dt.tz_localize(cdt).dt.tz_convert('UTC'))

    df = df.drop(timezone,axis=1)
    print('Timezone column: '+timezone+' deleted...\n')
    return df

def merge_scale_delete(df,columns,scale):
    '''
        This function takes a dataframe, column names, and scaling list as an
        input. The column names are names of columns to be combined and scaled
        by the specified scaling factor. The first column name in the list
        will be used to store the merged columns. The scaling array must be
        the same length as the columns list as the values for scale relate to
        corresponding column names. The function outputs a copy of the input
        dataframe with the columns merged and extra columns deleted.
        Deleted column names are printed.
    '''
    if len(columns) != len(scale):
        print('List of scales must be the same length as the list of columns')
        # command to exit funciton and give error
    if scale[0] == 1:
        for i in range(1,len(columns)):
            df[columns[0]] = df[columns[0]].combine_first(scale[i]*df[columns[i]])
            print(columns[i]+ ' deleted...')
            df.drop(columns[i], axis=1,inplace=True)

        return df
    else:
        print('List the column to keep, scale=1, first for this function.')
        return

def outlier_std(column,stds=3,loops=1,plot=False):
    ''' This function trims an input column by a specified number of
        standard deviations from the mean default of three. The function
        will loop a specified number of times, default of only one.
        This serves to remove outliers from the data however a more robust
        and general algorithm will eventually replace it.

        The number of values deleted is printed along with the mean before
        and after outlier reduction.

        The plot flag can be used to generate distribution plots for the
        data before and after outlier reduction.
    '''
    column_int = column.copy()
    for i in range(0,loops):
        std_val = column_int.std(axis = 0)
        mean_val = column_int.mean()

        column_modified = column_int.copy()
        column_modified[column < mean_val - stds*std_val] = np.nan
        column_modified[column > mean_val+stds*std_val]   = np.nan
        column_int = column_modified.copy()

    points = column.count() - column_modified.count()

    print("Before Mean=%f ----- After Mean=%f" \
                             % (column.mean(),column_modified.mean()))
    print("%d points deleted out of %d total.----------------------" \
                                              % (points, len(column)))
    # if the plot flag is true then print a dist plot before and after
    if(plot):
        plt.figure()
        sns.distplot(column.dropna())
        plt.figure()
        sns.distplot(column_modified.dropna())

    column = column_modified

    return column