-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmatr.py
More file actions
155 lines (133 loc) · 6.09 KB
/
matr.py
File metadata and controls
155 lines (133 loc) · 6.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from conf import*
import pandas as pd
####################################################################################################
####################################################################################################
####################################################################################################
####################################################################################################
def reBin(a, shape):
    """
    Downsample 2-d array *a* onto the coarser grid *shape* by block-averaging.

    Args:
        a: 2-d array whose dimensions are integer multiples of *shape*.
        shape: (nRow, nCol) of the output grid.
    Return:
        2-d array of *shape* with the (nan-aware) mean of each block.
    """
    sh = shape[0], a.shape[0]//shape[0], shape[1], a.shape[1]//shape[1]
    # Bug fix: both reductions must be nan-aware; the original used plain
    # .mean(1) for the second step, so an all-NaN inner pair re-poisoned
    # the whole block with NaN.
    return np.nanmean(np.nanmean(a.reshape(sh), axis=-1), axis=1)
####################################################################################################
def cleaner(dataList, mask=None, nameList=None, NaNCut=True, scalingPoint=None, maxPoint=None):
    """
    Throw away the NaNs in multiple datasets of the same shape, and/or output
    a pandas DataFrame if *nameList* is provided.

    Args:
        dataList: list or tuple of arrays. Input data (entries are rebound /
            modified in place when scaling or a non-NaNCut *mask* is used).
        mask: boolean array, optional. Marks invalid data to exclude.
        nameList: list of str, optional. Column names; output is a DataFrame
            when provided, otherwise a list of arrays.
        NaNCut: bool, optional. Drop entries that are NaN in any input array.
        scalingPoint: number, optional. Percentile used to rescale every array
            to a common standard (ignored when *maxPoint* is given).
        maxPoint: number, optional. Rescale each array so its nanmax equals this.
    Return:
        list (when *nameList* not provided) or DataFrame (when provided).
        NOTE(review): *nameList* together with NaNCut=False is unsupported —
        the common mask is only built when NaNCut is True.
    """
    ##########Calculate common mask and standard scale
    if NaNCut:
        # Valid where *every* input array is non-NaN.
        comMask = ~np.isnan(dataList[0])
        for i in range(1, len(dataList)):
            comMask = comMask & (~np.isnan(dataList[i]))
    if maxPoint is not None:
        for i in range(len(dataList)):
            dataList[i] = dataList[i]*maxPoint/np.nanmax(dataList[i])
    elif scalingPoint is not None:
        percenF = lambda v: np.percentile(v[~np.isnan(v)], scalingPoint)  # percentile over valid data
        percenList = [percenF(dataList[i]) for i in range(len(dataList))]
        # Bug fix: *standard* was tested before ever being assigned (NameError);
        # it is the largest scaling percentile among the inputs.
        standard = np.max(percenList)
        print('Scaling standard:', scalingPoint, 'percentile =', standard)
        for i in range(len(dataList)):
            dataList[i] = dataList[i]*standard/percenList[i]
    ##########Apply additional mask if needed
    if mask is not None:
        if NaNCut:
            comMask = comMask & (~mask)
        else:
            for i in range(len(dataList)):
                dataList[i][mask] = np.nan
    if nameList is not None:  # return pandas DataFrame
        if len(dataList) != len(nameList):
            print('Cleaner: Wrong length for variable name.')
        else:
            return pd.DataFrame({nameList[i]: dataList[i][comMask] for i in range(len(dataList))})
    elif NaNCut:  # return list of 1-d arrays
        return [dataList[i][comMask] for i in range(len(dataList))]
    else:  # return list of original-shape arrays
        return dataList
####################################################################################################
def arraySamp(array, size = 50):
    """
    Randomly pick *size* of the nonzero (True) positions of *array* and return
    a boolean mask of the same shape that is True only at those positions.

    Args:
        array: array-like. Nonzero entries form the sampling pool.
        size: int, optional. Number of positions sampled without replacement.
    Return:
        Boolean array of array.shape, or None when fewer than *size* nonzero
        entries exist.
    """
    pool = np.array(np.nonzero(array)).T  # (nPoints, ndim) coordinates
    if np.size(pool, 0) < size:
        print('Not enough sample. Sample size: ', np.size(pool, 0))
        return None
    # Bug fix: the original did `new_array[ind] = True` with ind of shape
    # (size, ndim), which fancy-indexes only the first axis and flips whole
    # rows instead of the sampled coordinates. Index each axis explicitly.
    chosen = pool[np.random.choice(np.size(pool, 0), size, replace=False)]
    new_array = np.zeros(array.shape, dtype=bool)
    new_array[tuple(chosen.T)] = True
    return new_array
####################################################################################################
def maskSamp(mask, contr, minEdge = None, maxEdge = None, inter = 100, size = 50):
    """
    Sample *mask* evenly across bins of the control variable *contr*, to avoid
    distribution bias in the data.

    Args:
        mask: boolean array. Candidate positions to sample from.
        contr: array, same shape as *mask*. Control variable used for binning.
        minEdge, maxEdge: int, optional. Binning range; default spans
            contr[mask] (maxEdge padded by ~one bin so the last value is kept).
        inter: int, optional. Bin width.
        size: int, optional. Samples drawn per bin (via arraySamp).
    Return:
        Boolean array of mask.shape, or None when any bin lacks *size* samples
        (the offending bin edges are printed).
    """
    new_mask = np.zeros(mask.shape, dtype=bool)
    if minEdge is None:
        minEdge = contr[mask].min().astype(int)
    if maxEdge is None:
        maxEdge = (contr[mask].max() + 0.999*inter).astype(int)
    for edge in range(minEdge, maxEdge, inter):
        # Positions inside (edge, edge+inter] that are also in *mask*.
        temMa = arraySamp((contr <= edge+inter) & (contr > edge) & mask, size = size)
        if temMa is None:
            print(str(edge) + ' to ' + str(edge+inter))
            return None
        new_mask = new_mask | temMa
    return new_mask
####################################################################################################
def quaReg(x, y):
    """
    Quantile regression of *y* on *x* using R's 'quantreg' package via rpy2.

    NOTE(review): this function appears unfinished — the Formula is the empty
    string, nothing is fitted and nothing is returned; confirm before use.

    Args:
        x, y: sequences of floats passed into the R session as xr / yr.
    """
    import rpy2.robjects as robjects
    rObj = robjects.r
    qreg = robjects.packages.importr(name = 'quantreg')  # load R 'quantreg'
    rObj.assign('xr', robjects.FloatVector(x))  # push x into the R session
    rObj.assign('yr', robjects.FloatVector(y))  # push y into the R session
    rObj("dt <- data.frame(xr,yr)")  # assemble the R data frame
    fmla = robjects.Formula('')  # placeholder formula; never used below
####################################################################################################
def binPer(x, y , nbin = None, percen = 90, med = True):
    """
    Bin *y* by the dense rank of *x* and return a per-bin percentile as an
    "upper boundary" curve.

    Args:
        x, y: arrays of equal size (flattened internally).
        nbin: int, optional. Number of bins; defaults to max(nDistinct//100, 5).
        percen: number, optional. Percentile of y taken within each bin.
        med: bool, optional. Also return the per-bin median when True.
    Return:
        (xNew, yNew) or (xNew, yNew, yMedian): per-bin mean of x and the
        requested percentile(s) of y.
    """
    print('Calculating upper boundary. Percentage:', percen)
    from accumarray import accum
    from scipy import stats
    x = x.flatten()
    y = y.flatten()
    xOrd = stats.rankdata(x, method='dense')  # dense rank: 1..nDistinct
    if nbin is None:
        nbin = np.max((xOrd.max()//100, 5))
        print('Automatic bin number:', nbin)
    else:
        print('User assigned bin number:', nbin)
    binWid = xOrd.max()*1.0/nbin
    # Bug fix: np.int was removed in NumPy 1.24; the builtin int is equivalent.
    mapper = (xOrd//binWid).astype(int)
    # v[:] copies the group's values before np.percentile sorts internally.
    yNew = accum(mapper, y, func=lambda v: np.percentile(v[:], percen))
    xNew = accum(mapper, x, func='mean')
    if med:
        return xNew, yNew, accum(mapper, y, func=lambda v: np.percentile(v[:], 50))
    else:
        return xNew, yNew
####################################################################################################
def autoBin(data, nBin, minWid=1e-3):
    """
    Find boundaries that divide data into bins with equal sample size, while
    each bin width >= minWid.

    Args:
        data: Data to calculate boundaries from (NaNs are ignored).
        nBin: Number of desired bins.
        minWid: Minimum allowed bin width.
    Return:
        Array of boundaries (normally nBin+1 values).
    """
    nanMask = ~np.isnan(data)
    nData = nanMask.flatten().sum()
    dataSort = np.sort(data[nanMask])
    # Guard: a zero stride (nData < nBin) would raise ValueError on the slice.
    widBin = max(nData//nBin, 1)
    bound = dataSort[0::widBin] #normal situation
    if len(bound)<nBin+1: #rounding error
        bound = np.append(bound, dataSort[-1])
    else:
        bound[-1] = dataSort[-1]
    #####Restriction on bin width
    for i in range(nBin-1):
        if bound[i+1]-bound[i]<minWid:
            bound[i+1]=bound[i]+minWid
            # Bug fix: propagate the caller's minWid into the recursion; the
            # original omitted it and silently reverted to the default 1e-3.
            bound[i+1:]=autoBin(data[data>=bound[i+1]], nBin-1-i, minWid)
    return bound