-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmatr.py
More file actions
155 lines (133 loc) · 6.09 KB
/
matr.py
File metadata and controls
155 lines (133 loc) · 6.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from conf import*
import pandas as pd
####################################################################################################
####################################################################################################
####################################################################################################
####################################################################################################
def reBin(a, shape):
    """
    Downsample 2-d array *a* onto the coarser grid *shape* by block-averaging.

    Args:
        a: 2-d array whose dimensions are integer multiples of *shape*.
        shape: (nRow, nCol) of the output grid.
    Return:
        2-d array of *shape* with the (nan-aware) mean of each block.
    """
    sh = shape[0], a.shape[0]//shape[0], shape[1], a.shape[1]//shape[1]
    # Bug fix: both reductions must be nan-aware; the original used plain
    # .mean(1) for the second step, so an all-NaN inner pair re-poisoned
    # the whole block with NaN.
    return np.nanmean(np.nanmean(a.reshape(sh), axis=-1), axis=1)
####################################################################################################
def cleaner(dataList, mask=None, nameList=None, NaNCut=True, scalingPoint=None, maxPoint=None):
    """
    Throw away the NaNs in multiple datasets of the same shape, and/or output
    a pandas DataFrame if *nameList* is provided.

    Args:
        dataList: list or tuple of arrays. Input data (entries are rebound /
            modified in place when scaling or a non-NaNCut *mask* is used).
        mask: boolean array, optional. Marks invalid data to exclude.
        nameList: list of str, optional. Column names; output is a DataFrame
            when provided, otherwise a list of arrays.
        NaNCut: bool, optional. Drop entries that are NaN in any input array.
        scalingPoint: number, optional. Percentile used to rescale every array
            to a common standard (ignored when *maxPoint* is given).
        maxPoint: number, optional. Rescale each array so its nanmax equals this.
    Return:
        list (when *nameList* not provided) or DataFrame (when provided).
        NOTE(review): *nameList* together with NaNCut=False is unsupported —
        the common mask is only built when NaNCut is True.
    """
    ##########Calculate common mask and standard scale
    if NaNCut:
        # Valid where *every* input array is non-NaN.
        comMask = ~np.isnan(dataList[0])
        for i in range(1, len(dataList)):
            comMask = comMask & (~np.isnan(dataList[i]))
    if maxPoint is not None:
        for i in range(len(dataList)):
            dataList[i] = dataList[i]*maxPoint/np.nanmax(dataList[i])
    elif scalingPoint is not None:
        percenF = lambda v: np.percentile(v[~np.isnan(v)], scalingPoint)  # percentile over valid data
        percenList = [percenF(dataList[i]) for i in range(len(dataList))]
        # Bug fix: *standard* was tested before ever being assigned (NameError);
        # it is the largest scaling percentile among the inputs.
        standard = np.max(percenList)
        print('Scaling standard:', scalingPoint, 'percentile =', standard)
        for i in range(len(dataList)):
            dataList[i] = dataList[i]*standard/percenList[i]
    ##########Apply additional mask if needed
    if mask is not None:
        if NaNCut:
            comMask = comMask & (~mask)
        else:
            for i in range(len(dataList)):
                dataList[i][mask] = np.nan
    if nameList is not None:  # return pandas DataFrame
        if len(dataList) != len(nameList):
            print('Cleaner: Wrong length for variable name.')
        else:
            return pd.DataFrame({nameList[i]: dataList[i][comMask] for i in range(len(dataList))})
    elif NaNCut:  # return list of 1-d arrays
        return [dataList[i][comMask] for i in range(len(dataList))]
    else:  # return list of original-shape arrays
        return dataList
####################################################################################################
def arraySamp(array, size = 50):
    """
    Randomly pick *size* of the nonzero (True) positions of *array* and return
    a boolean mask of the same shape that is True only at those positions.

    Args:
        array: array-like. Nonzero entries form the sampling pool.
        size: int, optional. Number of positions sampled without replacement.
    Return:
        Boolean array of array.shape, or None when fewer than *size* nonzero
        entries exist.
    """
    pool = np.array(np.nonzero(array)).T  # (nPoints, ndim) coordinates
    if np.size(pool, 0) < size:
        print('Not enough sample. Sample size: ', np.size(pool, 0))
        return None
    # Bug fix: the original did `new_array[ind] = True` with ind of shape
    # (size, ndim), which fancy-indexes only the first axis and flips whole
    # rows instead of the sampled coordinates. Index each axis explicitly.
    chosen = pool[np.random.choice(np.size(pool, 0), size, replace=False)]
    new_array = np.zeros(array.shape, dtype=bool)
    new_array[tuple(chosen.T)] = True
    return new_array
####################################################################################################
def maskSamp(mask, contr, minEdge = None, maxEdge = None, inter = 100, size = 50):
    """
    Sample *mask* evenly across bins of the control variable *contr*, to avoid
    distribution bias in the data.

    Args:
        mask: boolean array. Candidate positions to sample from.
        contr: array, same shape as *mask*. Control variable used for binning.
        minEdge, maxEdge: int, optional. Binning range; default spans
            contr[mask] (maxEdge padded by ~one bin so the last value is kept).
        inter: int, optional. Bin width.
        size: int, optional. Samples drawn per bin (via arraySamp).
    Return:
        Boolean array of mask.shape, or None when any bin lacks *size* samples
        (the offending bin edges are printed).
    """
    new_mask = np.zeros(mask.shape, dtype=bool)
    if minEdge is None:
        minEdge = contr[mask].min().astype(int)
    if maxEdge is None:
        maxEdge = (contr[mask].max() + 0.999*inter).astype(int)
    for edge in range(minEdge, maxEdge, inter):
        # Positions inside (edge, edge+inter] that are also in *mask*.
        temMa = arraySamp((contr <= edge+inter) & (contr > edge) & mask, size = size)
        if temMa is None:
            print(str(edge) + ' to ' + str(edge+inter))
            return None
        new_mask = new_mask | temMa
    return new_mask
####################################################################################################
def quaReg(x, y):
    """
    Quantile regression of *y* on *x* using R's 'quantreg' package via rpy2.

    NOTE(review): this function appears unfinished — the Formula is the empty
    string, nothing is fitted and nothing is returned; confirm before use.

    Args:
        x, y: sequences of floats passed into the R session as xr / yr.
    """
    import rpy2.robjects as robjects
    rObj = robjects.r
    qreg = robjects.packages.importr(name = 'quantreg')  # load R 'quantreg'
    rObj.assign('xr', robjects.FloatVector(x))  # push x into the R session
    rObj.assign('yr', robjects.FloatVector(y))  # push y into the R session
    rObj("dt <- data.frame(xr,yr)")  # assemble the R data frame
    fmla = robjects.Formula('')  # placeholder formula; never used below
####################################################################################################
def binPer(x, y , nbin = None, percen = 90, med = True):
    """
    Bin *y* by the dense rank of *x* and return a per-bin percentile as an
    "upper boundary" curve.

    Args:
        x, y: arrays of equal size (flattened internally).
        nbin: int, optional. Number of bins; defaults to max(nDistinct//100, 5).
        percen: number, optional. Percentile of y taken within each bin.
        med: bool, optional. Also return the per-bin median when True.
    Return:
        (xNew, yNew) or (xNew, yNew, yMedian): per-bin mean of x and the
        requested percentile(s) of y.
    """
    print('Calculating upper boundary. Percentage:', percen)
    from accumarray import accum
    from scipy import stats
    x = x.flatten()
    y = y.flatten()
    xOrd = stats.rankdata(x, method='dense')  # dense rank: 1..nDistinct
    if nbin is None:
        nbin = np.max((xOrd.max()//100, 5))
        print('Automatic bin number:', nbin)
    else:
        print('User assigned bin number:', nbin)
    binWid = xOrd.max()*1.0/nbin
    # Bug fix: np.int was removed in NumPy 1.24; the builtin int is equivalent.
    mapper = (xOrd//binWid).astype(int)
    # v[:] copies the group's values before np.percentile sorts internally.
    yNew = accum(mapper, y, func=lambda v: np.percentile(v[:], percen))
    xNew = accum(mapper, x, func='mean')
    if med:
        return xNew, yNew, accum(mapper, y, func=lambda v: np.percentile(v[:], 50))
    else:
        return xNew, yNew
####################################################################################################
def autoBin(data, nBin, minWid=1e-3):
    """
    Find boundaries that divide data into bins with equal sample size, while
    each bin width >= minWid.

    Args:
        data: Data to calculate boundaries from (NaNs are ignored).
        nBin: Number of desired bins.
        minWid: Minimum allowed bin width.
    Return:
        Array of boundaries (normally nBin+1 values).
    """
    nanMask = ~np.isnan(data)
    nData = nanMask.flatten().sum()
    dataSort = np.sort(data[nanMask])
    # Guard: a zero stride (nData < nBin) would raise ValueError on the slice.
    widBin = max(nData//nBin, 1)
    bound = dataSort[0::widBin] #normal situation
    if len(bound)<nBin+1: #rounding error
        bound = np.append(bound, dataSort[-1])
    else:
        bound[-1] = dataSort[-1]
    #####Restriction on bin width
    for i in range(nBin-1):
        if bound[i+1]-bound[i]<minWid:
            bound[i+1]=bound[i]+minWid
            # Bug fix: propagate the caller's minWid into the recursion; the
            # original omitted it and silently reverted to the default 1e-3.
            bound[i+1:]=autoBin(data[data>=bound[i+1]], nBin-1-i, minWid)
    return bound