from readfuncs import *
from gemmaclass import *
import numpy as np
import pandas
import datetime as dt
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
[docs]def percentile(A, pct=.5, W=1, method='interpolate'): #[.25,.50,.75]
'''Calculate weighted percentiles of A, given weights in W
Parameters
----------
A : MxN (masked) array of values to determine percentiles on
pct : list of P percentiles to be determined
W : Mx1 array of weigths
method : what to return if there is no exact percentile?
!!only interpolate implemented: weighted mean of two surrounding values
Returns
-------
P : PxN array of calculated percentiles
'''
A = pandas.DataFrame(A)
W = pandas.Series(W,name='WGHT',dtype=float)
pct = np.atleast_1d(pct)
# percentages or fractions?
if pct.max() > 1.:
pct = pct / 100.
P = np.empty((pct.shape[0],A.shape[1]))
# sort A low-high for each column
ic=0
for c in A:
C = pandas.concat((A[c],W),axis=1,keys=[c,'WGHT'])
C = C[C[c].notnull()] # exclude missing data
# still rows in C?
if C.shape[0] > 0:
C.sort(columns=[c],axis=0,inplace=True)
CW = C['WGHT'].cumsum(axis=0) / C['WGHT'].sum(axis=0)
for ip,p in enumerate(pct):
# determine values at pct items (by iterpolation)
if np.any(CW==p):
pa = C[c][CW == p].mean()
elif CW.min() >= p:
pa = C[c].min()
else:
try:
plow = CW[CW <= p].max()
phigh = CW[CW >= p].min()
clow = C[c][CW==plow].values[0]
chigh = C[c][CW==phigh].values[0]
pa = clow + (p - plow)/(phigh-plow)*(chigh-clow)
except:
print p
print CW
raise
P[ip,ic] = pa
else:
P[:,ic] = np.nan
ic+=1
return pandas.DataFrame(P, index=pct,columns=A.columns)
def _get_comb(S):
emc = []
for i in xrange(len(S)):
if S.ix[i] > 0:
emc.append(S.index[i])
return ', '.join(emc)
[docs]class sample(object):
''' Class that contains one g-emma sample run
Reads in sample header information and further acts as an iterator object along datarows. Data from samples are not read by default.
Data can be accessed as sample[column]
Parameters
----------
gemma : gemma-object
read_data : bool. Read data (default False)
Attributes
----------
gemma : gemma object
parent gemma object
endmembers : list of strings
end-member names
name : string
name of sample (sample code)
date : datetime
date/time information of sample
nbehave : int
number of behavioural samples
data : pandas.DataFrame
sample data
'''
def __init__(self, gemma=None, read_data=False):
self.gemma = gemma
self.endmembers = self.gemma.endmembers
try:
self.name = readstring(self.gemma.f)
self.date = dt.datetime.strptime(readstring(self.gemma.f),
'%Y-%m-%dT%H:%M:%S')
self.nbehave = readlong(self.gemma.f, self.gemma.x64)
self.nrows = readint(self.gemma.f)
self.data_pos = self.gemma.f.tell()
self.cur_pos = self.gemma.f.tell()
self.cur_row = 0
self.has_data = False
self.data = None
self.rowsize = 4 * (4 + (self.gemma.nendmembers + 1) * \
self.gemma.nsolutes + self.gemma.nendmembers)
if self.gemma.diag:
self.rowsize += 4 * (self.gemma.nsolutes + 1) + 1
iter(self)
if read_data:
self.data = self.read_data()
else:
self.gemma.f.seek(self.data_pos + self.nrows * self.rowsize)
except struct.error:
raise
[docs] def read_data(self, nrows=None):
''' Read data. Not done by default
Parameters
----------
nrows : int, no of rows to read from sample data. Use None (default) for all
'''
if nrows is None:
nrows = self.nrows
if self.nbehave:
self.gemma.f.seek(self.data_pos)
self.data = []
if self.gemma.diag:
ncols = (4 + (self.gemma.nendmembers+1)*self.gemma.nsolutes
+ self.gemma.nendmembers + 1 + self.gemma.nsolutes + 1)
fmt = '3i%if?'%(ncols-4)
else:
ncols = (3 + (self.gemma.nendmembers+1)*self.gemma.nsolutes
+ self.gemma.nendmembers + 1)
fmt = '3i%if'%(ncols-3)
fmt = ''.join([fmt for i in xrange(nrows)])
data = struct.unpack(fmt,self.gemma.f.read(nrows*self.rowsize))
for i in xrange(self.nrows):
self.data.append(data[i*ncols:i*ncols+ncols])
del data
self.data = pandas.DataFrame(self.data, columns=self.gemma.columns)
self.data = self.data.replace(-9999.,np.nan)
# set NA in fractions to 0
self.data[self.gemma.endmembers] = \
self.data[self.gemma.endmembers].fillna(0)
self.has_data = True
[docs] def percentile(self, pct=[.25,.50,.75], params=None, mask=None):
'''Return DataFrame with values of params @ percentiles, weighed by likelihood (LH) column
Parameters
----------
pct : list of percentiles
params : list of parameters to calculate (default: all)
mask : boolean Series or ndarray to select data
Returns
-------
p : DataFrame PxN array of calculated percentiles
'''
if not self.has_data:
self.read_data()
if params == None:
params = filter(lambda item: not (item == 'LH'), self.gemma.columns)
if isinstance(mask,(pandas.Series,np.ndarray)):
return percentile(self.data[params][mask], pct, self.data['LH'])
else:
return percentile(self.data[params], pct, self.data['LH'])
[docs] def em_contrib(self):
'''Function returns Series with frequencies for the separate endmembers
and end-member combinations'''
if not self.has_data:
self.read_data()
self.data['emcomb'] = self.data[self.endmembers].apply(_get_comb,axis=1)
emc = (self.data[self.endmembers]>0).sum()
evc = self.data['emcomb'].value_counts()
emcontrib = pandas.concat((emc, evc))
emcontrib.name = self.name
return emcontrib / float(len(self.data))
[docs] def plot_residuals(self):
'''Plot boxplot of residuals'''
if self.gemma.diag:
if self.data == None:
self.read_data()
fig = plt.figure()
ax = fig.add_subplot(111)
cols = ['Res_WB']
for s in self.gemma.solutes:
cols += ['Res_'+s]
d2 = self.data[cols]
box = ax.boxplot(d2.values)
ax.hlines(0,0,len(cols))
ax.set_xticklabels(cols)
plt.title('Residuals for Sample '+self.name,fontsize='large')
plt.show()
else:
print 'No residuals to plot, not in diagnostics mode'
[docs] def boxplot(self, params, violin=True):
''' Create boxplot or violin plot
Parameters
----------
params : list of parameters from data to draw
violin : bool, draw violin plot around boxplot if True (default)
'''
# code from http://pyinsci.blogspot.nl/2009/09/violin-plot-with-matplotlib.html
if not self.has_data: self.read_data()
assert(self.has_data)
fig = plt.figure()
ax = fig.add_subplot(111)
pos = range(len(params))
if violin:
dist = len(params)
w = 0.9*min(0.15*max(dist,1.),0.5)
for prm,p in zip(params,pos):
k = gaussian_kde(self.data[prm].values)
m = k.dataset.min()
M = k.dataset.max()
x = np.arange(m,M,(M-m)/100.)
v = k.evaluate(x)
v = v/v.max()*w
ax.fill_betweenx(x,p,v+p,facecolor='y', alpha=0.3)
ax.fill_betweenx(x,p,-v+p,facecolor='y', alpha=0.3)
ax.boxplot(self.data[params].values, notch=1, positions=pos,vert=1)
ax.set_xticklabels(params)
plt.show()
[docs] def dottyplot(self, params):
'''Function to draw dotty plot of given parameters
Parameters
----------
params : list of parameters from data to draw
'''
if not self.has_data: self.read_data()
assert(self.has_data)
fig = plt.figure()
nplots = len(params)
if nplots > 3:
nr = 2
nc = int((nplots+1) / 2)
else:
nr = 1
nc = nplots
ax = []
for i,p in enumerate(params):
if '_' in p:
e,s = p.split('_')
else:
e = p
s = 'fraction'
if i:
ax += [fig.add_subplot(nr,nc,i+1,sharey=ax[0])]
else:
ax += [fig.add_subplot(nr,nc,i+1)]
ax[-1].plot(self.data[p],self.data['LH'],'.')
if p in self.gemma.endmembers:
ax[-1].set_xlim(0,1)
ax[-1].set_title('%s (n=%i)'%(p,self.data[p].count()))
fig.suptitle('Sample: '+self.name, fontsize='x-large')
plt.show()
# overload []: pass to self.data
def __getitem__(self, key):
if not self.has_data: self.read_data()
assert(self.has_data)
return self.data[key]
def __setitem__(self, key, value):
if not self.has_data: self.read_data()
assert(self.has_data)
self.data[key] = value