GEMMA python module home

Source code for gemma.sampleclass

from readfuncs import *
from gemmaclass import *
import numpy as np
import pandas
import datetime as dt
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

[docs]def percentile(A, pct=.5, W=1, method='interpolate'): #[.25,.50,.75] '''Calculate weighted percentiles of A, given weights in W Parameters ---------- A : MxN (masked) array of values to determine percentiles on pct : list of P percentiles to be determined W : Mx1 array of weigths method : what to return if there is no exact percentile? !!only interpolate implemented: weighted mean of two surrounding values Returns ------- P : PxN array of calculated percentiles ''' A = pandas.DataFrame(A) W = pandas.Series(W,name='WGHT',dtype=float) pct = np.atleast_1d(pct) # percentages or fractions? if pct.max() > 1.: pct = pct / 100. P = np.empty((pct.shape[0],A.shape[1])) # sort A low-high for each column ic=0 for c in A: C = pandas.concat((A[c],W),axis=1,keys=[c,'WGHT']) C = C[C[c].notnull()] # exclude missing data # still rows in C? if C.shape[0] > 0: C.sort(columns=[c],axis=0,inplace=True) CW = C['WGHT'].cumsum(axis=0) / C['WGHT'].sum(axis=0) for ip,p in enumerate(pct): # determine values at pct items (by iterpolation) if np.any(CW==p): pa = C[c][CW == p].mean() elif CW.min() >= p: pa = C[c].min() else: try: plow = CW[CW <= p].max() phigh = CW[CW >= p].min() clow = C[c][CW==plow].values[0] chigh = C[c][CW==phigh].values[0] pa = clow + (p - plow)/(phigh-plow)*(chigh-clow) except: print p print CW raise P[ip,ic] = pa else: P[:,ic] = np.nan ic+=1 return pandas.DataFrame(P, index=pct,columns=A.columns)
def _get_comb(S): emc = [] for i in xrange(len(S)): if S.ix[i] > 0: emc.append(S.index[i]) return ', '.join(emc)
[docs]class sample(object): ''' Class that contains one g-emma sample run Reads in sample header information and further acts as an iterator object along datarows. Data from samples are not read by default. Data can be accessed as sample[column] Parameters ---------- gemma : gemma-object read_data : bool. Read data (default False) Attributes ---------- gemma : gemma object parent gemma object endmembers : list of strings end-member names name : string name of sample (sample code) date : datetime date/time information of sample nbehave : int number of behavioural samples data : pandas.DataFrame sample data ''' def __init__(self, gemma=None, read_data=False): self.gemma = gemma self.endmembers = self.gemma.endmembers try: self.name = readstring(self.gemma.f) self.date = dt.datetime.strptime(readstring(self.gemma.f), '%Y-%m-%dT%H:%M:%S') self.nbehave = readlong(self.gemma.f, self.gemma.x64) self.nrows = readint(self.gemma.f) self.data_pos = self.gemma.f.tell() self.cur_pos = self.gemma.f.tell() self.cur_row = 0 self.has_data = False self.data = None self.rowsize = 4 * (4 + (self.gemma.nendmembers + 1) * \ self.gemma.nsolutes + self.gemma.nendmembers) if self.gemma.diag: self.rowsize += 4 * (self.gemma.nsolutes + 1) + 1 iter(self) if read_data: self.data = self.read_data() else: self.gemma.f.seek(self.data_pos + self.nrows * self.rowsize) except struct.error: raise
[docs] def read_data(self, nrows=None): ''' Read data. Not done by default Parameters ---------- nrows : int, no of rows to read from sample data. Use None (default) for all ''' if nrows is None: nrows = self.nrows if self.nbehave: self.gemma.f.seek(self.data_pos) self.data = [] if self.gemma.diag: ncols = (4 + (self.gemma.nendmembers+1)*self.gemma.nsolutes + self.gemma.nendmembers + 1 + self.gemma.nsolutes + 1) fmt = '3i%if?'%(ncols-4) else: ncols = (3 + (self.gemma.nendmembers+1)*self.gemma.nsolutes + self.gemma.nendmembers + 1) fmt = '3i%if'%(ncols-3) fmt = ''.join([fmt for i in xrange(nrows)]) data = struct.unpack(fmt,self.gemma.f.read(nrows*self.rowsize)) for i in xrange(self.nrows): self.data.append(data[i*ncols:i*ncols+ncols]) del data self.data = pandas.DataFrame(self.data, columns=self.gemma.columns) self.data = self.data.replace(-9999.,np.nan) # set NA in fractions to 0 self.data[self.gemma.endmembers] = \ self.data[self.gemma.endmembers].fillna(0) self.has_data = True
[docs] def percentile(self, pct=[.25,.50,.75], params=None, mask=None): '''Return DataFrame with values of params @ percentiles, weighed by likelihood (LH) column Parameters ---------- pct : list of percentiles params : list of parameters to calculate (default: all) mask : boolean Series or ndarray to select data Returns ------- p : DataFrame PxN array of calculated percentiles ''' if not self.has_data: self.read_data() if params == None: params = filter(lambda item: not (item == 'LH'), self.gemma.columns) if isinstance(mask,(pandas.Series,np.ndarray)): return percentile(self.data[params][mask], pct, self.data['LH']) else: return percentile(self.data[params], pct, self.data['LH'])
[docs] def em_contrib(self): '''Function returns Series with frequencies for the separate endmembers and end-member combinations''' if not self.has_data: self.read_data() self.data['emcomb'] = self.data[self.endmembers].apply(_get_comb,axis=1) emc = (self.data[self.endmembers]>0).sum() evc = self.data['emcomb'].value_counts() emcontrib = pandas.concat((emc, evc)) emcontrib.name = self.name return emcontrib / float(len(self.data))
[docs] def plot_residuals(self): '''Plot boxplot of residuals''' if self.gemma.diag: if self.data == None: self.read_data() fig = plt.figure() ax = fig.add_subplot(111) cols = ['Res_WB'] for s in self.gemma.solutes: cols += ['Res_'+s] d2 = self.data[cols] box = ax.boxplot(d2.values) ax.hlines(0,0,len(cols)) ax.set_xticklabels(cols) plt.title('Residuals for Sample '+self.name,fontsize='large') plt.show() else: print 'No residuals to plot, not in diagnostics mode'
[docs] def boxplot(self, params, violin=True): ''' Create boxplot or violin plot Parameters ---------- params : list of parameters from data to draw violin : bool, draw violin plot around boxplot if True (default) ''' # code from http://pyinsci.blogspot.nl/2009/09/violin-plot-with-matplotlib.html if not self.has_data: self.read_data() assert(self.has_data) fig = plt.figure() ax = fig.add_subplot(111) pos = range(len(params)) if violin: dist = len(params) w = 0.9*min(0.15*max(dist,1.),0.5) for prm,p in zip(params,pos): k = gaussian_kde(self.data[prm].values) m = k.dataset.min() M = k.dataset.max() x = np.arange(m,M,(M-m)/100.) v = k.evaluate(x) v = v/v.max()*w ax.fill_betweenx(x,p,v+p,facecolor='y', alpha=0.3) ax.fill_betweenx(x,p,-v+p,facecolor='y', alpha=0.3) ax.boxplot(self.data[params].values, notch=1, positions=pos,vert=1) ax.set_xticklabels(params) plt.show()
[docs] def dottyplot(self, params): '''Function to draw dotty plot of given parameters Parameters ---------- params : list of parameters from data to draw ''' if not self.has_data: self.read_data() assert(self.has_data) fig = plt.figure() nplots = len(params) if nplots > 3: nr = 2 nc = int((nplots+1) / 2) else: nr = 1 nc = nplots ax = [] for i,p in enumerate(params): if '_' in p: e,s = p.split('_') else: e = p s = 'fraction' if i: ax += [fig.add_subplot(nr,nc,i+1,sharey=ax[0])] else: ax += [fig.add_subplot(nr,nc,i+1)] ax[-1].plot(self.data[p],self.data['LH'],'.') if p in self.gemma.endmembers: ax[-1].set_xlim(0,1) ax[-1].set_title('%s (n=%i)'%(p,self.data[p].count())) fig.suptitle('Sample: '+self.name, fontsize='x-large') plt.show() # overload []: pass to self.data
def __getitem__(self, key): if not self.has_data: self.read_data() assert(self.has_data) return self.data[key] def __setitem__(self, key, value): if not self.has_data: self.read_data() assert(self.has_data) self.data[key] = value