Source code for gemma.sampleclass

from readfuncs import *
from gemmaclass import *
import numpy as np
import pandas
import datetime as dt
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

[docs]def percentile(A, pct=.5, W=1, method='interpolate'): #[.25,.50,.75]
    '''Calculate weighted percentiles of A, given weights in W

    Parameters
    ----------
    A : MxN (masked) array of values to determine percentiles on
    pct : list of P percentiles to be determined
    W : Mx1 array of weigths
    method : what to return if there is no exact percentile?
        !!only interpolate implemented: weighted mean of two surrounding values

    Returns
    -------
    P : PxN array of calculated percentiles
    '''
    A = pandas.DataFrame(A)
    W = pandas.Series(W,name='WGHT',dtype=float)
    pct = np.atleast_1d(pct)
    # percentages or fractions?
    if pct.max() > 1.:
        pct = pct / 100.
    P = np.empty((pct.shape[0],A.shape[1]))
    
    # sort A low-high for each column
    ic=0
    for c in A:
        C = pandas.concat((A[c],W),axis=1,keys=[c,'WGHT'])
        C = C[C[c].notnull()] # exclude missing data
        # still rows in C?
        if C.shape[0] > 0:
            C.sort(columns=[c],axis=0,inplace=True)
            CW = C['WGHT'].cumsum(axis=0) / C['WGHT'].sum(axis=0)
            for ip,p in enumerate(pct):
                # determine values at pct items (by iterpolation)
                if np.any(CW==p):
                    pa = C[c][CW == p].mean()
                elif CW.min() >= p:
                    pa = C[c].min()
                else:
                    try:
                        plow = CW[CW <= p].max()
                        phigh = CW[CW >= p].min()
                        clow = C[c][CW==plow].values[0]
                        chigh = C[c][CW==phigh].values[0]
                        pa = clow + (p - plow)/(phigh-plow)*(chigh-clow)
                    except:
                        print p
                        print CW
                        raise
                P[ip,ic] = pa
        else:
            P[:,ic] = np.nan
        ic+=1
    return pandas.DataFrame(P, index=pct,columns=A.columns)
    
def _get_comb(S):
    emc = []
    for i in xrange(len(S)):
        if S.ix[i] > 0:
            emc.append(S.index[i])
    return ', '.join(emc)
        
[docs]class sample(object):
    ''' Class that contains one g-emma sample run

        Reads in sample header information and further acts as an iterator object along datarows. Data from samples are not read by default.
        
        Data can be accessed as sample[column]

        Parameters
        ----------
        gemma : gemma-object
        read_data : bool. Read data (default False)

        Attributes
        ----------
        gemma : gemma object
            parent gemma object
        endmembers : list of strings
            end-member names
        name : string
            name of sample (sample code)
        date : datetime
            date/time information of sample
        nbehave : int
            number of behavioural samples
        data : pandas.DataFrame
            sample data
        
        '''
    def __init__(self, gemma=None, read_data=False):
        self.gemma = gemma
        self.endmembers = self.gemma.endmembers
        try:
            self.name = readstring(self.gemma.f)
            self.date = dt.datetime.strptime(readstring(self.gemma.f),
                                                      '%Y-%m-%dT%H:%M:%S')
            self.nbehave = readlong(self.gemma.f, self.gemma.x64)
            self.nrows = readint(self.gemma.f)
            self.data_pos = self.gemma.f.tell()
            self.cur_pos = self.gemma.f.tell()
            self.cur_row = 0
            self.has_data = False
            self.data = None
            self.rowsize = 4 * (4 + (self.gemma.nendmembers + 1) * \
                           self.gemma.nsolutes + self.gemma.nendmembers)
            if self.gemma.diag:
                self.rowsize += 4 * (self.gemma.nsolutes + 1) + 1
            iter(self)
            if read_data:
                self.data = self.read_data()
            else:
                self.gemma.f.seek(self.data_pos + self.nrows * self.rowsize)
        except struct.error:
            raise
        
[docs]    def read_data(self, nrows=None):
        ''' Read data. Not done by default

        Parameters
        ----------
        nrows : int, no of rows to read from sample data. Use None (default) for all
        '''
        if nrows is None:
            nrows = self.nrows

        if self.nbehave:
            self.gemma.f.seek(self.data_pos)
            self.data = []
            if self.gemma.diag:
                ncols = (4 + (self.gemma.nendmembers+1)*self.gemma.nsolutes
                         + self.gemma.nendmembers + 1 + self.gemma.nsolutes + 1)
                fmt = '3i%if?'%(ncols-4)
            else:
                ncols = (3 + (self.gemma.nendmembers+1)*self.gemma.nsolutes
                         + self.gemma.nendmembers + 1)
                fmt = '3i%if'%(ncols-3)
                

            fmt = ''.join([fmt for i in xrange(nrows)])
            data = struct.unpack(fmt,self.gemma.f.read(nrows*self.rowsize))
            for i in xrange(self.nrows):
                self.data.append(data[i*ncols:i*ncols+ncols])
            del data
            self.data = pandas.DataFrame(self.data, columns=self.gemma.columns)
            self.data = self.data.replace(-9999.,np.nan)
            # set NA in fractions to 0
            self.data[self.gemma.endmembers] = \
                                self.data[self.gemma.endmembers].fillna(0)
            self.has_data = True

[docs]    def percentile(self, pct=[.25,.50,.75], params=None, mask=None):
        '''Return DataFrame with values of params @ percentiles, weighed by likelihood (LH) column
            
        Parameters
        ----------
        pct : list of percentiles
        params : list of parameters to calculate (default: all)
        mask : boolean Series or ndarray to select data

        Returns
        -------
        p : DataFrame PxN array of calculated percentiles
        '''
        if not self.has_data:
            self.read_data()
        if params == None:
            params = filter(lambda item: not (item == 'LH'), self.gemma.columns)
        if isinstance(mask,(pandas.Series,np.ndarray)):
            return percentile(self.data[params][mask], pct, self.data['LH'])
        else:
            return percentile(self.data[params], pct, self.data['LH'])

[docs]    def em_contrib(self):
        '''Function returns Series with frequencies for the separate endmembers
            and end-member combinations'''
        if not self.has_data:
            self.read_data()
        self.data['emcomb'] = self.data[self.endmembers].apply(_get_comb,axis=1)
        emc = (self.data[self.endmembers]>0).sum()
        evc = self.data['emcomb'].value_counts()
        emcontrib = pandas.concat((emc, evc))
        emcontrib.name = self.name
        return emcontrib / float(len(self.data))

[docs]    def plot_residuals(self):
        '''Plot boxplot of residuals'''
        if self.gemma.diag:
            if self.data == None:
                self.read_data()

            fig = plt.figure()
            ax = fig.add_subplot(111)
            cols = ['Res_WB']
            for s in self.gemma.solutes:
                cols += ['Res_'+s]
            d2 = self.data[cols]
            box = ax.boxplot(d2.values)
            ax.hlines(0,0,len(cols))
            ax.set_xticklabels(cols)
            plt.title('Residuals for Sample '+self.name,fontsize='large')
            plt.show()
        else:
            print 'No residuals to plot, not in diagnostics mode'

[docs]    def boxplot(self, params, violin=True):
        ''' Create boxplot or violin plot

        Parameters
        ----------
        params : list of parameters from data to draw
        violin : bool, draw violin plot around boxplot if True (default)
        '''
        # code from http://pyinsci.blogspot.nl/2009/09/violin-plot-with-matplotlib.html
        if not self.has_data: self.read_data()
        assert(self.has_data)
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        pos = range(len(params))
        if violin:
            dist = len(params)
            w = 0.9*min(0.15*max(dist,1.),0.5)
            for prm,p in zip(params,pos):
                k = gaussian_kde(self.data[prm].values)
                m = k.dataset.min()
                M = k.dataset.max()
                x = np.arange(m,M,(M-m)/100.)
                v = k.evaluate(x)
                v = v/v.max()*w
                ax.fill_betweenx(x,p,v+p,facecolor='y', alpha=0.3)
                ax.fill_betweenx(x,p,-v+p,facecolor='y', alpha=0.3)
        ax.boxplot(self.data[params].values, notch=1, positions=pos,vert=1)
        ax.set_xticklabels(params)
        plt.show()

[docs]    def dottyplot(self, params):
        '''Function to draw dotty plot of given parameters

        Parameters
        ----------
        params : list of parameters from data to draw
        '''
        if not self.has_data: self.read_data()
        assert(self.has_data)
        
        fig = plt.figure()
        nplots = len(params)            
        if nplots > 3:
            nr = 2
            nc = int((nplots+1) / 2)
        else:
            nr = 1
            nc = nplots
        ax = []
        for i,p in enumerate(params):
            if '_' in p:
                e,s = p.split('_')
            else:
                e = p
                s = 'fraction'
                
            if i:
                ax += [fig.add_subplot(nr,nc,i+1,sharey=ax[0])]
            else:
                ax += [fig.add_subplot(nr,nc,i+1)]
            ax[-1].plot(self.data[p],self.data['LH'],'.')
            if p in self.gemma.endmembers:
                ax[-1].set_xlim(0,1)
            ax[-1].set_title('%s (n=%i)'%(p,self.data[p].count()))
        fig.suptitle('Sample: '+self.name, fontsize='x-large')
        plt.show()

    # overload []: pass to self.data
    def __getitem__(self, key):
        if not self.has_data: self.read_data()
        assert(self.has_data)
        return self.data[key]
    def __setitem__(self, key, value):
        if not self.has_data: self.read_data()
        assert(self.has_data)
        self.data[key] = value
Navigation

Quick search

Source code for gemma.sampleclass

Navigation