Source code for ecolopy_dev.utils

#!/usr/bin/python
"""
13 Jul 2011

some utilities for abundance, essentially for computation of K(D,A)
"""

__author__  = "Francois-Jose Serra"
__email__   = "francois@barrabin.org"
__licence__ = "GPLv3"
__version__ = "0.12"

from warnings import warn

try:
    from gmpy2            import log, lngamma, exp, gamma, mpfr, mul, div
except ImportError:
    warn("WARNING: GMPY2 library not found, using numpy")
    from numpy            import log, exp, float128 as mpfr
    from scipy.special    import gamma, gammaln as lngamma
    from operator         import mul, div

from sys import stdout
try:
    from matplotlib import pyplot
    import numpy as np
except ImportError:
    print 'matplotlib or numpy not found'

global STIRLINGS
STIRLINGS = {}


[docs]def generate_random_neutral_abundance (model_name, size, **kwargs): ''' :argument model_name: model name (ewens, etienne, lognorm) :argument size: size of the community (J), if log-normal distribution size should be equal to number of species (S) :returns: random neutral distribution of abundance other args should be of kind theta, I, m **Example:** :: import ecolopy_dev ecolopy.generate_random_neutral_abundance('ewens', 100, theta=12, I=12) ''' # import inside because otherwise strange never-ending import... from ecolopy_dev.models import EcologicalModel model = EcologicalModel (model_name, **kwargs) return model.rand_neutral(size)
def lpoch (z, m): ''' returns log Pochhammer symbol taking advantage of: Pochhammer symbol (z)_m = (z)(z+1)....(z+m-1) = gamma(z+m)/gamma(z) ''' return lngamma(z+m) - lngamma(z)
[docs]def shannon_entropy(abund, inds): ''' computes Shannon entropy (H) for a given abundance table and number of individuals. :argument abund: distribution of abundances as list :returns: Shannon entropy ''' return (sum ([-spe * log(spe) for spe in abund]) + inds * log (inds)) / inds
def table_mpfr (out, spp=None): ''' data to contingency table any kind of data :argument out: species abundance :returns: list of counts of individuals by species ''' setout = set (out) if spp == None: spp = len (setout) counts = dict (zip (setout, [float(0.)]*spp)) for ind in out: counts[ind] += mpfr(1) return [counts[x] for x in sorted (counts)]
[docs]def table (out, spp=None): ''' data to contingency table any kind of data :argument out: species abundance :returns: list of counts of individuals by species ''' setout = set (out) if spp == None: spp = len (setout) counts = dict (zip (setout, [float(0.)]*spp)) for ind in out: counts[ind] += 1 return [counts[x] for x in sorted (counts)]
def factorial_div (one, two): ''' computes a!/b! ''' if one < two: return div(1., reduce (mul, xrange(one, two))) elif one > two: return reduce (mul, xrange(two, one)) else: return mpfr (1.)
[docs]def power_polyn (polyn_a, power): ''' ''' return reduce (mul_polyn, (polyn_a for _ in xrange (power)))
[docs]def mul_polyn(polyn_a, polyn_b): ''' computes the product of 2 polynomials, depending of the differences in length of the two polynomials, this function will call one of: * _mul_uneq_polyn: when length of polyn_a is >= length of polyn_b, will iterate over coefficient. * _mul_simil_polyn: in case both polynomials have equal length, will iterate over factors. to test multiplication of pylnomials try equality of the two functions: mul_uneq_polyn(polyn_a, polyn_b, len_a, len_b) == _mul_simil_polyn(polyn_a, polyn_b, len_a, len_b) **Example:** :: from ecolopy_dev.utils import mul_polyn # (2 + 3^x + 5x^2) * (x) mul_polyn([2,3,5], [0,1]) # will return: [mpfr('0.0'), mpfr('2.0'), mpfr('3.0'), mpfr('5.0')] # that is: 0 + 2x + 3x^2 + 5x^3 :argument polyn_a: list of indices of polynomial :argument polyn_b: list of indices of polynomial (e.g.: [1,3,5,0,2] for :math:`2 + 3^x + 5x^2 + 0x^3 + 2x^4`) :returns: a list representing multiplication of the two polynomials ''' if not polyn_a: return polyn_b if not polyn_b: return polyn_a # set default values len_a = len (polyn_a) len_b = len (polyn_b) diff = abs (len_a - len_b) if len_a >= len_b: polyn_b = polyn_b + [mpfr(0)] * (diff) else: _ = polyn_a + [mpfr(0.)] * (diff) polyn_a = polyn_b[:] polyn_b = _ len_a = len_b len_b = len (polyn_b) # switcher if len_a > len_b*2: # most return _mul_uneq_polyn(polyn_a, polyn_b, len_a, len_b) return _mul_simil_polyn(polyn_a, polyn_b, len_a, len_b)
def _mul_simil_polyn(polyn_a, polyn_b, len_a, len_b): ''' fast polynomial multiplication when polynomials are nearly equal -> iterates over factors ''' def mult_one (la, lb, stop, start=0): ''' internal that computes row multiplication of 2 lists start and stops are here to skip multiplications by zero ''' return [mul (la[i], lb[i]) for i in xrange (start, stop)] max_len = len_a + len_b diff = len_a - len_b new = [] for i in xrange (1, len_a +1): new.append (sum (mult_one (polyn_a[:i], polyn_b[i-1::-1], i))) len_a2 = len_a * 2 - 1 len_a1 = len_a + 1 len_a3 = len_a - 1 for i in xrange (len_a, max_len - 1): new.append (sum (mult_one (polyn_a[i-len_a3 : len_a1], polyn_b[len_a : i-len_a :-1], len_a2-i, diff))) return new def _mul_uneq_polyn(polyn_a, polyn_b, len_a, len_b): ''' fast polynomial multiplication when 1 polynomial >~ 2 times larger. -> iterates over coefficients ''' new = [mpfr(0)] * (len_a + len_b - 1) for i in xrange (len_a): pai = polyn_a[i] for j in xrange (len_b): new [i + j] += pai * polyn_b[j] return new
[docs]def pre_get_stirlings(max_nm, needed, verbose=True): ''' takes advantage of recurrence function: .. math:: s(n, m) = s(n-1, m-1) - (n-1) \cdot s(n-1, m) and as *s(whatever, 0) = 0* : .. math:: s(n+1, 1) = -n \cdot s(n, 1) keep only needed stirling numbers (necessary for large communities) :argument max_nm: max number of individuals in a species :argument needed: list of individuals count in species in our dataset, needed in order to limit the number of stirling numbers kept in memory. :argument True verbose: displays information about state. ''' STIRLINGS [1, 1] = mpfr (1) for one in xrange (1, max_nm): if verbose and not one % 1000: stdout.write('\r %s of %s, size: %s' % (one, max_nm, STIRLINGS.__sizeof__())) stdout.flush() STIRLINGS [one+1, 1] = -one * STIRLINGS [one, 1] for two in xrange (2, one+1): STIRLINGS[one+1, two] = STIRLINGS[one, two-1] - one * STIRLINGS[one, two] STIRLINGS[one+1, one+1] = STIRLINGS[one, one] if one-1 in needed: continue for j in xrange (1, one): del (STIRLINGS[one-1,j]) if verbose: stdout.write('\n')
def stirling (one, two): ''' returns log unsigned stirling number, taking advantage of the fact that if x+y if odd signed stirling will be negative. takes also advantage of recurrence function: s(n,m) = s(n-1, m-1) - (n-1) * s(n-1,m) ''' # return abs of stirling number if (one + two)%2: return -STIRLINGS [one, two] return STIRLINGS [one, two]
[docs]def fast_etienne_likelihood(mod, params, kda=None, kda_x=None): ''' same as Abundance inner function, but takes advantage of constant m value when varying only theta. :argument abd: Abundance object :argument params: list containing theta and m :argument kda_x: precomputed list of exp(kda + ind*immig) ''' theta = params[0] immig = float (params[1]) / (1 - params[1]) * (mod.community.J - 1) log_immig = log (immig) theta_s = theta + mod.community.S if not kda_x: kda_x = [exp(mod._kda[val] + val * log_immig) for val in \ xrange (mod.community.J - mod.community.S)] poch1 = exp (mod._factor + log (theta) * mod.community.S - \ lpoch (immig, mod.community.J) + \ log_immig * mod.community.S + lngamma(theta)) gam_theta_s = gamma (theta_s) lik = mpfr(0.0) for val in xrange (mod.community.J - mod.community.S): lik += poch1 * kda_x[val] / gam_theta_s gam_theta_s *= theta_s + val return ((-log (lik)), kda_x)
[docs]def draw_contour_likelihood (abd, model=None, theta_range=None, m_range=None, num_dots=100, local_optima=True, write_lnl=False): """ Draw contour plot of the log likelihood of a given abundance to fit Etienne model. :argument abd: Community object :argument None model: model name, if None current model is used :argument None theta_range: minimum and maximum value of theta as list. If None, goes from 1 to number of species (S) :argument None m_range: minimum and maximum value of m as list. If None, goes from 0 to 1 :argument 100 num_dots: Number of dots to paint :argument True local_optima: display all local optima founds as white cross :argument False write_lnl: allow to write likelihood values manually by left click on the contour plot. """ if not theta_range: theta_range = [1, int (abd.S)] if not m_range: m_range = [1e-16, 1-1e-9] if not model: model = abd.get_current_model_name() x = np.linspace(theta_range[0], theta_range[1], num_dots) y = np.linspace(m_range[0], m_range[1], num_dots) kda = None if model == 'etienne': lnl_fun = fast_etienne_likelihood elif model=='ewens': lnl_fun = lambda x: abd.ewens_likelihood(x[0]) elif model == 'lognorm': lnl_fun = abd.lognorm_likelihood else: raise Exception ('Model not available.') mod = abd.get_model(model) z = np.zeros ((num_dots, num_dots)) if model=='etienne': kdas = {} for k, i in enumerate(x): print k, 'of', num_dots for l, j in enumerate (y): if not j in kdas: lnl, kda_x = lnl_fun(mod, [i, j]) lnl = -lnl kdas[j] = kda_x else: lnl = -lnl_fun(mod, [i, j], kda_x=kdas[j])[0] z[l][k] = lnl else: for k, i in enumerate(x): print k, 'of', num_dots for l, j in enumerate (y): lnl = -lnl_fun ([i, j]) z[l][k] = lnl if local_optima: loc_opt = [] for i in xrange(1, len(z)-1): for j in xrange(1, len(z)-1): if z[i][j] > z[i][j-1] and \ z[i][j] > z[i][j+1] and \ z[i][j] > z[i-1][j] and \ z[i][j] > z[i+1][j]: loc_opt.append ((i, j)) for i, j in loc_opt: pyplot.plot(x[j], y[i], color='w', marker='x') # define number of color categories... perhaps not the best way perc10 = np.percentile(z, 10) perc90 = np.percentile(z, 90) levels = list (np.arange (perc90, perc10, -(perc90-perc10)/num_dots))[::-1] + \ list (np.arange (perc90, z.max(), (z.max()-perc90)/num_dots))[1:] + [z.max()] pyplot.plot(abd.theta, abd.m, color='w', marker='*') pyplot.contourf (x, y, z, levels) pyplot.colorbar (format='%.3f') pyplot.vlines(abd.theta, 0, abd.m, color='w', linestyle='dashed') pyplot.hlines(abd.m, 0, abd.theta, color='w', linestyle='dashed') if write_lnl: fig = pyplot.contour (x, y, z, levels) pyplot.clabel (fig, inline=1, fontsize=10, colors='black', manual=True) pyplot.title ("Log likelihood of abundance under Etienne model") pyplot.xlabel ("theta") pyplot.ylabel ("m") pyplot.axis (theta_range + m_range) pyplot.show()
[docs]def draw_shannon_distrib(neut_h, obs_h): ''' draws distribution of Shannon values for random neutral :argument neut_h: list of Shannon entropies corresponding to simulation under neutral model :argument obs_h: Shannon entropy of observed distribution of abundance ''' neut_h = np.array ([float (x) for x in neut_h]) obs_h = float (obs_h) pyplot.hist(neut_h, 40, color='green', histtype='bar', fill=True) pyplot.axvline(float(obs_h), 0, color='r', linestyle='dashed') pyplot.axvspan (float(obs_h) - neut_h.std(), float(obs_h) + neut_h.std(), facecolor='orange', alpha=0.3) pyplot.xlabel('Shannon entropy (H)') pyplot.ylabel('Number of observations over %s simulations' % (len (neut_h))) pyplot.title("Histogram of entropies from %s simulations compared to \nobserved entropy (red), deviation computed from simulation" % (len (neut_h))) pyplot.show()
def mean (x): """ :argument x: list of values :returns: mean of this list """ return float (sum(x))/len (x) def std (x): """ :argument x: list of values :returns: standard deviation of this list """ mean_x = mean (x) return (sum([(i - mean_x)**2 for i in x])/len (x))**.5

Table Of Contents