"""Simple statistical routines"""

from operator import add
import random, math
import _statistics

def total(numbers):

    """Return the sum of the items in numbers, or 0 if it is empty.

    numbers may be any iterable of values that can be added to 0."""

    # The built-in sum() does the same left-to-right addition starting
    # from 0 as the former reduce(add, numbers, 0), and unlike a bare
    # reduce() it exists on both Python 2 and Python 3.
    return sum(numbers, 0)

def mean(sample):

    """Arithmetic mean of sample: total(sample) over its length.

    sample must support len().  An empty sample raises
    ZeroDivisionError, because the length is used as the divisor."""

    summed = total(sample)
    size = float(len(sample))
    return summed / size

def chi_squared(actual, expected):

    """Squared deviation of actual from expected, divided by expected."""

    deviation = actual - expected
    return (deviation ** 2) / expected

def log_prob(prob):

    """Log of prob/(1-prob)

    Computed as log(prob) - log(1-prob) using natural logarithms.
    prob must lie strictly between 0 and 1: math.log raises ValueError
    for a zero or negative argument."""

    # Use math.log explicitly; no bare `log` name is imported by this
    # module, so the unqualified call raised NameError.
    return math.log(prob) - math.log(1-prob)


def sample(events, number):

    """Pick number distinct values at random from events.

    Values are drawn one at a time with random.choice, and repeated
    draws of the same value are discarded, until number distinct
    values have been collected.  events must therefore be a sequence
    of hashable values.

    Returns a list of the chosen values (empty when number <= 0).

    Raises ValueError when events is non-empty but holds fewer than
    number distinct values.  An empty events with number > 0 raises
    IndexError from random.choice."""

    # Without this guard the loop below could never terminate: it would
    # keep redrawing values it had already collected.
    distinct = set(events)
    if distinct and number > len(distinct):
        raise ValueError(
            'cannot pick %d distinct values from %d distinct events'
            % (number, len(distinct)))

    # A dict is used as the collection so duplicate draws collapse.
    chosen = {}
    while len(chosen) < number:
        chosen[random.choice(events)] = None
    return list(chosen)

# Memo table shared by log_factorial (keyed by n) and log_choose
# (keyed by the tuple (n, m)).
_cache = {}

def log_factorial(n):

    """Return the natural log of n factorial.

    The value is the sum of log(k) for k from 2 to n, so it is 0 for
    any n below 2.  n must be an integer, since it is used as a range
    bound.  Results are memoised in _cache."""

    if n not in _cache:
        # math.log and sum() replace the bare `log` and `reduce` names,
        # neither of which is imported by this module (`reduce` is a
        # builtin on Python 2 only).
        _cache[n] = sum(map(math.log, range(2, n+1)), 0)
    return _cache[n]

def log_choose(n, m):

    """Return log_factorial(n) - (log_factorial(m) + log_factorial(n - m)).

    This is the log of the binomial coefficient "n choose m".  The
    result is memoised in _cache under the key (n, m).  Note that the
    name log_choose is rebound to _statistics.log_choose immediately
    after this definition."""

    key = (n, m)
    if key not in _cache:
        numerator = log_factorial(n)
        denominator = log_factorial(m) + log_factorial(n - m)
        return _cache.setdefault(key, numerator - denominator)
    return _cache[key]

# Rebind log_choose to the implementation provided by the _statistics
# module.  From this point on the pure-Python definition above is
# shadowed and callers get _statistics.log_choose.
log_choose = _statistics.log_choose

def _precise_likelihood(count, total, prob):

    """Return a one-sided binomial tail probability for count.

    Each term summed is C(total, cnt) * prob**cnt * (1-prob)**(total-cnt),
    evaluated in log space.  When count is below the expected count
    total*prob the result is the sum over cnt <= count; otherwise it
    is the sum over cnt >= count.  Terms whose log-likelihood is below
    -20 are treated as insignificant, so the result is an
    approximation.

    prob must lie strictly between 0 and 1, because both log(prob) and
    log(1-prob) are taken (math.log raises ValueError otherwise).

    This has been superseded by _statistics.binomial_likelihood."""

    likelihood = 0

    # Determine which tail of the distribution is shorter, and hence
    # fastest to compute.  The choice only affects how many terms are
    # summed; the tail that was summed is converted at the end.
    exp_cnt = total * prob
    left_tail = count < (total/2)
    if left_tail:

        # Left-hand tail is shorter; compute that.  The actual count is
        # included only when the left-hand tail is the one we want.
        include_count = count < exp_cnt
        extreme_values = list(range(count-1, -1, -1))
    else:

        # Right-hand tail is shorter; compute that.  The actual count
        # is included only when the right-hand tail is the one we want.
        include_count = count >= exp_cnt
        extreme_values = list(range(count+1, total+1))
    if include_count:
        extreme_values.append(count)

    # Visit the counts on each side of the expected value in order of
    # increasing distance from it, so that the sum can be abandoned as
    # soon as the terms become insignificant.  Sorting explicitly
    # matters: the left-hand tail is generated in descending order, and
    # merely reversing it walked the counts below the expected value
    # farthest-first, so the early break discarded the whole tail.
    priors = sorted((cnt for cnt in extreme_values if cnt < exp_cnt),
                    reverse=True)
    followers = sorted(cnt for cnt in extreme_values if cnt >= exp_cnt)

    # math.log is used explicitly; this module imports no bare `log`.
    logprob = math.log(prob)
    logcomp = math.log(1-prob)
    for values in (priors, followers):
        for cnt in values:
            current_likelihood = logprob*cnt + logcomp*(total - cnt)
            current_likelihood += log_choose(total, cnt)
            if current_likelihood < -20:

                # Likelihoods are now insignificant.  Don't compute
                # any more on this side.
                break
            likelihood = likelihood + math.exp(current_likelihood)

    # Default to the right-hand tail, for simplicity.
    if left_tail:

        # ...it was the left-hand tail that was computed, so get the
        # right-hand tail.
        likelihood = 1 - likelihood

    # Below the expected count the caller wants the left-hand tail.
    if count < exp_cnt:
        return 1-likelihood
    return likelihood

# Both public names are bound to the same implementation provided by
# the _statistics module.
precise_likelihood = binomial_likelihood = _statistics.binomial_likelihood
    
def stat_significance(count, total, prob):

    """ -> significance score for drawing count events of probability
    prob from a sample of size total

    When the expected count total*prob is at least 12 the score is the
    chi-squared statistic (count - expected)**2 / expected.  Otherwise
    it is -log of the value returned by precise_likelihood(count,
    total, prob), or the sentinel -10e30 when that value is exactly
    0."""

    expected = total * prob
    if expected >= 12:

        # Can use the chi-squared approximation in this case.
        return ((count - expected)**2) / expected

    # Have to compute log-likelihood explicitly.
    rv = precise_likelihood(count, total, prob)
    if rv == 0:

        # log(0) is undefined, so return a sentinel instead.
        # NOTE(review): -log(rv) grows towards +infinity as rv
        # approaches 0, yet this sentinel is negative -- confirm the
        # sign is intended before relying on it.
        return -10e30

    # math.log is used explicitly; this module imports no bare `log`,
    # so the unqualified call raised NameError.
    return -math.log(rv)

def single_signifcance(dependant_count,
                       real_count,
                       dependant_totals,
                       real_totals):

    '''Compute the likelihood of the pattern occurring as often as it
    did in the dependant cases, given its frequency in the real cases,
    and return the score stat_significance gives for it.

    dependant_count  -- how often the pattern occurs in the dependant
                        cases.
    real_count       -- how often the pattern occurs in the 'real'
                        cases; a count of 0 is treated as 1.
    dependant_totals -- sample size the dependant count is judged
                        against (passed to stat_significance as its
                        total).
    real_totals      -- total that real_count is divided by to get the
                        per-case probability; must be greater than 0.

    Emits a warning on every call, as this code has not been
    re-checked since most of it was removed.'''

    import warnings
    warnings.warn(
        '*check* this before using.  ripped most of the code out')

    if real_count == 0:

        # Blur this case a little bit, to stop infinities creeping in.
        real_count = 1

    assert real_totals > 0

    # float() forces true division even when both counts are ints.
    real_likelihood = real_count / float(real_totals)

    return stat_significance(dependant_count, dependant_totals,
                             real_likelihood)

def significance(true_count,
                 null_count,
                 true_total,
                 null_total):

    '''Score the pattern in both directions and return the smaller
    score.

    true_count -- how often the pattern occurs in the real examples.
    null_count -- how often the pattern occurs in the false examples.
    true_total -- how often the offset pattern occurs in the real
                  examples.
    null_total -- how often the offset pattern occurs in the false
                  examples.

    single_signifcance is called twice: once scoring the true counts
    against the frequency seen in the null cases, and once the other
    way round.  The result is min() of the two scores.'''

    # Score of the true counts, given the null frequency.
    score_given_null = single_signifcance(true_count, null_count,
                                          true_total, null_total)

    # Score of the null counts, given the true frequency.
    score_given_true = single_signifcance(null_count, true_count,
                                          null_total, true_total)

    return min(score_given_true, score_given_null)
    
