"""Computes  for   each  sequence  in  an  MSA   the  distribution  of
nucleotides  that  could  be  expected  at each  position,  given  the
sequence's local similarity to the other sequences."""

def _window_frequencies(seqs, start, end):

    """Return the relative frequencies of A, C, G and T found in columns
    start:end of the sequences in seqs, as a dictionary keyed by
    nucleotide.  Raises ZeroDivisionError when those columns contain
    none of the four letters."""

    counts = dict.fromkeys('ACGT', 0)
    for cseq in seqs:
        window = list(cseq[start:end])
        for nuke in 'ACGT':
            counts[nuke] += window.count(nuke)
    total = float(sum(counts.values()))
    freqs = {}
    for nuke in 'ACGT':
        freqs[nuke] = counts[nuke] / total
    return freqs


def _mutation_distributions(standard_dist):

    """Return, for each of A, C, G and T, a renormalized copy of
    standard_dist with that nucleotide removed (all zeros when the
    remaining probabilities sum to zero)."""

    mute_dists = {}
    for nuke in 'ACGT':
        others = standard_dist.copy()
        del others[nuke]
        total = sum(others.values())
        for onuke in list(others):
            if total:
                others[onuke] = others[onuke] / total
            else:
                others[onuke] = 0
        mute_dists[nuke] = others
    return mute_dists


def dist(seq, prior_seqs, all_seqs, winlength):

    """Compute the distributions of nucleotides at each position in
    seq, given the distribution of nucleotides in the MSA of
    sequences.

    seq is the aligned sequence to describe.  prior_seqs are the
    sequences seq is compared against: each contributes a distribution
    weighted by the square of its local similarity to seq.  all_seqs
    are the sequences whose windows supply the background nucleotide
    frequencies.  winlength is the number of nucleotides of seq
    spanned by a window.  All sequences are presumably rows of the
    same alignment (equal length, '-' for gaps) -- confirm against the
    callers.

    Returns a list with one entry per column of seq: an empty
    dictionary where seq has no nucleotide, otherwise a dictionary
    mapping each of 'A', 'C', 'G', 'T' to its probability.

    When seq holds no more than winlength nucleotides a full window
    does not fit, so a single window running from its first to its
    last nucleotide is used instead."""

    # For every prior sequence, flag the columns in which it carries
    # the same nucleotide as seq.
    matches = [[(c1 == c2) and (c1 in 'ACGTU') for c1, c2 in zip(seq, oseq)]
               for oseq in prior_seqs]

    # We want windows of length winlength in the *ungapped* version of
    # seq, so make a map between gapped and ungapped indices in seq.
    # NOTE(review): 'U' counts as a nucleotide here, but every
    # frequency table below only covers 'ACGT'.
    gappedindcs = [idx for idx, char in enumerate(seq) if char in 'ACGTU']

    # Largest admissible window start, in ungapped indices.  The window
    # ends at the column of the first nucleotide *after* it, so that
    # nucleotide has to exist.
    # NOTE(review): as a consequence the last nucleotide of seq never
    # lies inside a full window -- confirm this is intended.
    max_start = len(gappedindcs) - winlength - 1

    # Now make the distributions for the positions in seq that
    # actually have a nucleotide.
    distributions = [{} for dummy in range(len(seq))]
    for ungapidx, gapidx in enumerate(gappedindcs):

        # ...locate the window around this position, as column indices
        # in the gapped sequence.  Floor division keeps the indices
        # integral whichever division semantics are in force.
        if max_start >= 0:
            start = min(max(0, ungapidx - winlength // 2), max_start)
            winstart = gappedindcs[start]
            winend = gappedindcs[start + winlength]
        else:
            # Too few nucleotides for a full window: use them all
            # rather than let a negative index wrap around.
            winstart = gappedindcs[0]
            winend = gappedindcs[-1] + 1

        # ...compute the similarities of the prior sequences to the
        # central one.
        similarities = []
        for match in matches:
            count = sum(match[winstart:winend])
            similarities.append(count / float(winlength))
            assert similarities[-1] <= 1

        # Local distribution of nucleotides over all sequences, and
        # the normalized distributions for mutations derived from it.
        standard_dist = _window_frequencies(all_seqs, winstart, winend)
        mute_dists = _mutation_distributions(standard_dist)

        # ...construct the distribution from those.  This is a
        # weighted sum of distributions, based on the possibilities
        # that the sequence came from mutations, or is a "new"
        # sequence entirely.
        curdists, weights = [], []
        curnuke = seq[gapidx]

        # ...one distribution per prior sequence, chosen by what that
        # sequence has at this position.
        for oseq, similarity in zip(prior_seqs, similarities):
            weights.append(similarity**2)
            nuke = oseq[gapidx]
            if nuke == curnuke:

                # Position is unchanged, so add a constant
                # distribution.
                curdists.append({nuke: 1})
            elif nuke == '-':

                # An entirely new character, so add a standard
                # distribution.
                curdists.append(standard_dist)
            else:
                assert nuke in 'ACGT'

                # A mutation, so add the distribution given that we
                # have a mutation from this character.
                curdists.append(mute_dists[nuke])

        # Add the local distribution of the sequence itself, weighted
        # by complement to degree of similarity to others.
        curdists.append(_window_frequencies([seq], winstart, winend))
        weights.append((1 - max(similarities+[0]))**2)

        # ...normalize the sums, to make a distribution
        total = sum(weights)
        distribution = {}
        for nuke in 'ACGT':
            score = 0
            for curdist, weight in zip(curdists, weights):
                score += curdist.get(nuke, 0) * weight
            distribution[nuke] = score / total
        assert abs(sum(distribution.values()) - 1) < 1e-6
        assert min(distribution.values()) >= 0
        distributions[gapidx] = distribution

    return distributions

# For each nucleotide, the nucleotides it is allowed to pair with.
# NOTE(review): G-T and T-G are listed next to the A-T and C-G pairs,
# presumably to allow G-U wobble pairs written in DNA letters -- confirm.
complements = {'A': 'T', 'C': 'G', 'G': 'CT', 'T': 'AG', '-': ''}

def pairing_prob(var1, var2):

    """Return the probability that complementary nucleotides are drawn
    independently from var1 and var2.

    var1 and var2 are dictionaries mapping nucleotides to
    probabilities.  A nucleotide missing from either one counts as
    probability zero, so an empty dictionary pairs with nothing and
    yields 0."""

    rv = 0
    for nuke in 'ACGT':
        p1 = var1.get(nuke, 0)
        for comp in complements[nuke]:
            rv += p1 * var2.get(comp, 0)
    return rv
