"""New MSA statistical  analysis based on probabilistically reasonable
null hypothesis for nucleotide distributions.  Also, use thermodynamic
analysis to pick only the likely regions for base-pairing."""

from RNA.structure import ViennaOdds
from RNA.MSARi.MSA.Jaynes import Sequence
reload(ViennaOdds); reload(Sequence)
from RNA.MSARi.MSA import clean_MSA, BaseMSA
import sets

class MSA(BaseMSA.BaseMSA):

    """Multiple sequence alignment scored one position pair at a time.

    Every aligned sequence is wrapped in a Sequence.Sequence object.
    Candidate position pairs come from ViennaOdds.distinct_pairings, and
    each pair is scored by multiplying together the values returned by
    gapped_pairing_prob for the sequences that support it.
    """

    def __init__(self, sequences, winlength, regvar=2):

        """Clean the alignment and build one Sequence object per row.

        sequences -- a list of aligned sequences, or a dict whose values
                     are the sequences (the keys are ignored).
        winlength -- window length; stored and forwarded to
                     Sequence.Sequence and to the pairing helpers.
        regvar    -- stored and forwarded unchanged to Sequence.Sequence.

        Raises IndexError if clean_MSA returns an empty collection.
        """

        if isinstance(sequences, dict):
            sequences = list(sequences.values())
        sequences = clean_MSA.clean_MSA(sequences)
        self.winlength = winlength
        self.regvar = regvar
        # Length of the first cleaned sequence (presumably the common
        # alignment length of all rows -- confirm against clean_MSA).
        self.lengths = len(sequences[0])

        self.sequences = []
        for seqidx, seq in enumerate(sequences):
            # Only the sequences preceding this one are passed as the
            # second argument.  An "all other sequences" variant existed
            # in an earlier revision but was commented out.
            self.sequences.append(Sequence.Sequence(seq,
                                                    sequences[:seqidx],
                                                    sequences,
                                                    winlength, regvar))

    def get_chosen_pairs(self, pthrsh=0.05):

        """Determine which pairs in the sequences to consider.

        Collects the results of ViennaOdds.distinct_pairings over every
        sequence (called with self.winlength and pthrsh), removes
        duplicates, and returns them as a sorted list.
        """

        chosen_pairs = sets.Set()
        for seq in self.sequences:
            chosen_pairs.union_update(ViennaOdds.distinct_pairings(
                seq.seq, self.winlength, pthrsh))
        chosen_pairs = list(chosen_pairs)
        chosen_pairs.sort()
        return chosen_pairs

    def get_matches(self):

        """Score every chosen pair and store the result in self.matches.

        self.matches becomes a list of (compsig, (pos1, pos2)) tuples
        sorted in ascending order, i.e. primarily by compsig.  Nothing is
        returned.
        """

        self.matches = []
        for pos1, pos2 in self.get_chosen_pairs():
            compsig = self.pair_likelihood(pos1, pos2)
            self.matches.append((compsig, (pos1, pos2)))
        self.matches.sort()

    def pair_likelihood(self, pos1, pos2, display=False):

        """Return the combined score for the pair (pos1, pos2).

        The score starts at 1 and is multiplied by gapped_pairing_prob
        for each sequence whose best pairing near (pos1, pos2) passes
        the filter below.  If no sequence passes, 1 is returned.

        display -- accepted for interface compatibility; not used here.
        """

        compsig = 1
        for seq in self.sequences:
            count, p1, p2 = seq.best_pairing(pos1, pos2, self.winlength)
            if (count > 4) and (max(abs(pos1 - p1), abs(pos2 - p2)) < 10):

                # There was a non-trivial count, and the positions are
                # roughly accurate, so include this in the probability
                # computation.
                compsig *= seq.gapped_pairing_prob(
                    p1, p2, self.winlength, count)
        return compsig
                
if __name__ == '__main__':

    # Manual smoke test: pick 15 random RNase sequences, truncate them
    # to a common length, align them through the clustalw server and run
    # the pair scoring on the result.
    from RNA.data.sequences.rnase import sequences
    from RNA.MSARi.MSA.Jaynes import MSA
    from tools.Network import clustalw_server
    align_client = clustalw_server.Align_client()
    import random
    reload(MSA)

    # random.sample draws without replacement, so the names are distinct.
    seqnames = random.sample(list(sequences.keys()), 15)
    seqs = [(name, sequences[name]) for name in seqnames]

    # Truncate every sequence to the length of the shortest one.
    minlen = min([len(seq) for name, seq in seqs])
    seqs = [(name, seq[:minlen]) for name, seq in seqs]

    # NOTE(review): an earlier revision also built a gap-stripped copy
    # (seq.replace('-', '')) that was never used; confirm whether the
    # aligner was meant to receive gap-free sequences instead of seqs.
    alignedseqs = align_client.align(dict(seqs))
    msa = MSA.MSA([t[1] for t in alignedseqs], 10)
    msa.get_matches()
