import math, operator
import re
from pprint import pprint as pp

from Numeric import array, reshape, zeros

import _msa2, _msa3, _msa4, _msa5
from tools.Stats import statistics
from RNA.MSARi.MSA import BaseMSA

# The clusters module is imported on a best-effort basis: when it is
# missing, a message is printed and this module still loads.  Note
# that MSA.__init__ and MSA.pair_likelihood both reference `clusters`,
# so they will raise NameError in that situation.
try:
    import clusters
    # Re-execute the module so that edits made to it are picked up when
    # this file is re-imported in a running interpreter (presumably a
    # development convenience -- confirm before removing).
    reload(clusters)
except ImportError:
    print 'Failed to import clusters!'

# Upper bound on the run length handed to _msa5.longest_run and
# _msa3.window_odds; also the width of the window compared between
# sequences in MSA.pair_cluster_likelihoods.
MAX_TUPLE_LENGTH = 10
# Shortest run for which MSA.sequence_pair_prob reports a score; pairs
# with a shorter run get the neutral result (1, 0, None).
MIN_TUPLE_LENGTH = 5

class MSA(BaseMSA.BaseMSA):
    """Score pairs of alignment positions for tuple-wise complementarity.

    The alignment is cleaned on construction (see clean_sequences) and
    background nucleotide statistics are computed once.  get_matches
    then scores every sufficiently separated pair of positions with
    pair_likelihood and stores the results in self.matches.

    The numeric work is delegated to the _msa2/_msa3/_msa5 extension
    modules, to the clusters module and to statistics.stat_significance,
    none of which are defined in this file; comments about their results
    describe only how this class uses them.
    """

    # Symbols each nucleotide is allowed to pair with.  Besides A/T and
    # C/G this table also accepts G/T; a gap pairs with nothing.
    complements = {'A': 'T', 'C': 'G', 'G': 'CT', 'T': 'AG', '-': ''}

    def __init__(self, sequences, show_progress=False):
        """Clean the alignment and precompute background statistics.

        sequences     -- aligned sequence strings; see clean_sequences
                         for the normalisation applied to them.
        show_progress -- when true, get_matches prints the first
                         position index every tenth iteration.

        Raises IndexError for an empty sequence list and
        ZeroDivisionError when the alignment has no nucleotides at all
        or consists of a single nucleotide type.
        """
        self.sequences = self.clean_sequences(sequences)
        self.show_progress = show_progress
        self.lengths = lengths = len(self.sequences[0])
        assert ([len(seq) for seq in self.sequences] ==
                len(self.sequences)*[lengths])
        self.winlength = MAX_TUPLE_LENGTH

        # Filled in by get_matches.  Initialised here so that best_odds
        # and print_matches can be called before any search has run.
        self.matches = []

        # Distribution of nucleotides over the entire ensemble; gaps
        # are not counted.
        self.distribution = {'A': 0., 'C': 0., 'G': 0., 'T': 0.}
        total = 0.
        for seq in self.sequences:
            for char in seq:
                if char in 'ACGT':
                    total += 1
                    self.distribution[char] += 1
        for char in 'ACGT':
            self.distribution[char] /= total

        # Probability of drawing a pair of complementary bases from
        # this distribution.
        self.comp_prob = 0
        for nuke1 in MSA.complements:
            if nuke1 == '-':
                continue
            prob1 = self.distribution[nuke1]
            for nuke2 in MSA.complements[nuke1]:
                self.comp_prob += prob1 * self.distribution[nuke2]

        # self.comp_distributions[n1] gives the distribution of
        # nucleotides, given that n1 is *not* drawn.
        self.comp_distributions = {}
        for char in 'ACGT':
            self.comp_distributions[char] = cdist = {}
            total = 0
            for char2 in 'ACGT':
                if char2 != char:
                    total += self.distribution[char2]
            for char2 in 'ACGT':
                if char2 != char:
                    cdist[char2] = self.distribution[char2]/total

        self.clusters = clusters.get_clusters(self.sequences)

        # Every position gets the same (global) nucleotide distribution;
        # the resulting table is what _msa2.get_pair_odds uses to fill
        # pair_odds_array in place.
        self.column_distributions_array = zeros((lengths, 4), 'f')
        for posidx in range(lengths):
            for nukeidx, nuke in enumerate('ACGT'):
                self.column_distributions_array[posidx, nukeidx] = \
                    self.distribution[nuke]
        self.pair_odds_array = zeros((lengths, lengths), 'f')
        _msa2.get_pair_odds(self.pair_odds_array,
                            self.column_distributions_array)

    def clean_sequences(self, sequences):
        """Return the alignment normalised to the alphabet 'ACGT-'.

        Every 'U' becomes 'T', every other symbol outside 'ACGT'
        becomes the gap character '-', and columns that contain only
        gaps are removed.  The number of columns examined is the length
        of the first sequence: a shorter sequence raises IndexError and
        the excess of a longer one is ignored.

        Returns a list of strings, one per input sequence.
        """
        sequences = [seq.replace('U', 'T') for seq in sequences]
        sequences = [re.sub('[^ACGT]', '-', seq) for seq in sequences]
        rv = [[] for seq in sequences]
        for i in range(len(sequences[0])):
            column = [seq[i] for seq in sequences]
            # Every symbol is now a nucleotide or a gap, so a column
            # holds at least one nucleotide exactly when it is not
            # made up entirely of gaps.
            if column.count('-') != len(column):
                for rseq, char in zip(rv, column):
                    rseq.append(char)
        return [''.join(seq) for seq in rv]

    def sequence_pair_prob(self, posidx1, posidx2, seq):
        """Score the run found for positions posidx1/posidx2 in seq.

        Returns a tuple (prob, run_length, offset).  run_length and
        offset come from _msa5.longest_run, called with seq as both
        sequence arguments.  When the run is at least MIN_TUPLE_LENGTH
        long, prob is the entry for that length in the table returned
        by _msa3.window_odds; otherwise the neutral result (1, 0, None)
        is returned.
        """
        run_length, offset = _msa5.longest_run(
            seq, seq, posidx1, posidx2,
            MIN_TUPLE_LENGTH, MAX_TUPLE_LENGTH)
        if run_length >= MIN_TUPLE_LENGTH:
            rv = _msa3.window_odds(
                self.pair_odds_array, posidx1, posidx2,
                MIN_TUPLE_LENGTH, MAX_TUPLE_LENGTH)
            # The table is indexed from MIN_TUPLE_LENGTH upwards.
            prob = rv[run_length-MIN_TUPLE_LENGTH]
            return prob, run_length, offset
        return 1, 0, None

    def pair_cluster_likelihoods(self, posidx1, posidx2, cluster, display=None):
        """Score one cluster of sequences for the position pair.

        cluster -- indices into self.sequences.
        display -- when true, intermediate results are printed.

        The sequence with the smallest sequence_pair_prob is taken as
        the reference.  If it is the only sequence, or its score is 1,
        the result is 1.  Otherwise the remaining sequences are walked
        along the reference's window (idx1 ascending from posidx1, idx2
        descending towards posidx2) and each distinct substitution
        relative to the reference is counted, together with how many of
        those substitutions are themselves complementary.  When at
        least the expected share is complementary, the reference score
        is multiplied by exp(-statistics.stat_significance(...)).
        """
        # Find the pair in the cluster with the most significant
        # tuple-wise complementarity (smallest score sorts first).
        complementarities = []
        for seqidx in cluster:
            complementarities.append(
                (self.sequence_pair_prob(posidx1, posidx2,
                                         self.sequences[seqidx]),
                 seqidx))
        complementarities.sort()

        (prob, run_length, offset), seqidx = complementarities[0]
        main_seq = self.sequences[seqidx]
        if display:
            print(cluster)
            pp((self.clusters[posidx1], self.clusters[posidx2]))
            print('%s %s %s %s' % (
                seqidx,
                main_seq[posidx1:posidx1+MAX_TUPLE_LENGTH],
                main_seq[posidx2:posidx2+MAX_TUPLE_LENGTH],
                complementarities[0]))

        if (len(complementarities) == 1) or (prob == 1):
            return 1

        comp_mutes = all_mutes = 0

        other_indices = [comp[-1] for comp in complementarities[1:]]
        other_seqs = [self.sequences[idx] for idx in other_indices]

        # Starting columns of the walk, shifted by the offset reported
        # for the reference sequence's run.
        if offset < 0:
            idx1 = posidx1-offset
            idx2 = posidx2+MAX_TUPLE_LENGTH-1
        else:
            idx1 = posidx1
            idx2 = posidx2+MAX_TUPLE_LENGTH-1-offset

        # Dictionary used as a set: each distinct substitution at a
        # given column pair is counted only once.
        seen_pairs = {}

        while (idx1 < posidx1+MAX_TUPLE_LENGTH) and \
              (idx2 > posidx2-1):
            for seq in other_seqs:
                c1 = seq[idx1]
                c2 = seq[idx2]
                if (idx1, c1, idx2, c2) in seen_pairs:
                    continue
                mc1 = main_seq[idx1]
                mc2 = main_seq[idx2]
                if '-' in (c1, c2, mc1, mc2):
                    continue
                if (c1, c2) != (mc1, mc2):
                    all_mutes += 1
                    seen_pairs[idx1, c1, idx2, c2] = None
                    if c2 in MSA.complements[c1]:
                        comp_mutes += 1
            idx1 += 1
            idx2 -= 1

        # Compute the significance of drawing this many complementary
        # mutations, given the overall distribution of nucleotides.
        if comp_mutes >= all_mutes * self.comp_prob:
            logprob = statistics.stat_significance(
                comp_mutes, all_mutes, self.comp_prob)
            factor = math.exp(-logprob)
            if display:
                print('%s %s %s' % (comp_mutes, all_mutes, factor))
            prob *= factor
        if display:
            print(prob)
            for idx, seq in zip(other_indices, other_seqs):
                print('%s %s %s' % (
                    idx,
                    seq[posidx1:posidx1+MAX_TUPLE_LENGTH],
                    seq[posidx2:posidx2+MAX_TUPLE_LENGTH]))
            pp(seen_pairs)
        return prob

    def pair_likelihood(self, posidx1, posidx2, display=None):
        """Return the combined score for the position pair.

        The clusters stored for the two positions are combined with
        clusters.refine_clusters, and the scores that
        pair_cluster_likelihoods gives for each resulting cluster are
        multiplied together.  display is passed through and also makes
        the final product be printed.
        """
        prob = 1
        cluster_list = clusters.refine_clusters(
            self.clusters[posidx1], self.clusters[posidx2])
        for cluster in cluster_list:
            prob *= self.pair_cluster_likelihoods(
                posidx1, posidx2, cluster, display)
        if display:
            print(prob)
        return prob

    def get_matches(self):
        """Score all candidate position pairs and store them.

        self.matches becomes a list of (odds, (i, j)) tuples sorted in
        descending order, so the smallest odds are at the end.  Only
        pairs with j at least MAX_TUPLE_LENGTH+3 positions after i, and
        with a full window remaining after j, are scored.
        """
        self.matches = []
        for i in range(self.lengths-MAX_TUPLE_LENGTH*2+3):
            if ((i % 10) == 0) and self.show_progress:
                print(i)
            for j in range(i+MAX_TUPLE_LENGTH+3,
                           self.lengths-MAX_TUPLE_LENGTH):
                odds = self.pair_likelihood(i, j)
                self.matches.append((odds, (i, j)))
        self.matches.sort()
        self.matches.reverse()

    def best_odds(self):
        """Return the smallest odds found by get_matches, or 1 if none."""
        if self.matches:
            # matches is sorted in descending order.
            return self.matches[-1][0]
        return 1

    def print_matches(self):
        """Pretty-print the ten entries with the smallest odds."""
        pp(self.matches[-10:])

    def print_structure(self):
        """Print pair_likelihood for each pair chosen by self.structure.

        self.structure is not set anywhere in this class; presumably it
        is provided by BaseMSA or assigned by the caller.
        """
        for p1, p2 in self.structure.chosen_pairs():
            print(self.pair_likelihood(p1, p2))
            
if __name__ == '__main__':

    import profile, pstats, time
    from tools.Alignment import clustalw
    from RNA.MSARi.MSA.tuples import MSA
    reload(MSA)

    if 0:
        alg_file = open('/scratch2/data2/bacteria/sequences/msas/++53GK3C89qjP77Z1XbAVg.aln')
        sequences = clustalw.parse_clustalw(alg_file)
        for n, s in sequences.items():
            s = s.replace('U', 'T')
            sequences[n] = re.sub('[^ACGT]', '-', s)

        if 1:
            t = MSA.MSA(sequences.values())
            t.get_matches()
        if 0:
            profile.run('t.get_matches()', 'MSAprofile')
            p = pstats.Stats('MSAprofile')
            p.strip_dirs().sort_stats('cumulative').print_stats()

    # sequences = eval(open('/scratch2/data2/bacteria/sequences/msas/tst.txt').read())

    # Genuine
    sequences = ['------------------------------------------------------------------------------------------------------------------------TAG--TTCTGGCCTG-CCC-TGTCCA-AGCAC-AGAGT----AGGATCA----CGAGGCCCAAATGAAAATA--TGGGCTTCT-TGATTC-CT-AAAG-CGGGGCG--GACCGCATGAGGCTGGCTTCACAGAGCAGTGAACAGC-TCC-CGCTCT-GTGC-A-GTGGAAGGATAATGGGTCGGTG--TCTT-ATCAAGTTC-AGT-AA-CGCCTAATGG----GTTGCT----C-CAACTAAACCACCACTTTT',
 '-GCC-GAGCTTA-GTAATGTGGGCTTGTAACCCA-------AA---TGGG--GGC-ATTAATGTGGTGG-AATG-TTG---GGCTGTA----CCAGATGGT--TG--GGCTTGGT-GGGCTAAT-TTCTGGCCTG-CCC-TATCCA-AGCAC-AGAGTT----GGATCA----TGTGGTCCAATTGAAAGAA--TGGGCTACT-AGATTC-CT-AAAG-TGGGGTG--GACCGCGTGAGGCTGGTTTCACAGAGCAACGAAAAAC-TTC-CGCTTT-GTGC-A-GTGGAAGGATAACGGACCGGTG--CCTT--CCAAGTTC-CATTAA-TGCCTGATGG----GCTGCT----C-CAATTAAACCACCACTTTT',
 '-GCC-GAGCTCA-GTAGCGTGGGCTTGTAACTCA-------AG---TGAG--GGC-ATCAGTGTGGTGG-AATG-TTG---AACATGC----CCAGAAGGT--A--GGGCTTGTG--TGACAATTCTCTGGCCTG-CCC-TGTCCA-AGCAA-AGAGTT----GGGTCT----TGTGGGCCAAGCGAAGGCT--TGGGCTGCT-GGGCTC-CT-AAAG-TGGGGTG--GACTGCGTGAGGCTGGTTTCACAGAGCAACGAATAGC-TCC-CGCTCT-GTGC-A-GTGGAAGGATAACGAGCCAGTG--TCAT--ATAAGTTC-AACGAA-TGTCTGATGG----GCTGTT----C-TAGTTAAACTACCACATTT',
 '-GTC-GAGCTAA-GTAACATGAGCTTGTAACCCA-------TG---TGGG--GAC-ATTTAGATGGTGG-AACAC-TG---GTTCGGG----TCCACGGGC--C--GGTTCTGTTGTTGGCATGTTTCTGGGCTG-CCCAG-TCCA-AGCTG-TGAGT-----AAGACG----TGTGTGTCAAGCGAAGGCT--TGGCTCAAA-CGGCTT-CT-AAAGTTGGAGGG--TAATGCGTGAGGCTGGTTTCACAGAGCAGCGACTACT-TCC-CGCTTA-CAGC-A-GTGGACGGATCACAGTTTAGCG--TCGC--TCAGAACC-ACTATG--GCCTGCTGG----TCCGAT----CTCATATGAACCACCATTT--',
 '-GCC-GCGAATT-C--GC-GGCG--------CCG-------AG---CGGG--GGC-AATAAGGTGGTGCGGATGCCTGG-TCGTTTGC----TCTTTGGCC--T--GGGCCTGTGGTTCGCT-CTATGCGGCCCG-CCCGT-TCCA-AGTTGCGTAGC----GTGTATG----GGGCGCCCTGGCGAAAGCT--AGAGCGTCC-CTGCAC-CT-AGAG-TGGCGGG--TATTGCGTGAGGCTGGTTTCACAGAGCAGCGAGTACC-GCC-CGCTTC-CAAC-G-GTGGAAGGATTACGGGCCGCTG--CAGC--CCTGGCCC-GCTTCCG-GCC-CATGTG----CCTTC----CA-TGGCAGACCCCCATCTTT',
 '-NCC-GAGCTCT-GTAGCGAGAGCTTGTAACCCG-------AG---CGGG--GGC-ATTAAGGTGGTGCGGATTCTTT--GCGATGGCT---TTCTGGGCC--C--GGGCTCGCTATGTGCCTTTGGCCGGCCTG-CCCGT-CCCA-AGTTG-GTAGTG---GCTGGC-----GGAGGCTTTAGCGGAAGCT--TTGGTCTCT-CCAGAC-CT-GAAG-TGGCAGG--AATGGCGTGAGGCTGGCTTCACAGAGCAGCGATCACTCGCC-CGCTTC-CAAC-G-GTGGGAGGATAACAGGCCGCTG--CACT--TCGAGCCC-AACTCAG-GCC-CAGAG-----CCTCA----CT-AAGCAGACCACCATCTT-',
 '------GGTCTTAGCAACTTGGTCCTGTAACCCA-------AG---TGGG--GGC-ATGTGGGAAATGG-AACT-TTG---GGTCAAC----CCAGTGGTN--C--GGGNCCAGTGCTAGCTGCTTACCCGGCTG-CCCAT-TCCA-AGCCG-GGAGGT----GGGCTG----AGGGACCTGGGCGAAGGCT--NNNGGNGCG-CA-CTC-CT-AGAC-TGGAGGG--CAATGCGTGAGGCTGGCTTCACAGAGCAGCGACTACC-TCC-CGCTCT-CGGC-A-GTGGAAGGATAACGGGCCGGTG--CGCC--TTGAGCCA-ACACTG--TTCAACTGGG----CTGACT---CTTAATAGGACCATTTCTTTT',
 '-GCCAGGGTAG----CA---ATGCCTGTGA------CCTCGTG-------T-GGC--AT-AAACAATGTA----GTTA---GAATAGTCAGAGTATGAGA--GTTTGATGAGACAGTTTGCT----AAGAGTGTG-CCCGT-TCCA-AACAG-TGAGT------TGCTA-TGGTGGAAACCTAGTAATAG------GACTTCACCAGGGCA-TTAAAGA-GCTGAGG-AACCGCCTCAGGCTGGCA--ACAGAACAGGGAAAACT-TGC-CGCTTA-TTGT-G-GTGGAAGGATAACATACTC--AT-TAACTGTCAA-----GCTAATC-----GGTTCT--G-CTATTC-----TGATCAG-ACACT------',
 '-GCC-GGGCGCG-GTGGCGCGTGCCTGTAG-TCCC--AGCTACTC--GGGA-GGC---TGAGGCTGGAG-GATCGCTT--GAGTCCAGGA-GTTCTGGGC---TG-TAGTGC--GCTATGCC----GATCGGGTG-TCCGC-ACTA-AGTTC-GGCAT------CAATA----TGGTGACCTCCCGGGAGCGG-GGGACCACCAGGTTG--CCTAAGG-AGGGGT--GAACCGGCCCAGGTCGGAA--ACGGAGCAGGTCAAAAC-TCC-CGTGCT-GATC-A-GTAGTGGGATCGCGCCTGT--GAATAGC-CACTG--C--ACTC-------CAG--------CCTGTGCAACATAGCGAGACCCCGTCTCTT',
 '-GCC---------------------TGTAG-TTCC--AGCTACTC--GGGA-GGC---TGAGACAGGAG-GATCGCTT--GAGTCCAAGA-GTTCTGGGC---TG-TAGTGC--GCTATGCC----GATCGGGTG-TCCGC-ACTA-AGTTC-GGCA------ACAATA----TGGTGACCTCCCGGGAGCGG-GGGACCACCAAGTTG--C-------------------------------------------------------------------------------------------------------------------------------------------------------------------',
 '-GTC-AGGCATA-GTGGTGCATGCCTGTGG-TCCT--GGCTACTC--CGGA-GGC---TGAGGTGGGAG-GATCACTT--GAGCCAAGGA-CTTCTGGGC---TG-TAGCAC--GCTATGCT----GATCAAGTG-TCCAC-ACTA-AGTTC-AGCAT------CTATA----TGGTGACCTCCCAGGGGCGG-GGGACCACGAGGTTG--CCTAAGA-AAGGGT--GAACTGGCCCAGGTTGGAA--ACGGAGCAGGTCAAAAC-TCC-CGTGCT-GATC-A-GTAGTGGGATCTTGCCTGT--GAATACT-CACTG--T--GCTC-------CAG--------CCTGGGCAACACAGTGAGACCCTGAT----',
 '-GACTGGAAG-G-TTGGCAG-CTTCTGTAA-TCACG---CTTC---TGTGA-GGTC----TGATTGTGG-GATGGCCT--GAGGCTGGGATCTACTGCGTA-GCGG-ACCAG--CTCATGTT----GACGGAACG-TCCGC-ACTA-AGCTT-GCCAT------CAATA----TGGGTGCCATGGAGGAGTCC-GTGGCATTCAGGTTGG-CT-AAGG-AGGGATG--AACCGGGCCAGGGGTGAA--AACCAGCAGCCAAGAGT-TCC-CGTGGT-AGGC-A-GTAGTGGGATAGCGTACCG--GAGTGGA---CTGC---CG-TTAT-----CAG--------CCCAACCGATATGGTTGGACCACAATCTTT',
 '---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GCC-GAGCTCA-GCAGTGTGAGCTTGCAA-----YTCG--------------AA------CGGG-GGC---ATCAAGG-TGGTTGGAGGTTGGACAGNACACAGAGCAGGGCATT-TCGAGACACA------------------------------------------------------------------------',
 '-CTT-----------AGC-------TGTAATGGCA----TTT---TGTCG-GAG------TGGTAAATCGTCTTCTTGTT--GTGCGTTCGAGTCTTGGGCT-CTG-CACTTGGC-CAT---------TTGGTTG-TCCTT-TCCG-AATTC-TGCGGTTGATGGGC---GTCTCG-GTCTGAGTAATCG-GCTTTGAGATTTCCGT----TCTAAGA-TTAACTG-GGAA-ACTTCAGTGGAGCA--ATCCAGCAGAGATCCAG-TTG--CCGTGGGTAT-G-GCGGTGGGATCGCAACCAA----GTG-GTATATG--T--------------TATG--GAAGATATTT--------ACGAT-CACGATT---',
 'CGCTG----------------------TAA---------------------TGGC----TTGGTCGAA---GTGTTTA--GTACTCCCAAT--AGTGCAT--GTT-CGGTGGTC-TCGGGTT-----CGAGTCTCG-CTT-T-CGAT-CCCTC-GATC------TGCCAC---GTCTGTTCGAAGAGTAGTCT-TCGTG-GCAACTGGCA-GTTAAAC-CGTGTAG--TACCGATGGAGGTTGGAA--ACAATGCACATCA-CTA-CCGG-GTCTT-GGGC-A-GTGCGATAGCGATGGGATTC---A-CCT--TCGCAG---GATGTGC-----A-------TGGAAGTAT---AAACACAA-CGGTCGTT---']

    # Control from ['HUM.LU-B',_'TRY.BR-A',_'PER.AME.',_'GYN.AUR.',_'XEN.LAE.',_'SCH.POM.',_'LEP.COL.',_'ORY.SAT.',_'ZEA.MA-H',_'CAE.EL-A',_'DRO.MEL.',_'LYC.ES-G',_'ZEA.MA-B',_'LYC.ES-I',_'LYC.ES-H']
    sequences = ['-CGCGGGTTCCCTACGCCCCATATTTCTGAT-CAAGCAGGCTCTCATCAGGGTTC-TCGTATTGTCAGATGA---GCGGTATCAGCGCGTGGTTGGTGTCTGTGGC--GATGCAGTGGCTCGTGGGCCCCTGGGAGAATTAGACG--------CGACGCGGACTTCTGGGTACAAGGT-ACCGTCCCAAGGGTAGGT----CCAG--GCAGAATCGTCACCAGCCTGCGCAGCCCTACTGGAACGCTAGGACGTGGCGTGTTGTCAAGAAGAGGGGGTGTTCCAGGAGATAGCAACATAGGCCTGC-TGGGAACAGAGCGC', '-CGCGGGGCCGTCAGGCGCTATATGCCTGACGCAAGGGGGCTCTGATTAAGGGTCGTTGTACGATCAGACATCTCACAATACTTGCTC-TGGCCGGTGGTTGTAGC--TAGTTAGGGGCTTGCGGGACGCTCGCACG-TTAGGT---------CGTTCCGGACGCGCTAGCGCGGGACCACTGTCTAAAAGAAGGGT----ATAG--CGGATATGTTTACCGGCATGCCATTGCCTAGTGAAAC-CTAGTAAGGTATGTGTGGTTAAGATG-GTGGGTGTCCAAGTAAGTGGCGACACAGGACTACGCGGGAACAGAGCAC', '-CCCGCCGGC-TATTCCATCA-ACGAT-GTCTCGG---CATCCCGATCTGGGTCCGGTG-GCCTGGTGGGAA----GGGGGCGCGT---TAGGCGGTGTTCGTGGT-GGACTCAAGAGCCTCGGGTCCCCTGGGAGA---CGGCAGCC-----CAAGTCCTAAGCCCTGGCCCCGACT--CTGTGTCGCTCTTCGGG----GGAT--TTAGATAGTCTTCGG---CGACCTGTTCAAGTGGGAC-ATGGCGCGCTTGGGTTAGATCAGTGCTGATGAGCTGCAGGGGTGTGACGGT-TATAGCGAGGCGG-----------', '--CCGCCGCCGCAGCTCGTTA-ACGGTTGTGTCGAAT-CGTCCCGATCGGGGTCCGGTGTGCCTTGGGAGAC----GGGTAACGGC---AGCCCGGTGTTCGTCTTTGGACTCAA-AGCCTTGGGTCCCCTGGGCCAGCGCGGCAGCC-----GGACGCTTAAATC-TGGCCCCGACT--CTGTGTCCGCGCTCGAG----TGGG--TGAGCCTATCGTCGCATCCGACCCGTCCAAGCGGGAT-GACGCCC-CCTGGGGTCGACAAGAACCGACGAGCTCC-GGGTTGTGACAAT-T-TAACAAGGTCGC----------', '--AAAAATTCTCCAGATTCTACAGGATTAGGGCAAGGGG-CTCAGATCTTGGCTACCGTGTTG-TTAGGTACATGTACCTGCTTTGCTTCGGTGGGCTCGTATCTTAGGACGCAAAAGGCCTCGGACGTATAGAATGGTAGCTTGAGTTGT-TACAG-CAGGCACCTTGGGGCCGCTG-CTAGAGACGGTACCTGAT---TGAAG--TCGGAAGTCTTATGTAGCTACTACGCCTTTGTAGGAT-AAGGGAAATGGATCTGGCCTAGGAAGCAGTTGGTTAGGCATAATTTATGGTTAGACAT-TCAAGATA-CGGTTC--', '--AGAAACTCGCCAGATTCTATAGGATTAGGGCGAGGGGGTTCAGATCTTGGCTACCGTGTTG-TTAGGTACA-GTACCTGTTTCGCTTCGGTGGGCCCGTACCTTAGGACGCAAAAGGC-TCGGACGTAT-GAATGGTAGTCCGAGTTGT-TACAG-CAGGCACATTGGGGCCGCTG-CTGGAGATGGTACCTGGT---TGAAG--TCGGAAGTTTTATGTAGCTCCTACGCCCTTGTAGGAT-AAAAGTATTGAATCTGGCCTAGGAGGCAGTTGGTCAAGCAAAATTTATGGTTAGACAT-TCAAGTTA-TGGTTT--', 
'--AAAAACTCGCCAGACTCTATAGGATTAGGGCAAGGAGGTTCA-ATCTCGATTACCGTGTTGCTTAGGTACA--GGCCCGCTTCGCTTCGGTGGGCCCGTATCTTATGACGCAAAAGGCCTCGGGCGTAT-GAATGGTAGCCTGAGTTGT-TCCAGTCAGGTACATTGGGGCCGCTG-CTGAAGACGGCACCTGGT---TGAAG--TCGGAAGTTTTATGTAGCTCCTATGCCCTTGTAGGAC-AAAAGAATTGGATCTGGCCTAGGAGGCAGTTGGTCAGGCATAGTTTAAGGTTAGACATATCAAGTTA-TGGTCC--', 'CGTAGAACTCGCCAGATTCTGCAAGCCCGGAGCAAGCGG-CTCG-ATCTTAGGTCTTGTGCTGCCTCTCTATA-GGCTCTCATCGGCTTCGATCG--CCGTTACCTTCGACGCAATAGGCCGTGGGCGTATAGAACAGTAGTCGTAGT-GT-TATGG-CCGGCGCTTGGGGGCCGGTG-TTAGAGATGGCCCTCAGT---TAAGGGTTTGAGATGGCCATGTAGCCCCAA-GCCCTTGTAGGAG-TAGGGCACCGGTCCTG-CCTGGGGGGGGACGAGACGCGGAGTATT-ACGGCTAGATCTCCCAAGGGACTGGTCC--', '---GTGGTCCGCTAGA-TTGCGGCGGT-TCTACGAGGCGCCATTATTCC-AGCCCG-GTGGCT-TTTTATACATGGCCCCTTTTCGGCTGTGTCG-CCCGGTGCCCACGACGCAATAGTCCGCGGGCGTATCGAACGGCAGACCGCCGTGTGTTTGCTCGGGCACTTGGGGTCCCGGA-TCTGGGGCGCCCTTGGG---GTAGGA--AC-GCTGGCTCTCGTTGCCCTAG--ACCTTGCGGCAG--ACGGCGGCCGGTCTTGCCTTGGGGG-AGTAGCCCACGGCAAGTGCATCGGCTAGGCCTCCAAGGCACAGGACC--', '-GGAAAACCCGCTAGAGTCGCCAGGTTGTGTACGAAGCGCCACTACCCCCAGTCCGTGTGGCT-TGATTTACATGGCCCGTTTCGTTTTGCCT-G-CTCGACCCACAAGACGCAATAGTCTGTGGGCACATCGAATGTCAGACCGCCGTGTGTTACATCANGCACCTGGGGGCCCG-A-TCTGAGGTGGCTTACGGTGCGTAAGG--TCAGATGATCTCTGTAGCCCCAG--CCCTCGTAGCAG-CAAGGGTGTCAAGTCTGCCCTGGAGA-AGGGATTCACG-CAAGGGTTTGGCCTAGGCCTCCAAGGCACTGGATC--', '--GAAAACCCG-TAGA--TTCAAAGTC--AGGCGAGTCGTC--TAATCC--AGCTGTGTAGCT-TTCT-TAAATG--CCATGTTCGGC-GAAT-A-CACGACCCACA-GACGCAATAGGCCGCGGGTGCATTGAATAACAAGCCGCTGC---TTTCTACGNGC-CTTGGGGCCCT----CTTGAGGTGGTTCTTGG----TGAGG--GT-TATGGTCTA-GTAGCTAT-----CCTCGT-GTAG---AGGGGGCGAG--TTGCCTTGGGGG-AGTTGCCCACGGAAAGCAC---GGAAAGGTCGTCAAGG-ACGGCC----', '--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ACTAGACTGCTCTATGTAAGCGGTAG--AATCTAGTGATGGGTGAACCAGAGGACGTGACGGAGAGAA-----------------------------------', 
'----------------------------------ACGGTAGACAGTCTTAATACTTTGGAGACGCCCTAAACGGGACATACGTTTTAGTCGGTTCAGTTAATTACGTAGAATTCATGAGATCTAAGTTTTGGGGTCGTTATAATTAGC-----TGGTTCCCCCCCTGGGATACCGTTCGTTTGGTTCAATGGGTCACG---TCGAGTGTAGCGGGGCGGCCTATCCGTGGCATGCAGCCTGGACCCCCCGGAGTAACCGGTACAGGAGGTTTACGGTTGGGCTACGAATGTACAGCTC-----------------------', '----------------------------------------------------------------------------------------------------------------------------GGGACTCGTGACAGCTCCACAATCCG---TCGCCTTGGATCGTGAGTGGCGTGG-TCGGTGCCGTGAGCCGGT----GCAACTGCGGCCAAGCAGCTCTGGGGGCCCGGTTTGGTGATAT--TGTAGATGAGAGGACCTCCTGGGGATGCATGCGGGTTGGGAGGTCCCAGT-------------------------', '--CTCGGGCCGTCACA---CCTATGTTGTGTGCAAGGGGGCGCGATCATCGTCTTCGTGTAATGTAGGTGATAGGATAAGAACGCTT--TGGCTGGCGGCTGTATG-CCAAGCAGCCGCCCCGGGGGCTCTGGTGAGTTTAGGCA---------CGTTCGGATACTCAGAACAAGGCA--CTGTCATGAGGGCAAGGT--TGGAGTTGTGTACAGTGGATCGGTCGGCCGACTCCCAATTTTAAACGCAAGGAAGTCGCGTCTGTTGCGGTTCGTGGGCGCTCCAGGAATCAGAACGCAGAGTCGGCCGTACAAAGAACGC']

    # Low-scoring Genuine from ['Z.bailii',_'MACRNASEP',_'S.pastorianus',_'Xenopus-Chris',_'Lachesis-2',_'P.mississi',_'Euproctus',_'Z.florentinus',_'Discoglossus',_'S.cerevisi',_'P.strasburgensis',_'HSH1RNA',_'CHPRNASEP',_'A.telluris',_'MUSRNASEP']
    # Study closely!
    sequences = ['-NNNNGCG-GAAGGAAG-CTCA----CTGTA---GAGG--C-TACATGCA----GAGTG--------CATATGT----------CACTCAAGTA----------CCTT-GGGGAAGGTCTGAG-ACAGG---AGCC----GGTAAT------------------------------------------------------------------------------------------------TGCCCCAGCAGGGCA-----------TCCCCT--GAAA---AGGGGA------------------GCGAGCTCCCAGAGACCCGGGGCC--CAAAGGGGGT--GGC--ATGAGAC---GCGTGACCG--TGTATC-----------------------------------TGGT--A-CTCGC--TGAT--GCTTA---CTCCTTCC---GGGC----------------------------------ATTGGAA-GTGGCTCGC-----GGCT--GGC----------------------------------------------------ACTCCCCTGAGTGCAAT--GGCTGAGGTGAGG-----------TAATGGCG-----------CCTCAT------AACCCAATTCAGACCACTCTCCGGAATTCCGNNN-----', '-NNNNGCG-GAAGGAAG-CTCA----CTGTA---GAGG--T-TACATACT----GAGTGCGTCA---ATTCTG-----TGATGTCACTTCATTATTC-------CCTC-CGGGAAGGTCTGAG-ACAAG---AGCC----AATAGC------------------------------------------------------------------------------------------------CGCCCTTACAGGGCG-----------TCACTT--GAGC---GGGTGA------------------GTGAGTCCCCAGCAACCCCACGGT--GG-AGTGGA-CCGGC--CCGCTGTGT--GGAGGAGGC-TCT-GCA-GGAGAGGCGGGGTGGGCTTGTGGTGAGAGGGAGTGGTC----GCAGATTGA--GGCTGCTCCCACCTTACACACGGCTCCACCCGGCTGATGCTGCCTCCTCCCCGGGCC--TTGGAA-GTGGCTCTC-----GGCT--GGC-------------------------------------------------------TGTATGAGTCCAAT--GGCTGAGGTGGGGCAGCTGCACG-GGA-----TGTGTGCGC-GCCCCAT------AACCCAATTCAGACCACTGTCCGGAATTCCGNNN-----', 
'-NNNNGCG-GAAGGAAG-CTCA----CTGTA---GAGG--C-CACTAGCTT---GGGTG--------TCTAT------------CACCCTACTTCCG-------CCTG-TGGGAAGGTCTGAG-GGAAG---GGCC---AAATTGC------------------------------------------------------------------------------------------------CGCCCTAGCAGGGTG-----------CCACCTT-CTG---AAGGTGG-----------------AGTGAGTTCCCACCAATGTGGG-TCTCC--GATGAA---GCC--CTAGAATTGA-GGC-A-GGA-GA-AGCGGA--GAGGGGTCGTGGGTC-GTG--GGTCGTAGGCATCCTC-CCCGCGGATGCC-TCTGT-GGCTCCGTTC---GGG----------------------------------ATTTGAGAA-TGACTTAC-----GGCT--GAC------------------------------------------------------AGCTTTGAGTACAATG-GGAAGAGGTGAAGC-----------ACCGCCA----------GCTTCATAT----AACCCAATTCAGACCACTCTCCGGAATTCCGNNN-----', 'ATAGGGCG-GAGGGAAG-CTCA---TCAGT----GGGGC---CACGAGCT----GAGTG-------CGTCCTGT----------CACTCCACTCCCAT-----GTCCCTTGGGAAGGTCTGAG-ACTA---GGGCC----AGAGGC------------------------------------------------------------------------------------------------GGCCCTAACAGGGCT----------CTCCCTG---ATCTT-CGGGGAG-----------------GTGAGTTCCCAGAGAATGGG--GCTCCGCGCGAGGTCAGAC---TGGGCAGG--AGATGCCGTGGACCCCGCCC-TTCGGGG-A-GGGGCCC-----GNCGGATGCCTCCTTTGCCGGAGC---------------------------------------------------------------TTGGAACA-GACTCAC-----GGCC--AGC-------------------------------------------------------GAAGTGAGTTCAAT--GGCTGAGGTGAGGTACCCC------GGA----------GGGGACCTCAT------AACCCAATTCANNNNNNNNNNNNNNNNNNNNNNN-----', 
'ATAGGGCG-GAGGGAAG-CTCA---TCAGT----GGGGC---CACGAGCT----GAGTG-------CGTCCTGT----------CACTCCACTCCCAT-----GTCCCTTGGGAAGGTCTGAG-ACTA---GGGCC----AAAGGC------------------------------------------------------------------------------------------------GGCCCTAACAGGGCT----------CTCCCTG---AGCTT-CGGGGAG-----------------GTGAGTTCCCAGAGAACGGG--GCTCCACGCGAGGTCAGAC---TGGGCAGG--AGATGCCGTGGACCCCGCCC-TTCGGGG-A-GGGGCCC-----GGCGGATGCCTCCTTTGCCGGAGC---------------------------------------------------------------TTGGAACA-GACTCAC-----GGCC--AGC-------------------------------------------------------GAAGTGAGTTCAAT--GGCTGAGGTGAGGTACCCC------GCA----------GGGGACCTCAT------AACCCAATTCANNNNNNNNNNNNNNNNNNNNNNN-----', 'AT-GGGCG-GAGGGAAG-CTCA---TCAGT----GGGGC---CACGAGCT----GAGTG-------CGTCCTGT----------CACTCCACTCCCAT-----GTCCC-TGGGAAGGTCTGAG-ACTA---GGGCC----AGAGGC------------------------------------------------------------------------------------------------GGCCCTAACAGGGCT----------CTCCCTG---AGCTT-CAGGGAG-----------------GTGAGTTCCCAGAGAACGGG--GCTCCGCGCGAGGTCAGAC---TGGGCAGG--AGATGCCGTGGACCCCGCCC-TTCGGGG-A-GGGGCCC-----GGCGGATGCCTCCTTTGCCGGAGC---------------------------------------------------------------TTGGAACA-GACTCAC-----GGCC--AGC-------------------------------------------------------GAAGTGAGTTCAAT--GGCTGAGGTGAGGTACCCC------GCA----------GGGGACCTCAT------AACCCAATTCAGACCACTCTCCTCCGCCATTNNN-----', 
'AGTGGGCG-GAGG-AAG-CTCA---TCAGC----GGGGC---CACGTGCT----GAGTG----------CTCGT----------CACTC--TT-CG-------GCCCC-TGGGAAGGTCTGAG-ACTN---GGGCC----TCCCGC------------------------------------------------------------------------------------------------GGCCCTAACCGGGCT----------CTCCCC--G-AGC----GGGGAG-----------------GTGAGTTCCCAGAGAGCAGG--GCTCTACGCTTGGGCAGAC---TGGGCAGG--AGAAA--A-GG-CCCCG--AGTTCGG-----------------GGCGGATGCCTCCCTCGCCGGAGC---------------------------------------------------------------TTGGAACA-GACTCAC-----GGCC--AGC-------------------------------------------------------AGTGCGAGTTCAAT--GGCTGAGGTGAGGCACCT-------CCC------------GGGCCTCAT------AACCCAATTCANNNNNNNNNNNNNNNNNNNNNNN-----', '-AGATGTG-GAGGGAAG-CTCA----GTGCT---GAGG---CTACAGACA----GAGTG---------CTGGGGGT--------CACTC-AGTGAGTGTGA----CTC-AGGGAAGGTCTGAG-ATTAG---AGCC----CCTTGC------------------------------------------------------------------------------------------------TGCTCTCACAGGGCA-----------TCACTT---GGATG-AAGTGA------------------GTGGGTACTCAGAAAGC-TTG--------GCTCTGATTGGCCAGTGAGTGCAAAAGATGCAGTGAGTGCTCCTGGGTTGAGC------TCCTGTAGATGCAAAGGCATCAGGC-----------------------------------------------------------------------TTGGCA--CTGCCCAC-----AGCT--GGC------------------------------------------------------ACATTCAAGCTCAAT--GGCTGAGAGG-------------GAACTG---------------CCN-C------AACCCAATTCAGACTACTTTTTTTCCACATTTCA-----', 
'-TTGAACAGTGGTGAT-TCCTACGATTAAGAAACCTG-TTTGCAAAAG---GGCCTGC------CCAC---------GCGGGTC--ATCTTAT-TTAT---CAGGT--GGGAAATTCGGTG-GAACACAGTGGAGC-CTTGCCTTCC-GGGC-CATTGTC---TCTGG-CAGTGGCCC---------------------------------------------------------------CTGCTCGTGA-GAGAAG------AAACTTGCTGG-GGAA--CCAGT---CTTTACCGA------CCGTTGCT-ATCAGAAATTGAA--------------------------------------------------CGGGGCCCGGT---------CCC-------GCCGGGCCTCG-------------------------------------------------------------------------ATGGGAAC-GGCAACGG-TT-GTTCCGTTT------GACTTGTCGCCCG------CCA-------CGGCGCGCTGCAAGGTC----TGTTGAGTGCAAT--CGTAGGA----------CGCCA--TTCG------TGGCG----------------AACCCGATACCGATTACCACTGCTGTTAAGGGCGAATTC', 'GTGGAACAGTGGTAAT-TCCTACGATTAAGAAACCTG-TTTACAGAAG---GATCCCCACC----TATG-------GGCGGGTT--ATCAGATATTAT---CAGGT--GGGAAATTCGGTG-GAACACAGTGGAGC-CTTGTCCTCC-GGGT-TAATGTCGCTTTTGG-CATTGGCCC---------------------------------------------------------------CTGCTCCTGA-GAGAAGAAATAT------ACTGG-GGAA--CCAGT---CTTTACCGA------CCGTTGTT-ATCAGAAATTCA---------------------------------------------------CGGAGTTCGGC---------CTAG------GTCGGACTCCG-------------------------------------------------------------------------ATGGGAAC-GGCAACGG-TT-GTTCCGTTT------GACTTGTCGCCCGC------TACGGC-----GTGAGCGTCAAGGTC----TGTTGAGTGCAAT--CGTAGGA----------CGTCA--TTAG------TGGCG----------------AACCCGATACCGATTACTGCTGCTGTTCCAGC-------', 
'--------------------------------------------------------------------------------------------------------------------------GAACACTGTGGGAC----ACTATGCCGACC-CGCTGACTTTCTCGGTCAACGGGCCG---------------------------------------------------------------TTTCCGTGA-GGAAA------------GTCTGG-GGAA--CTAGGC----TTGCAG-------CTGCTGTC-ACAAGAAATCA----------------------------------------------------------ACGCT--------CACAGT-----AGCGG-------------------------------------------------------------------------------ATGGGAA--GACAGCAG-A--GTTCCATTT-------------AGCTTGAACA---CAACGT---TGAT--TGTTTAAGGCT----TGTTGAGTGCAAT--CGTAGGA--------------------------------------------CAAGAA-------------------------------------', '--------------------------------------------------------------------------------------------------------------------------GAATTCAGCGGGAC-GGCAATGGGTTAGTCGTCATGACTAATTCGGCGCATAACCGGTCGCCTCTGCAGCGTAAGCTACATTGGGGAC-------------------------------TTTTCCGTGA-GGAGGGA----------TCTTGG-GGAA--CCAAGA-------CC--ATATGGTTGTTGTC-ATCAGAAATCA--------------------------------------------------------ACCAGGTTAC------GTAA------GTTACCTGGT--------------------------------------------------------------------------ATGGGAA--GGCAACTGTAA-GTTCCGTTT-------------GACTTGTGCT----AATT----------AGTATAAGGTC----TGTTGAGTGCAAT--CGAAGGA--------CAAGTCT--TAT-------AGATTTG--------------AA-------------------------------------', 
'--------------------------------------------------------------------------------------------------------------------------AAACGCAG-AGGACCAGAACTTTGGGACGGGAGGCGGGGGTTTTGTTCGCAGGCTCCTTCTTCCCGCTCCTGGTTCTACGCGGCGTGCGCTGTACCACCATCGGTGGTTATGGCGCTTA--CCCCGTGA-GGGG------------GGCTTGG-GGAA--CCGAGTC------TAG--------CGTTGTT-ATAAGAAATCAA----TCT----TTT----GGA-TCCAGC--CAGGGTCCTCTAGCAGAAAAGGGGGATGGGGCTC------TCTT------GGGCTCTCCCCTCCACGCCTGCGGGATTCTG------------------------------------------------------ATGGGAAT-GACAACG--GTCGGTCCATTTGACTTTATCATTACGATGGT----TCTA----GTCGTGTTATGGTAAAGGTC----TGTTGAGTGCAAT--TGTAGAG--------CAAATTG--ATT-------CAATTTG--------------AA-------------------------------------', '-----------------------TGG------ACCTG-CCCACAAAAG---GAGC---------TCCGT----------------GCTCAGAATTACG---CAGGT--GGGAAATTCGGTG-GTTCACG-CTGTCC-AACATTACCCTT----------------------------------------------------------------------------------------------TCTCTTGA-GAGA--T--------CCTGGCGA-GGAA--TCGCTGG----------------GTGCGGCC-ATAAGAAATCA--------------------------------------------------------GCCCCC-----------TAAA-----AAGGGC------------------------------------------------------------------------------ATGGGA---GGCTGCAC----GGACAGTTG-----------GTCCTT----GGA---AA--------GAGGGC-------------TATTGAGTGCAAT-ATACAGA------CTGGGGGC----TTTG------GCCCCTGG-------------AA-------------------------------------', 
'----TTCTCATCAAAG-TCTGTATTT------ACCTG-CCTACACAAG---GGGCCCTCT----TCCCC---------CAGAGGGACCCAGCAGGAG----CAGGT--GGGAAATTCGGTG-GTACACG-CCGCCC----ATTAATTT----------------------------------------------------------------------------------------------CTCTCTTGA-GAGAG-T--------CCAGGCGA-GGAA--TCGCTAGG---------------GTGTTGCC-ATAAGAAATTCA-------------------------------------------------------GTCCGC----------CTA-------GCGGGC------------------------------------------------------------------------------ATGGGA---GGCTTCAC----GGGCGGTTG-----------GTCCCTAGTGCAA---AACT-----TTCGCGCTAGGAGGC-----TTTTGAGTGCAAT-ATACAGA--CCGCCTAGTGGTTC--CTTG---GGATCATCTGGGCGG---------AACGCGATACCGATTATTTTGGTGAGAA-----------']

    # Low-scoring Genuine from ['S.dairensi', 'Szo.octosp', 'Salmon-2', 'Phrynosoma', 'Z.rouxii', 'S.servazzi', 'RATRNASEP', 'P.mississi', 'C.opuntia', 'P.canadens', 'Discoglossus', 'S.cerevisi', 'S.globosus', 'CHPRNASEP', 'GORRNASEP']
    sequences = ['-----ACTAAACAGTGATGAT-TCCTACG---TTTTAAGA-ACCTG-TTTACAGAAGGAGGATAACAC---TTTA-------GTGCTTTTCTC-ATATAAGTT-CAGGT---GGGAAATTCGGTG-AAACACAGCGGAAC-CAACAACATCTACCAAGGTGTCGTTGCCCTCCCCAAGCCTAGACGGTGTCTTCTTCTAGGAGGAGCTTAAGTCCAGGTCCTTCCC-GTGA---GGGCAGTTTTCTTTGCAATTTTAAATTTT------ACTGG-GGAA---CCGGT------CTGCCTAAAAATTGCCATTGAAGAATTTATTGGGA--CGTCGTT-AGCAGAAATTCA------------------------------------------------------CAGAGACCTCTTTC-------AAA-------GAAATTGGTCCCTG------------------------------------------------------------------ATGGGAAC-AGCGACG--GTA------------------GTTCCA--TTT-----GACTTGTGGTGTG--AAATAA---TATCGACCATAAGGTC-TGTTGAGTGCAAT--CGTAGGA--------CAGATTTGATTTGA--AAATCTG---------------------------AACCCGATACCGATTACCATTGTTGTTTAGTC-------', '-----GTGGAACAGTGGTAAT-TCCTACG---ATTAAGAA-ACCTG-TTTACAGAAGGATCCCCACC----TATG------GGCGGGTTATC--AGATATTAT-CAGGT---GGGAAATTCGGTG-GAACACAGTGGAGC-CTTGTCCTCC-GGGTTAATGTCGCTTTTGGCATTGGCCC----------------------------------------CTGCTC-CTGA---GAGAAGAAATAT-----------------------ACTGG-GGAA---CCAGT------CTTTACCGA---------------------------CCGTTGTT-ATCAGAAATTCA------------------------------------------------------CGGAGTTCGGC---------CTAG----------GTCGGACTCCG------------------------------------------------------------------ATGGGAAC-GGCAACGG--TT------------------GTTCCG--TTT-----GACTTGTCGCCCGC--TACGGC---GTGAGCGTCAAGGTC-TGTTGAGTGCAAT--CGTAGGA----------CGTCA--TTAG---TGGCG-----------------------------AACCCGATACCGATTACTGCTGCTGTTCCAGC-------', 
'----------------------TCCTACG-CAATTTT----ACCTG-CCTTCATAAGGAACTCAAAGGGT-TCAGTTAGTACTTAACTTACT--ATATAAG---CAGGT---GGGAAATTCGGTG-GAACTCTA-AGAGA--C-AATCAGCCGGATCTTTAGCTTAATTGTTAAAGAACTG-----------------------------------------TTTCCGTGA--GGAAA------------------------------GTCTGG-GGAA---CTGGAC-----TTGCAG-------------------------------CTGCTGTC-ACAAGAAATCA------------------------------------------------------------ACGCC--------CTTTGTT--------GGTGG------------------------------------------------------------------------ATGGGAA--GGCAGTAG--AG------------------GTCTCT--TTT-----GGCTTGGATG--CAAATTTGGA-------CGTCCAAGGCT-TGCTGAGTGCAAT--CGTGGGA-------------------------------------------CAAAG----------AA-------------------------------------', '-----TTAGAGCAGTAGCAAC-TCCTACG---ATTTTA---ACCTG-CTTACGAAC--TATTAG-------TCTGAT---------CTAATA--ATACATG---CAGGT---GGGAAATTCGGTG-AAACACAACAAA----TGAGGT-TTCAGTTATTTGG------------------------------------------------------------GCTC-GTGA---GAGC-----------------------------AGCCTGG-GGAA--CCTGGTT----ATAATA-------------------------------GCGGCGCT-ATCAGAAATTCAA------------------------------------------------------------CGCTATCAA-------ATT------TTGATGGTG----------------------------------------------------------------------ATGGGAAC-GGCGCTAGC-AACAAACATATTTTATGTACCTCACGTGTTT-----GACTTAATCA------TTTAC--------TGGTTAAAGTC-TGTTCAGTGCAAT--CGTAGGA----------------------------------------CTTCTAAGCAATTAGTTGCACCCGATACCGATCAATGGTACTGTTCTAAT-------', 
'------------------------------T-GG-------ACCTG-CCCACAAAAGGAGC----------TCCGT------------GCTC-AGAATTACG--CAGGT---GGGAAATTCGGTG-GTTCACG--CTGTCC-AACATTACCCTT--------------------------------------------------------------------TCTC-TTGA---GAGA--T-----------------------CCTGGCGA-G--GAA----TCGCTGG---------------------------------------GTGCGGCC-ATAAGAAATCA---------------------------------------------------------------GCCCCC--------TAAA-----AAGGGC--------------------------------------------------------------------------ATGGGA---GGCTGCAC---------------------GGACAG---TTG------------GTCCTT---GGAAA----GAGGGC----------TATTGAGTGCAAT-ATACAGA-------CTGGGGGC----TTTG-----------------GCCCCTGG--------------------------AA----------------------', '-----------TTCCCTCCAAAG-TCTGTAT-TTT------ACCTG-CCTACAAAAGGAGGAGTCC-----CCGGC------GGACTTCCTC-AGTATTCG---CAGGT---GGGAAATTCGGTGAAATCGCT--CTGCCC-ACCAGGGAAAAGGTAAAACTCTCCCTGGTCCTTGGAAGGACTTGTCCTTCTGAG--------------------------TCTC-GTGA---GAGA--TG----------------------CCAAGCGT-G--GAG----ACGCTAGG-------------------------------------GTGGTCGCC-ATAAGAAACTTC--------------------------------------------------------------AACAG-------GTCACA------CTGTT--------------------------------------------------------------------------ATGGGA---GGCGCCAC---------------------GGGCAG---TTG--GTCCCTTTGCATCC-----AGAA-----GGAAGCTTTGGGGC--TGTTGAGTGCAAT-ATACAGA----GCGCTAGAAGGA-GTCCT-----------------TCCTTCTACGCGT----------------------AA----------------------', 
'------------------------------------------------------------------------------------------------------------------------------GTTCTCC--CCATCC-CTCTCTGGGTGCTTCTGCATTCAGAGCGATATAAT---------------------------------------------GCTC-TTGA---GAGC--TT---------------------CCTGGGCGTAG-ATAG--TTACGCCGG---------------------------------------GCGTCGCC-ATCAGAAAAACA-------------------------------------------------------------GCAGAGC--------TAC-------GCTTTGC------------------------------------------------------------------------ATGGGA---GGCGGCGC---------------------GGATGG---TTG------------GTCTTTC--AATACGAG-GAAAGGC---------TGTTGAGTGCAAT-TTGCGGC------------------------------------------CTTTGG-------------CA-----------------------------------', '------------------------------------------------------------------------------------------------------------------------------GAACACAGAGGATCG-ATAATGACCAGGAACTGTAACAGGTTCTGTTT------------------------------------------------CCTC-GTGA---GAGG-----------------------------GGCTTGG-GGAA--CCGAGTC---TGG------------------------------------CGTTGCT-ATAAGAAATCTA---------------------------------------------------CAGCGTTGTTCGTTA--------CTTG------TAACGTGGCAACCCTG-----------------------------------------------------------------ATGGGAAT-GGTAACG---GA-----------------TGATCCA--TTT-GACGTACTGCTACCA-----CTTG----TGATAGCGGTAAGGTC-TGTTGAGTGCAAT--CGTAGAG---------CAAATG---TTCG------------------CATTTG---------------------------AA----------------------', 
'CATGCCGGACGTACGGACAAAC-GCCGCAC-T--------TCCTC-AAATTCAGAC-GCACTT-------TTAC-----------AAGTGT--TACGCGCATT-GAGGG---GGTAAGGTCGGAG-GAACT------------TCTTC-GTTGC------------------------------------------------------------------ATGCTC-GTGA--GGAGCGG---------------------------AGGAC-G--AAA---GTCCT-----GC-----------------------------------CGGGTGT--ACCAGAAATTCG--------------------------------------------------------------ATCTC---TTGGTTCGTCCTTT--GAGAT--------------------------------------------------------------------------CTTGAAAC--GCACCCG---A------------------GAAGA---TGT----------------------------------------------CTTTTAGTGCAAT-GTGCGGC-----------ACCTGTGAAAAGT-CAGGC-----------------------------AACTCGATTCCGACTAATCTTGTCTGTATGTCTGGTATG', '------NNGGGCG-GAGGGAAG-CTCA----TCAGT-----GGGG---TCACATGCAGGGTG----------CTTGT-----------CACTC--AACCAATCA--CCTT-GGGGAAGGTCTGAGACAAG------AGCC----AATCAC-----------------------------------------------------------------------TTCCC--TAGCA-GGGTA-----------------------------TCAC-CT-GAT------GTGA-----------------------------------------GTGAGTG-CACAGAGAATATGA--GCC--ATTGTGAG--CTGTCTTTACAGGA------GAAAC---CAGCCAGTGCAGCTCTCCAGCACTACTGGT-------------------------------------------GGATGTGACCTC-ACT-GGC-------------------------TTGGAAA--CACTCAT-----------------------GGCT-----GGC---------------------------------------------ATATGAGTGCAAT-GTGTGAG-TTGGAG----------TACAC--------------------CTCCAG-----------AACCCAATTCAGACCACTCTCCGGAATTCCGNNN-----', 
'------NNNNGCG-GAAGGAAG-CTCA-----CTGTA----GAGG---TTACATACTGAGTGCGTCA-----ATTCTG----TGATGTCACTT--CATTATTC---CCTC-CGGGAAGGTCTGAGACAAG------AGCC----AATAGC-----------------------------------------------------------------------CGCCC--TTACA-GGGCG-----------------------------TCACTT--GAGC---GGGTGA-----------------------------------------GTGAGTC-CCCAGCAACCCCAC-GGTG--G-AGTGGA--CCGGC-CCGCTGTGT-GGAGGAGGC-TCT-GCA-GGAGAGGCGGGGTGGGCTTGTGGTGAGAGGGAGTGGTC---GCAGATTGA--GGCTGCTCCCACCTTACACACGGC-TCCACCCGGCTGATGCTGCCTCCTCCCCGGGCC--TTGGAA-GTGGCTCTC-----------------------GGCT-----GGC--------------------------------------------TGTATGAGTCCAAT-GGCTGAG-GTGGGGCAGC------TGCACG-GGA-----TGTGTGCGCGCCCCAT-----------AACCCAATTCAGACCACTGTCCGGAATTCCGNNN-----', '------NNNNGCG-GAAGGAAG-CTCA-----CTGTA----GAGG---TCACGCGCTGGGTG---------TATCGAT----------CACCC-TCCTTGCTA---CCTT-TGGGAAGGTCTGAGAGAAG------GGCC----AAGTGT-----------------------------------------------------------------------CGCCT--TAGCA-GGGCG-----------------------------CCATCTT-AGG---AAGGTGG-----------------------------------------GTGAGTT-CCCAGCAATAGGGGTCTCC-TGTAAT-GCC-CCA-GTTTGA-GGC-AGGAGA-AGCGG--GGCTGGAGGGG-----CTAC-TCCCTCCTT-------------CCTGCGGATGCC--------TCCGT-GGCTTACGCGGG----------------------------------GCTTGGGAA-TGACTCAC-----------------------GGCC-----GGC--------------------------------------------ACGTTGAGTACAATGGGCAGAG-GTGAAGC---------GGCC--------------------GCTTCAT-----------AACCCAATTCAGACCACTCTCCGGAATTCCGNNN-----', 
'-----ATAGGGCG-GAGGGAAG-CTCA----TCAGT-----GGGGC---CACGAGCTGAGTG-------CGTCCTGT-----------CACTC-CACTCCCAT--GTCCCTTGGGAAGGTCTGAGACTA------GGGCC----AAAGGC-----------------------------------------------------------------------GGCCC--TAACA-GGGCT------------------------------CTCCCTG-AGCTT-CGGGGAG----------------------------------------GTGAGTT-CCCAGAGAACGGG---GCTCCA-CGCGAGGTCAGACTGGGCAGG------AGATGCCGTGGACCCCGCCC----TTCGGGG-A-GGGGCCC-----GGCGGATGCCTCCTTTGCCGGAGC---------------------------------------------------------TTGGAACA-GACTCAC-----------------------GGCC-----AGC--------------------------------------------GAAGTGAGTTCAAT-GGCTGAG-GTGAGGTACCCC-----GCA----------------GGGGACCTCAT-----------AACCCAATTCANNNNNNNNNNNNNNNNNNNNNNN-----', '-----ATAGGGCG-GAGGGAAG-CTCA----TCAGT-----GGGGC---CACGAGCTGAGTG-------CGTCCTGT-----------CACTC-CACTCCCAT--GTCCCTTGGGAAGGTCTGAGACTA------GGGCC----AGAAGC-----------------------------------------------------------------------GGCCC--TAACA-GGGCT------------------------------CTCCCTG-AGCTT-CGGGGAG----------------------------------------GTGAGTT-CCCAGAGAACGGG---GCTCCG-CGCGAGGTCAGACTGGGCAGG------AGATGCCGTGGACCCCGCCC----TTCGGGG-A-GGGGCCC-----GGCGGATGCCTCCTTTGCCGGAGC---------------------------------------------------------TTGGAACA-GACTCAC-----------------------GGCC-----AGC--------------------------------------------GAAGTGAGTTCAAT-GGCTGAG-GTGAGGTACCCC-----GTA----------------GGGGACCTCAT-----------AACCCAATTCANNNNNNNNNNNNNNNNNNNNNNN-----', 
'-----NNNNNNNN-NNNNNNNN-NNNN----NNNNN-----NNNNC---CACGCGCTGAGTG----------CTCGT-----------CACTC--TCTCG-----GCCCC-TGGGAAGGTCTGAGACTG------GGGCC----TCCCGC-----------------------------------------------------------------------CGCCC--TAACC-GGGCT------------------------------CTCCCC--GAGT---GGGGAG----------------------------------------GTGAGTT-CCCAGAGAGCAGG---GCTCCA-CGCGTGGGCAGACTGGGCAGG------AGAAGCC-----CCC------------GGG-TG-------------GGCGGATGCCTCCCTCGCCGGGGC---------------------------------------------------------TTGGAACA-GACTCAC-----------------------GGCC-----AGC--------------------------------------------AATTCGAGTTCAAT-GGCTGAG-GTGAGGCACCT-----CGC-------------------GGGCCTCAT-----------AACCCAATTCANNNNNNNNNNNNNNNNNNNNNNN-----']
    
    # t = MSA.MSA(sequences, True)
    # t.get_matches()
    # pp(t.matches[-100:])
    t.__class__ = MSA.MSA
    print t.combined_odds()
    for p1, p2 in t.structure.chosen_pairs():
        print p1, p2

"""

Notes about the program:

1.  The total probability for a given pair seems to be lower than it
    ought to be.  I need to look carefully at what's going on there.

2.  Need a more graceful way of not counting dashes, both when
    partitioning the MSA and when scoring the longest run between a
    given pair of sequences.

"""
