"""Retrieve  the base-pair  probabilities for  a sequence  from Vienna
RNAFold."""

import os
import re
import tempfile
import time

def pair_probs(sequence):

    """Retrieve the base-pair probabilities for a sequence from Vienna
    RNAfold.

    The sequence may contain alignment gaps ('-').  Gaps are removed
    before the sequence is handed to RNAfold, and every position in
    the result is an index into the original, gapped sequence.

    Returns a list of lists, one for each position in the sequence
    passed, each containing (probability, position) tuples,
    representing the probability of the corresponding position pairing
    with the position in the tuple.  These lists are sorted by
    decreasing probability.  A sequence without any nucleotides yields
    a list of empty lists and RNAfold is not run.

    Raises SystemError if the dot-plot file has not appeared about one
    second after the RNAfold process has finished.

    Side effects: creates the directory /var/tmp/RNAfold_output if it
    is missing and replaces the file /var/tmp/RNA_dp.ps.  The path is
    fixed, so concurrent callers would share the same file."""

    # First strip out alignment dashes, but record which positions in
    # the original sequence the nucleotides correspond to.
    positions, raw_seq = [], []
    for seqidx, nuke in enumerate(sequence):
        if nuke != '-':
            positions.append(seqidx)
            raw_seq.append(nuke)

    rv = [[] for dummy in range(len(sequence))]
    if not raw_seq:
        # Nothing to fold, so nothing can pair; skip the external call.
        return rv

    # NOTE(review): this directory is created but never written to by
    # the code below; its name is only used as the sequence label sent
    # to RNAfold.  Kept for backward compatibility.
    output_file_base = '/var/tmp/RNAfold_output' # tempfile.mktemp()
    if not os.path.isdir(output_file_base):
        os.mkdir(output_file_base)

    # ...for some reason, RNAfold truncates filenames like so:
    dest_file = output_file_base[:12] + '_dp.ps'

    # The output path is the same on every call, so drop any dot plot
    # left by an earlier run.  Otherwise a failed RNAfold invocation
    # would go unnoticed and the previous sequence's results would be
    # parsed instead.
    if os.path.exists(dest_file):
        os.remove(dest_file)

    # Now send the stripped sequence to RNAfold...
    viennabin = 'RNAfold'
    viennaproc = os.popen(viennabin + ' -p > /dev/null', 'w')
    try:
        viennaproc.write('> %s\n%s\n'
                         % (output_file_base, ''.join(raw_seq)))
    finally:
        viennaproc.close()

    for dummy in range(100):
        if os.path.exists(dest_file):
            break
        time.sleep(0.01)
    else:
        # ...file hasn't been created in 1 s.  Something's wrong...
        raise SystemError('RNAfold took too long.')

    # ...the lines with probabilities all have two integers, a float
    # probability and "ubox".  A leading 1 is accepted as well as a
    # leading 0 so that a value printed as 1.000... is not dropped.
    # NOTE(review): the ViennaRNA dot-plot format is documented as
    # storing the square root of the pair probability in these lines;
    # confirm whether callers expect p or sqrt(p).
    is_ubox_line = re.compile(r'\d+ \d+ [01]\.\d+ ubox$').match
    with open(dest_file) as dot_plot:
        results = [line for line in dot_plot if is_ubox_line(line)]

    for line in results:
        n1, n2, p = line.split()[:3]

        # ...translate the 1-based positions in the stripped sequence
        # back to 0-based positions in the aligned sequence.
        n1, n2 = positions[int(n1)-1], positions[int(n2)-1]
        p = float(p)
        rv[n1].append((p, n2))
        rv[n2].append((p, n1))
    for pairs in rv:
        pairs.sort(reverse=True)
    return rv

def good_windows(sequence, winlength, min_prob=0.05):

    """Return starts of windows of length winlength containing
    high-probability basepairs.

    sequence is passed to pair_probs() as is.  For every pairing
    (seqidx, complement) whose probability is at least min_prob, and
    for every offset in range(-winlength, winlength), the tuple
    (seqidx + offset, complement - offset) is recorded, with both
    values clamped to the range 0 .. len(sequence) - 1.

    Returns the distinct (start1, start2) tuples as a sorted list."""

    prob_pairs = pair_probs(sequence)
    maxpos = len(sequence) - 1
    windows = set()
    for seqidx, probs in enumerate(prob_pairs):
        for prob, complement in probs:
            if prob < min_prob:
                # probs is sorted by decreasing probability, so every
                # remaining entry is below the cutoff too.
                break
            for offset in range(-winlength, winlength):
                start1 = min(max(0, seqidx     + offset), maxpos)
                start2 = min(max(0, complement - offset), maxpos)
                windows.add((start1, start2))
    return sorted(windows)

def distinct_prob_pairings(sequence, winlength, pthrsh):

    """Greedily choose a set of pairings whose neighborhoods do not
    overlap.

    Positions are scanned in sequence order and, within a position,
    pairings are visited by decreasing probability.  A pairing
    (pos1, pos2) with pos1 < pos2 is accepted when its probability is
    at least pthrsh and it is not in the neighborhood of an already
    accepted pairing.  Accepting a pairing marks
    (pos1 + offset, pos2 - offset) and its mirror image as neighbors
    for every offset in range(-winlength // 3, winlength // 3).

    Returns a set of (probability, (pos1, pos2)) tuples."""

    chosen_pairs = set()
    pair_neighbors = set()

    # Floor division keeps the integer offsets that "/" produced under
    # Python 2, e.g. range(-4, 3) for winlength == 10.
    offsets = range(-winlength // 3, winlength // 3)

    for pos1, pairs in enumerate(pair_probs(sequence)):
        for prob, pos2 in pairs:
            if prob < pthrsh:
                # pairs is sorted by decreasing probability, so the
                # rest of this list is below the threshold as well.
                break
            if pos1 >= pos2:
                # pair_probs() reports every pairing from both ends;
                # the mirrored entry was already handled when its
                # lower position was visited.  Skipping it explicitly
                # (rather than asserting) also covers winlength < 3,
                # where offset 0 is not part of the neighborhood.
                continue
            if (pos1, pos2) in pair_neighbors:
                continue
            chosen_pairs.add((prob, (pos1, pos2)))
            for offset in offsets:
                pair_neighbors.add((pos1+offset, pos2-offset))
                pair_neighbors.add((pos2-offset, pos1+offset))
    return chosen_pairs

def distinct_pairings(sequence, winlength, pthrsh):

    """Return the pairings chosen by distinct_prob_pairings() without
    their probabilities.

    Every element produced by distinct_prob_pairings() is indexed
    with [1], which is its (pos1, pos2) part.  The result is a list in
    the iteration order of that collection."""

    pairings = []
    for scored in distinct_prob_pairings(sequence, winlength, pthrsh):
        pairings.append(scored[1])
    return pairings

if __name__ == '__main__':
    # Ad-hoc smoke test: run good_windows() on a gapped alignment row
    # and print how many windows were found.  Requires the RNA package
    # to be importable and RNAfold to be installed.
    from RNA.structure import ViennaOdds

    # reload() is a builtin on Python 2 only; fall back to importlib so
    # the module also parses and runs on Python 3.
    try:
        reload_module = reload
    except NameError:
        from importlib import reload as reload_module
    reload_module(ViennaOdds)
    t = ViennaOdds.good_windows('---------------------------------------------------------------------------------------------------------------------------------------------------------------------G-C-CCUG-CGGCGGGAC---------------------------------------------------------------aGGGUg-aacu-CCCccaggCCCgaa--aGGGagcaaGGGuaaGC-CCgccGUCCC-GUgC-G--CAGGGU--------------------------------------------------------------------------------------------------------'
                                , 10)
    print(len(t))
    

 
