"""Parse a clustalw file"""

import cStringIO
import glob
import os
import random
import re
import shutil
import tempfile
import time

from tools.Sequences.files import fasta_print

# Command name of the ClustalW executable; get_MSA() substitutes it into
# the shell command line it runs.
binary = 'clustalw'
                            
def parse_clustalw_list(cfile=None, filestring=None):

    """Parse a ClustalW alignment into an ordered list of sequences.

    Input is either `cfile`, an open file-like object positioned at the
    start of the alignment, or `filestring`, the alignment text itself
    (when `filestring` is given, `cfile` must be None).

    Returns [(sequence_name, concatenated_alignment_sequence), ...] in
    the order the names first appear in the file.  An alignment without
    any sequence lines yields [].

    Raises AssertionError if both inputs are supplied, if the first line
    does not start with 'CLUSTAL W', or if the assembled sequences do
    not all have the same length.  A sequence line that is not exactly
    two whitespace-separated fields raises ValueError.
    """

    if filestring is not None:
        assert cfile is None, 'pass either cfile or filestring, not both'
        # Imported here with a fallback so the parser also works on
        # interpreters that do not ship cStringIO.
        try:
            from cStringIO import StringIO
        except ImportError:
            from io import StringIO
        cfile = StringIO(filestring)

    firstline = cfile.readline()
    assert firstline.startswith('CLUSTAL W'), 'not a ClustalW alignment'

    # Lines made only of ' ', '*', ':' and '.' are conservation lines;
    # blank lines are the empty case of the same pattern.
    non_sequence = re.compile(r'[ *:.]*$')

    order = []    # names in order of first appearance
    chunks = {}   # name -> list of per-block sequence fragments
    for line in cfile:
        line = line.strip()
        if non_sequence.match(line):
            continue
        name, fragment = line.split()
        if name not in chunks:
            order.append(name)
            chunks[name] = []
        chunks[name].append(fragment)

    rv = [(name, ''.join(chunks[name])) for name in order]

    # Every row of an alignment must span the same number of columns.
    if rv:
        length = len(rv[0][1])
        for name, sequence in rv:
            assert len(sequence) == length, \
                'sequence %r has length %d, expected %d' % (
                    name, len(sequence), length)

    return rv
    

def parse_clustalw(cfile=None, filestring=None):

    """Parse a ClustalW alignment into a dictionary.

    Takes the same `cfile` / `filestring` inputs as parse_clustalw_list
    and returns its (name, sequence) pairs as
    {sequence_name: concatenated alignment sequence}.
    """

    pairs = parse_clustalw_list(cfile, filestring)
    return dict(pairs)

def get_MSA(seqs):

    """Align sequences with ClustalW and return the parsed alignment.

    `seqs` is a {name: sequence} dictionary.  The sequences are written
    as FASTA into a private temporary directory, the `binary` command is
    run on that file, and its .aln output is parsed with
    parse_clustalw_list, whose return value is returned unchanged.

    Raises AssertionError if the ClustalW command exits with a non-zero
    status or, on Athena, if the output file cannot be identified
    uniquely.
    """

    # Work in a directory of our own: no name races with other processes,
    # and cleanup cannot touch anybody else's files (in particular other
    # runs' .dnd guide trees).
    workdir = tempfile.mkdtemp()
    try:
        fastapath = os.path.join(workdir, 'input.fa')
        fastafile = open(fastapath, 'w')
        try:
            for name, seq in seqs.items():
                if not name.startswith('>'):
                    name = '>' + name
                fasta_print(fastafile, name, seq)
        finally:
            fastafile.close()

        cmd = '%s -infile=%s -outfile=%s.aln > /dev/null'
        cmd %= binary, fastapath, fastapath
        assert not os.system(cmd), 'command failed: %s' % cmd

        if os.environ.get('ATHENA_SYS'):

            # On  Athena, ClustalW  is appending  weird characters  to the
            # ends of the requested output filenames.
            output_names = glob.glob(fastapath + '.aln*')
            assert len(output_names) == 1
            output_path = output_names.pop()
        else:
            output_path = fastapath + '.aln'

        alnfile = open(output_path)
        try:
            return parse_clustalw_list(alnfile)
        finally:
            alnfile.close()
    finally:
        # Removes the FASTA input, the alignment and any guide tree
        # written next to them, whether or not the run succeeded.
        shutil.rmtree(workdir, ignore_errors=True)

def test_get_MSA():

    """Smoke-test get_MSA: align ten random sequences and print the result.

    Builds ten sequences of 200 random ASCII letters keyed by random
    20-letter names, reloads this module through its package path, runs
    get_MSA on them and pretty-prints the returned alignment.
    """

    import string, random
    from pprint import pprint

    # ascii_letters rather than the locale-dependent string.letters, so
    # the generated names and residues are always plain ASCII.
    alphabet = string.ascii_letters
    seqs = {}
    for i in range(10):
        name = ''.join([random.choice(alphabet) for dum in range(20)])
        seq = ''.join([random.choice(alphabet) for dum in range(200)])
        seqs[name] = seq
    from tools.Alignment import clustalw
    reload(clustalw)
    pprint(clustalw.get_MSA(seqs))
    
def make_ClustalW_file(seqs):

    """Render aligned sequences as the text of a ClustalW alignment file.

    `seqs` is a non-empty [(name, seq), ...] list of presumably
    previously aligned sequences, all of the same length.  Returns, as
    a string, the ClustalW output they would have come from: a header
    line followed by blocks of at most 60 columns, each block ending
    with a conservation line that carries '*' under every column in
    which all sequences agree.

    Raises AssertionError if the sequences differ in length or if the
    generated text does not parse back (via parse_clustalw_list) to the
    input pairs; raises IndexError if `seqs` is empty.
    """

    seqlen = len(seqs[0][1])
    # Compare the lengths of the whole sequences (not of one character of
    # each), so ragged input is rejected before any text is produced.
    assert len(set(len(s) for n, s in seqs)) == 1, \
        'sequences are not all the same length'

    rv = ['CLUSTAL W (1.82) multiple sequence alignment', '']
    for startidx in range(0, seqlen, 60):
        stopidx = min(seqlen, startidx + 60)
        for name, seq in seqs:
            rv.append('%-18s%s' % (name, seq[startidx:startidx+60]))
        match_string = []
        for posidx in range(startidx, stopidx):
            if len(set(s[posidx] for n, s in seqs)) == 1:
                match_string.append('*')
            else:
                match_string.append(' ')
        rv.extend([18*' ' + ''.join(match_string), ''])
    rv = '\n'.join(rv)

    # Round-trip check; pairs are normalised to tuples because that is
    # what the parser returns.
    assert [tuple(pair) for pair in seqs] == parse_clustalw_list(filestring=rv)
    return rv
        
if __name__ == '__main__':

    # Developer scratch test: pick a stored alignment at random, write it
    # out as a ClustalW profile, and profile-align one further sequence
    # from the same family against it.

    from tools.Alignment import clustalw
    reload(clustalw)

    # The stored alignments are files whose names begin with '['.
    files = glob.glob('/home/alex_c/data/MSAs/[*')
    if not files:
        raise SystemExit('no alignment files found in /home/alex_c/data/MSAs')
    path = random.choice(files)

    # NOTE(review): eval() executes whatever the file and its name
    # contain; only run this against files you created yourself.
    msafile = open(path)
    try:
        seqs = eval(msafile.readline())
    finally:
        msafile.close()
    basename = os.path.basename(path)
    # The file name is the list of sequence names with spaces written
    # as underscores.
    names = eval(re.sub('_', ' ', basename))

    from RNA.data.sequences import srp, rnase
    if names[0] in rnase.sequences:
        targets = rnase.sequences.copy()
    else:
        assert names[0] in srp.sequences['Eukar.']
        targets = srp.sequences['Eukar.'].copy()
    # Only sequences that are not already in the alignment are candidates.
    for name in names:
        del targets[name]
    targetname, target = random.choice(list(targets.items()))
    # Strip gaps, upper-case, and replace U by T.
    target = re.sub('U', 'T', re.sub('-', '', target).upper())

    profile = open('/var/tmp/profile.gde', 'w')
    try:
        profile.write(clustalw.make_ClustalW_file(list(zip(names, seqs))))
    finally:
        profile.close()
    fastafile = open('/var/tmp/fug.fa', 'w')
    try:
        fasta_print(fastafile, '>'+targetname, target)
    finally:
        fastafile.close()

    outpath = '/var/tmp/tst.aln'
    cmd = 'clustalw -profile1=%s -profile2=%s -outfile=%s'
    cmd %= profile.name, fastafile.name, outpath
    print(os.system(cmd))
    result = open(outpath)
    try:
        print(result.read())
    finally:
        result.close()
    
