class Flat_file_loop:

    '''Simple base class for looping over plain files and fasta files.

    A subclass must provide a read() method which loads the next
    record into self.name and self.sequence and returns a true value,
    or returns a false value once no further record is available.

    Looping works through the sequence protocol: "for name, seq in
    loop" calls __getitem__ with 0, 1, 2, ... until it raises
    IndexError.
    '''

    def __init__(self, filename=None, report_period=None, fileobj=None):

        '''Open filename for reading, or use fileobj if one is given.

        filename      -- path of the file to read; only used when
                         fileobj is None.
        report_period -- if true, the running sequence count is printed
                         every report_period records.
        fileobj       -- an already opened file-like object; takes
                         precedence over filename.
        '''

        if fileobj is None:
            self.file = open(filename)
        else:
            self.file = fileobj

        # Index of the record read most recently; -1 until the first
        # read() of a subclass has incremented it.
        self.sequence_count = -1

        # How often to print how many sequences have been read in.
        self.report_period = report_period

    def __getitem__(self, index):

        '''Return the next (name, sequence) pair.

        The index is ignored: records are always handed out in file
        order.  Raises IndexError when read() reports that nothing is
        left, which is what terminates a for loop.
        '''

        if not self.read():
            # Call syntax instead of "raise IndexError, '...'": it
            # means the same but also parses on Python 3.
            raise IndexError('End of plain file')
        if self.report_period and \
           (not (self.sequence_count % self.report_period)):
            # Parenthesised single argument: prints the same text
            # whether print is a statement or a function.
            print(self.sequence_count)
        return self.name, self.sequence

    def tell(self):

        '''Return the current position of the underlying file.'''

        return self.file.tell()


class Plain_file(Flat_file_loop):

    '''Simple class to facilitate reading files of the form
    [<<title>
      <sequence>
      <empty line>>
     ...]

     Can be used in the form

     for title, seq in Plain_file(filename):
         pass

     Looping stops at the end of the file and also at the first entry
     whose sequence line is empty, because read() returns the sequence
     itself as its success value.
     '''

    def read(self):

        '''Read the next entry into self.name and self.sequence.

        Returns the sequence, or None at the end of the file (in which
        case self.sequence is set to the empty string).  Raises
        AssertionError if the entry is not followed by a blank line.
        '''

        self.name = self.file.readline()
        if not self.name: # Indicates end of file
            self.sequence = ''
            return
        # Drop the trailing newline of the title and sequence lines.
        self.name = self.name[:-1]
        self.sequence = self.file.readline()[:-1]
        self.sequence_count = self.sequence_count + 1

        # The separator must be consumed unconditionally.  Reading it
        # inside an assert statement would skip the readline() under
        # "python -O" and so misparse every following entry.
        separator = self.file.readline()
        if separator != '\n':
            raise AssertionError('Entry failed to end with blank line.')
        return self.sequence # Empty if the sequence line was empty.
    
class Fasta_file(Flat_file_loop):

    '''Loop over the records of a fasta file.

    The first line of the file and every later line starting with '>'
    is taken as a header; all other lines are sequence data and are
    joined after stripping surrounding whitespace.

    If break_size is given, a record is handed out in several chunks:
    a chunk is closed as soon as its length exceeds break_size, and
    every chunk carries the header of the record it was cut from.

    Can be used in the form

    for name, seq in Fasta_file(filename):
        pass

    Looping stops at the end of the file and also at the first record
    without any sequence data, because read() returns the sequence
    itself as its success value.
    '''

    def __init__(self, filename=None,report_period=None,fileobj=None,
                 break_size=None):

        '''Open the file and read ahead the header of the first record.

        filename, report_period and fileobj are passed on to
        Flat_file_loop.  break_size is the chunk length threshold, or
        None to return whole records.
        '''

        Flat_file_loop.__init__(self, filename, report_period,fileobj)
        header = self.file.readline()
        self.next_name = header.strip()
        self.break_size = break_size

        # Length of the header line that has been read ahead but whose
        # record has not been returned yet; tell() subtracts it.  Zero
        # when the last line consumed was sequence data or end of file.
        self._lookahead_length = len(header)

        # True while the record named by self.next_name has been cut
        # by break_size and its remainder is still to be read.
        self._record_split = False

    def readall(self):

        '''Read the next record (or chunk) and return (name, sequence).

        Also stores the pair in self.name and self.sequence and
        increments self.sequence_count.  Returns (None, None) when the
        end of the file is reached without any sequence data.
        '''

        # Whatever is read now belongs to the header seen most
        # recently.  Deciding this per call, rather than only when
        # self.name is still unset, keeps the chunks of the second and
        # later records from being labelled with a stale header.
        name = self.next_name
        pieces = []
        total_length = 0

        # readline() rather than iterating over the file: iteration
        # reads ahead, which makes file.tell() unusable.
        while True:
            line = self.file.readline()
            if not line: # Reached end of file.
                self._lookahead_length = 0
                self._record_split = False
                if not pieces:
                    return None, None
                break
            if line.startswith('>'):
                resuming = self._record_split
                self._record_split = False
                self._lookahead_length = len(line)
                self.next_name = line.strip()
                if resuming and not pieces:
                    # The previous chunk ended exactly on the last
                    # line of its record.  Returning the empty
                    # remainder would end the loop early, so move
                    # straight on to the record that starts here.
                    name = self.next_name
                    continue
                break
            self._lookahead_length = 0
            piece = line.strip()
            pieces.append(piece)
            total_length += len(piece)
            if (self.break_size is not None) and \
               total_length > self.break_size:
                self._record_split = True
                break

        self.name = name
        self.sequence = ''.join(pieces)
        self.sequence_count += 1
        return self.name, self.sequence

    def read(self):

        '''Read the next record and return only its sequence.'''

        return self.readall()[1]

    def tell(self):

        '''Return the file position of the data that is returned next.

        When a header has been read ahead this is the start of that
        header line, otherwise the current file position.

        NOTE(review): the header length is counted with len(), which
        matches the units of file.tell() for byte strings and for
        single-byte text with '\\n' line ends -- confirm for other
        encodings or newline conventions.
        '''

        return self.file.tell() - self._lookahead_length

def fasta_print(output, name, sequence, line_width=60):

    '''Write one record to output in fasta format.

    output     -- object with write() and flush() methods.
    name       -- header line without the newline; must start with
                  '>'.
    sequence   -- the sequence, written in lines of at most
                  line_width characters.  An empty sequence produces
                  the header line only.
    line_width -- maximum number of sequence characters per line;
                  defaults to 60.

    Raises ValueError if line_width is smaller than 1.
    '''

    assert name.startswith('>')
    if line_width < 1:
        # A zero step makes range() fail and a negative one would
        # silently drop the whole sequence.
        raise ValueError('line_width must be at least 1, got %r'
                         % (line_width,))
    output.write(name+'\n')
    for start_idx in range(0, len(sequence), line_width):
        output.write(sequence[start_idx:start_idx+line_width] + '\n')
    output.flush()

def older_file(p1, p2):

    '''Return whichever of the paths p1 and p2 was modified least
    recently.

    p1 is returned when both modification times are equal.  Errors
    from os.path.getmtime (for instance for a missing file) are passed
    on to the caller.
    '''

    # Imported here so that the function does not depend on what the
    # rest of the module happens to import.
    import os
    if os.path.getmtime(p1) <= os.path.getmtime(p2):
        return p1
    else:
        return p2

class Plain_file_lookup:

    '''Stores offsets into a plain file so that the sequences can be
    quickly accessed.

    The offsets are kept in a separate file, which is rebuilt when it
    is missing or older than the plain file.  lookup[name] returns the
    sequence of the entry called name.

    NOTE(review): the offsets are keyed by hash(name).  The hash of a
    string is not guaranteed to be the same across interpreter
    versions and builds (and is randomised per process on Python 3),
    so a stored offsets file may not be reusable everywhere, and two
    names with the same hash overwrite each other -- confirm that this
    is acceptable.
    '''

    def __init__(self,

                 # Name of the plain file that is to be indexed.
                 filename,

                 # Name of the file in which the offsets into the
                 # plain file are stored.
                 offsets_filename=None):

        fn = self.filename = filename
        self.file = open(fn)
        if offsets_filename is None:
            offsets_filename = fn + '.offsets'

        of = offsets_filename
        # Rebuild the offsets if they are missing or were written
        # before the plain file was last modified.
        if (not os.path.exists(of)) or (older_file(fn, of) == of):
            self._get_offsets(of)

        offsets_file = open(of)
        try:
            self.load_offsets(offsets_file)
        finally:
            offsets_file.close()

    def _get_offsets(self, offsets_filename):

        '''Run over the plain file and get the offsets to each entry
        explicitly, then store them in offsets_filename.'''

        self.offsets = {}
        plain_file = Plain_file(self.filename, 10000)
        try:
            offset = 0
            for name, sequence in plain_file:
                self.offsets[hash(name)] = offset
                # The position after an entry is the offset of the
                # entry that follows it.
                offset = plain_file.tell()
        finally:
            plain_file.file.close()

        # Close explicitly so that the data is on disk before the
        # offsets file is opened again for reading.
        offsets_file = open(offsets_filename, 'w')
        try:
            dump(self.offsets, offsets_file)
        finally:
            offsets_file.close()

    def load_offsets(self, offsets_filename):

        '''Load self.offsets from an offsets file.

        offsets_filename may be an open file object, as passed by
        __init__, or the path of the offsets file.
        '''

        if hasattr(offsets_filename, 'read'):
            self.offsets = load(offsets_filename)
            return
        offsets_file = open(offsets_filename)
        try:
            self.offsets = load(offsets_file)
        finally:
            offsets_file.close()

    def __getitem__(self, name):

        '''Return the sequence of the entry called name.

        Raises KeyError if no offset is stored for name, and
        AssertionError if the entry found at the stored offset has a
        different name or is not followed by a blank line.
        '''

        offset = self.offsets[hash(name)]
        self.file.seek(offset)
        current_name = self.file.readline()[:-1]
        # Checked with explicit raises rather than assert statements
        # so that a stale offsets file is also detected under
        # "python -O".
        if name != current_name:
            raise AssertionError('Inaccurate offset: %s for entry %s'
                                 % (offset, name))
        sequence = self.file.readline()[:-1]
        if self.file.readline() != '\n':
            raise AssertionError('Entry failed to end with blank line.')
        return sequence

    def close(self):

        '''Close the plain file; lookups are not possible afterwards.'''

        self.file.close()

def test_Plain_file_lookup():

    '''Look up the first, a middle and the final entry of the masked
    plain file and compare each sequence with its known md5 digest.'''

    from mouse.config import DATA_DIR
    import md5

    # Pairs of (entry name, md5 digest of the corresp. sequence).
    expected = [

        # The first entry
        ('>AB000381_[[(28198,28271),(28880,28988),(34290,34586)]]_'
         'GPI-anchored_molecule-like_protein;_GML.',
         '\037\330\261\220\366,v`*{Z\223\276{AW'),

        # An entry somewhere in the middle
        ('>M65101_[[(10,56),(142,513)]]_V-region;_immunoglobulin_'
         'heavy_chain_subgroup_VH-I;_rearranged_DNA.',
         '\035\\\255\277\023p\203#\360\\\363\345A\005\236\307'),

        # The final entry.
        ('>S81777_[[(41,209),(1691,1868)]]_.',
         '/i\272z\314\207\360\244\303w\264\033\275\211<\231'),
        ]

    path = os.path.join(DATA_DIR, 'gbpri.masked.plain')
    lookup = Plain_file_lookup(path)
    for entry_name, digest in expected:
        found = lookup[entry_name]
        assert md5.new(found).digest() == digest

class IndentedOutput:

    '''Write-only object that passes text on to sys.stdout with an
    indentation in front.

    Text is collected until a write() contains a newline; the
    collected text is then written out in one piece, preceded by the
    indentation.  Only the start of each such piece is indented:
    further newlines inside a single write() are passed on unchanged.
    '''

    def __init__(self, indentation=4):

        '''indentation is the number of spaces put in front of each
        piece of output.'''

        # Use the value asked for instead of a fixed 4.
        self.indentation = indentation
        self.clear_output()

    def clear_output(self):

        '''Start a new piece of output holding only the indentation.'''

        self.current_output = [self.indentation*' ']

    def write(self, output):

        '''Collect output; write everything collected to sys.stdout as
        soon as output contains a newline.'''

        self.current_output.append(output)
        if '\n' in output:
            sys.stdout.write(''.join(self.current_output))
            self.clear_output()

if __name__ == '__main__':
    # Ad-hoc check when run as a script: print every (name, sequence)
    # pair of a sample fasta file.
    # NOTE(review): tools.Sequences.files is presumably this module
    # under its package name, reloaded to pick up edits -- confirm.
    from tools.Sequences import files
    reload(files)
    fasta = files.Fasta_file('/home/alex/data/ncRNA/control.fa')
    for record in fasta:
        print(record)
