// This is a C++ code file

/*** COMMENTS and improvements added by Eric Banks -- Feb '97 ***/

#include <assert.h>
#include <ctype.h>
#include <string.h>
#include <String.h>
#include <stdio.h>
#include <stdlib.h>
#include "gseq.h"


// GeneSequence implementation

// Stream constructor: intended to set the object up like the default
// constructor does and then read one record from the given stream via parse().
//
// NOTE(review): the first statement below does not look like it initialises
// *this* object.  Depending on the compiler it either builds and immediately
// discards a temporary GeneSequence or is rejected outright, so the members
// may still be uninitialised when parse() runs.  Fixing this needs either a
// shared init helper declared in the header or a delegating constructor --
// confirm and address together with gseq.h.
GeneSequence::GeneSequence(istream& cin) {
  GeneSequence::GeneSequence();
  parse(cin);
}

// Default constructor: puts the object into an empty state in which no
// storage is owned.  Working storage is allocated later by initialize().
//
// Every counter and pointer that destroy() looks at is given a defined value
// here, because the destructor calls destroy() even when initialize() was
// never run.
GeneSequence::GeneSequence() {

  // region / marker tables: empty, nothing allocated
  region_num = region_max = 0;
  marker_num = marker_max = 0;
  regions = NULL;
  markers = NULL;

  // CDS table and translated sequence buffer
  CDS = NULL;
  CDSsequence = NULL;
  CDSnumber = 0;
  CDSmax = 0;

  // mRNA and allele tables (previously left uninitialised, although
  // destroy() walks and frees them)
  mRNA = NULL;
  mRNAinexact = NULL;
  allele = NULL;
  maxmRNA = maxallele = 0;

  // parser state flags
  Annotated_repeats = FALSE;
  extra_exon_line = warningflag = joinflag = cdsflag = cds_seqflag = 
    extra_cds_flag = FALSE;

  mRNAnumber = allelenumber = cds_seqlength = cds_seqmemory = 0;
  last_exon = 0;
  promoter[0] = promoter[1] = 0;

  // define our main keywords; the position in this table is the value
  // find_index() hands back for a matching feature key
  mainkeywords[0] = "exon           ";
  mainkeywords[1] = "intron         ";
  mainkeywords[2] = "source         ";
  mainkeywords[3] = "CDS            ";
  mainkeywords[4] = "mRNA           ";
  mainkeywords[5] = "variation      ";
  mainkeywords[6] = "TATA_signal    ";
  mainkeywords[7] = "polyA_signal   ";
  mainkeywords[8] = "polyA_site     ";
  mainkeywords[9] = "prim_transcript";
  mainkeywords[10] = "allele         ";
  mainkeywords[11] = "repeat_region  ";
  mainkeywords[12] = "promoter       ";
}

// Destructor: all cleanup is delegated to destroy().
GeneSequence::~GeneSequence()
{
  this->destroy();
}

void GeneSequence::initialize() {
  
  Sequence::initialize();
    
  region_num = 0;
  region_max = 500;
  regions = new (Region*)[region_max];
  
  marker_num = 0;
  marker_max = 50;
  markers = new (Marker*)[marker_max];

  Annotated_repeats = FALSE;
  extra_exon_line = warningflag = joinflag = cdsflag = cds_seqflag = 
    extra_cds_flag = FALSE;

  mRNAnumber = CDSnumber = allelenumber = 0;
  CDSmax = maxmRNA = maxallele = 500;

  prim_transcript[0] = prim_transcript[1] = 0;
  ptranscript_inexact[0] = ptranscript_inexact[1] = 0;
  polyA_site[0] = polyA_site[1] = 0;
  polyA_signal[0] = polyA_signal[1] = 0;
  TATA_signal[0] = TATA_signal[1] = 0;
  promoter[0] = promoter[1] = 0;
  isource = 0;
  osource = "";  csource = "";  CDSproduct = "";

  last_exon = 0;
  cds_seqmemory = 500;
  cds_seqlength = 0;
  CDS = new (ValuePair*)[CDSmax];
  CDSsequence = new char[cds_seqmemory];    
  mRNA = new (ValuePair*)[maxmRNA];
  mRNAinexact = new (ValuePair*)[maxmRNA];
  allele = new (ValuePair*)[maxallele];
}

// Release everything allocated by initialize() and by parsing, after letting
// the base class release its own storage.  All counters are reset and all
// table pointers are set to NULL, so calling destroy() twice is harmless.
//
// Arrays obtained with new[] are released with delete[]; element loops only
// run when the owning array pointer is non-NULL.
void GeneSequence::destroy() {

  Sequence::destroy();

  // regions
  if (regions) {
    for (int i = 0; i < region_num; i++)
      delete regions[i];
    delete[] regions;
  }
  region_num = 0;
  regions = NULL;
  
  // markers
  if (markers) {
    for (int i = 0; i < marker_num; i++)
      delete markers[i];
    delete[] markers;
  }
  marker_num = 0;
  markers = NULL;
  
  // CDS coordinate pairs
  if (CDS) {
    for (int i = 0; i < CDSnumber; i++)
      delete CDS[i];
    delete[] CDS;
  }
  CDSnumber = 0;   CDSmax = 500;
  CDS = NULL;
  
  // CDS translation buffer; the pointer is cleared so that a later call
  // does not free it again
  if (CDSsequence)
    delete[] CDSsequence;
  CDSsequence = NULL;
  cds_seqmemory = 0;
  cds_seqlength = 0;
  
  // mRNA pairs and their parallel "inexact" pairs
  if (mRNA) {
    for (int i = 0; i < mRNAnumber; i++)
      delete mRNA[i];
    delete[] mRNA;
  }
  if (mRNAinexact) {
    for (int i = 0; i < mRNAnumber; i++)
      delete mRNAinexact[i];
    delete[] mRNAinexact;
  }
  mRNAnumber = 0; maxmRNA = 500;
  mRNA = NULL;  mRNAinexact = NULL; 
  
  // alleles
  if (allele) {
    for (int i = 0; i < allelenumber; i++)
      delete allele[i];
    delete[] allele;
  }
  allelenumber = 0; maxallele = 500;
  allele = NULL;
}

// First coordinate recorded for the promoter feature.
// The index argument is accepted but not consulted.
int GeneSequence::promoter_start(int i) const
{
  return promoter[0];
}

// Second coordinate recorded for the promoter feature.
// The index argument is accepted but not consulted.
int GeneSequence::promoter_stop(int i) const
{
  return promoter[1];
}

// Inclusive length of the promoter span (stop - start + 1).
// The index argument is accepted but not consulted.
int GeneSequence::promoter_len(int i) const
{
  // both end points count, hence the extra one
  return 1 + (promoter[1] - promoter[0]);
}

// First coordinate recorded for the TATA_signal feature.
int GeneSequence::TATA_start() const
{
  return TATA_signal[0];
}

// Second coordinate recorded for the TATA_signal feature.
int GeneSequence::TATA_stop() const
{
  return TATA_signal[1];
}

// Number of CDS coordinate pairs currently stored.
int GeneSequence::no_CDS() const
{
  return CDSnumber;
}

// Start coordinate (val1) of the i-th CDS pair, using a 0-based index.
// Returns 0 when i is outside [0, CDSnumber); negative indices are now
// rejected as well instead of indexing in front of the table.
int GeneSequence::CDS_start(int i) const
{
  if (i < 0 || i >= CDSnumber) return 0;
  return CDS[i]->val1;
}

// Stop coordinate (val2) of the i-th CDS pair, using a 0-based index.
// Returns 0 when i is outside [0, CDSnumber); negative indices are now
// rejected as well instead of indexing in front of the table.
int GeneSequence::CDS_stop(int i) const
{
  if (i < 0 || i >= CDSnumber) return 0;
  return CDS[i]->val2;
}

// Number of regions currently held in the region table.
int GeneSequence::get_region_num() const
{
  return region_num;
}

// Fetch a region by 1-based position; asserts that the index is in range.
Region* GeneSequence::get_region(int index)
{
  assert(index > 0 && index <= region_num);

  // translate the 1-based index into the 0-based table slot
  Region **slot = regions + (index - 1);
  return *slot;
}


// Return the lowest-indexed region whose type is REGION_CEXON,
// or NULL when the table holds none.
Region* GeneSequence::firstCExon(void)
{
  const int total = get_region_num();
  int idx = 0;

  while (idx < total) {
    Region *candidate = regions[idx];
    if (candidate->type == REGION_CEXON)
      return candidate;
    ++idx;
  }

  return NULL;
}


// Return the highest-indexed region whose type is REGION_CEXON,
// or NULL when the table holds none.
Region* GeneSequence::lastCExon(void)
{
  int idx = get_region_num();

  // walk the table from the back
  while (idx > 0) {
    --idx;
    Region *candidate = regions[idx];
    if (candidate->type == REGION_CEXON)
      return candidate;
  }

  return NULL;
}


// Map a sequence position to the first region in table order whose
// [start, stop] interval contains it; NULL if no region covers pos.
Region* GeneSequence::nuc2reg(long pos)
{
  const int total = get_region_num();

  for (int idx = 0; idx < total; ++idx) {
    Region *candidate = regions[idx];

    // skip regions that lie entirely after or entirely before pos
    if (candidate->start > pos)
      continue;
    if (candidate->stop < pos)
      continue;

    return candidate;
  }

  return NULL;
}

// Type of the first region in table order whose [start, stop] interval
// contains pos; REGION_NOTHING when no region covers it.
RegionType GeneSequence::whatRegion(long pos)
{
  const int total = get_region_num();
  int idx = 0;

  while (idx < total) {
    Region *candidate = regions[idx];
    const bool covers = (candidate->start <= pos) && (candidate->stop >= pos);
    if (covers)
      return candidate->type;
    ++idx;
  }

  return REGION_NOTHING;
}




// Number of markers currently held in the marker table.
int GeneSequence::get_marker_num() const
{
  return marker_num;
}


// Fetch a marker by 1-based position; asserts that the index is in range.
Marker* GeneSequence::get_marker(int index)
{
  assert(index > 0 && index <= marker_num);

  // translate the 1-based index into the 0-based table slot
  Marker **slot = markers + (index - 1);
  return *slot;
}


// Return the first marker in table order that lies strictly after pos and
// whose type equals `type` (any type is accepted when `type` is MARK_ANY).
// Returns NULL when no such marker exists.
Marker* GeneSequence::get_marker_after(long pos, MarkerType type)
{
  const bool any_type = (type == MARK_ANY);

  for (int idx = 0; idx < marker_num; ++idx) {
    Marker *candidate = markers[idx];

    // not yet past the requested position
    if (!(candidate->pos > pos))
      continue;

    // past the position: accept it if the type is acceptable
    if (any_type || candidate->type == type)
      return candidate;
  }

  // nothing suitable was found
  return NULL;
}

// Create a region and place it at 1-based position `region` in the table.
// The region is first appended through add_region(), then every entry from
// that position onward is shifted up one slot to open the gap.
Region* GeneSequence::insert_region(long from, long to, RegionType type, int region) 
{
  Region *fresh = add_region(from, to, type);

  // shift entries up, starting from the slot add_region() just filled
  int slot = region_num - 1;
  while (slot >= region) {
    regions[slot] = regions[slot - 1];
    --slot;
  }

  regions[region - 1] = fresh;
  return fresh;
}

// Append a new region [from, to] of the given type to the region table and
// return it.
//   - The table is doubled when full; a table that was never allocated
//     (capacity 0) is given the same starting capacity initialize() uses,
//     since doubling zero would leave no room for the new entry.
//   - A `to` of 0 means "single position": the stop is set equal to `from`.
//   - Appending a REGION_CEXON records its slot in last_exon.
//   - REGION_NOTHING regions get frame FRAME_ANY.
Region* GeneSequence::add_region(long from, long to, RegionType type) {

  // if we can fit no more regions into our region array, create
  //  a bigger one and copy the old one over
  if (region_num == region_max) {

    int newmax = (region_max > 0) ? 2*region_max : 500;
    Region **newregs = new Region*[newmax];

    // copy the old stuff into the new array
    for (int i = 0; i < region_num; i++)
      newregs[i] = regions[i];
    delete[] regions;
    regions = newregs;
    region_max = newmax;
  }

  // remember where the most recent coding exon lives (0-based slot)
  if (type == REGION_CEXON)
    last_exon = region_num;

  // if stop = 0, then stop = start
  if (to == 0)
    to = from;

  Region* r = new Region(from, to, type);
  if (type == REGION_NOTHING)
    r->frame = FRAME_ANY;

  // store it and increment region_num in the process
  return regions[region_num++] = r;
}


void GeneSequence::set_last_exon_experimental(void) {

  regions[last_exon]->experimental = TRUE;
}


void GeneSequence::split_region(int region, RegionType first, 
				RegionType second, long location) {
  
  // copy the old regions into a new array with the named region split into 2

  int max;

  // make sure we have enough room for the new region 
  if (region_num == region_max) 
    max = 2 * region_max;
  else
    max = region_max;

  Region **newregs = new (Region*)[max];

  // copy the old stuff into the new arrays
  for (int i = 0; i < region; i++)
    newregs[i] = regions[i];

  newregs[region] = new Region(regions[region]->start, location-1, first);
  newregs[region+1] = new Region(location, regions[region]->stop, second);
  if (regions[region]->experimental == TRUE)
    newregs[region]->experimental = newregs[region+1]->experimental = TRUE;

  for (int i = region+1; i < region_num; i++)
    newregs[i+1] = regions[i];
  
  delete[] regions;
  regions = newregs;
  region_max = max;
  region_num++;
}


// Insert a marker of the given type at position pos, keeping the marker
// table ordered by position, and return it.
//   - If a marker with the same position AND type is found first, that
//     marker is returned and nothing is inserted.
//   - If a marker with the same position but a different type is found, a
//     diagnostic is written to cerr and the new marker is still inserted,
//     directly in front of the existing one.
// The table is doubled when full; a table that was never allocated
// (capacity 0) is given the starting capacity initialize() uses.
Marker* GeneSequence::add_marker(long pos, MarkerType type) {

  // if we can fit no more markers into our marker array, create
  //  a bigger one and copy the old one over
  if (marker_num == marker_max) {

    int newmax = (marker_max > 0) ? 2*marker_max : 50;
    Marker **newmarks = new Marker*[newmax];

    for (int k = 0; k < marker_num; k++)
      newmarks[k] = markers[k];
    delete[] markers;
    markers = newmarks;
    marker_max = newmax;
  }

  // find the insertion slot, checking for a marker already at pos
  int i;
  for (i = 0; i < marker_num; i++) {

    if (markers[i]->pos == pos) {

      // same position and same type: just return the existing one
      if (markers[i]->type == type)
        return markers[i];

      // same position, different type: complain, then insert here
      cerr << "Different markers at the same position in " << locus << "\n";
      break;
    }

    // no reason to keep checking once we pass our marker position
    if (markers[i]->pos > pos)
      break;
  }

  // for all markers at positions higher than the new one, push them up
  //  one level in the array to make room for the new one in the middle
  for (int j = marker_num; j > i; j--)
    markers[j] = markers[j-1];

  // increment the marker_num and create/return a new marker
  marker_num++;
  return (markers[i] = new Marker(pos, type));
}


bool GeneSequence::parse_line(const char *buf) {

  bool ret = Sequence::parse_line(buf);

  // when we're done reading in from database, assign markers
  if (!ret) {
    
    // for each of the regions assign markers for introns
    for (int i = 1; i <= get_region_num(); i++) {
      Region *reg = get_region(i);
      
      // only mark introns
      if (reg->type != REGION_INTRON)
	continue;
      
      // only care if they are exact markers
      if (!reg->is_start_inexact())
	add_marker(reg->start, MARK_ISTART);
      
      // don't put a stop marker if stop = start
      if (!reg->is_stop_inexact() && reg->stop != reg->start)
	add_marker(reg->stop, MARK_ISTOP);
    }
    
    // Add the reading frames
    add_reading_frames();
    
    // Lastly, add REGION_NOTHING for anything that hasn't been covered already.
    if (!region_num)                 insert_region(1, seqlen, REGION_NOTHING, 1);
    else if (get_region(1)->start>1) insert_region(1,get_region(1)->start-1,REGION_NOTHING,1);
    
    for (int i=2;i<=get_region_num();i++) 
      if (get_region(i-1)->stop+1<get_region(i)->start)
	insert_region(get_region(i-1)->stop+1,get_region(i)->start-1,REGION_NOTHING,i);

    if (get_region(get_region_num())->stop<get_length())
      add_region(get_region(get_region_num())->stop+1,get_length(),REGION_NOTHING);

    // quit the input loop
    return FALSE;
  }  

  // if we aren't done then check to see whether we're dealing with the
  //  features stuff -- if we are then parse the line
  if (features) {

    // check out what the keyword from the margin is
    char keyword[leftmargin];
    strncpy(keyword, buf + leftmarginstart, leftmargin);

    // "trim" the whitespace off the end of the keyword string
    //   note: we don't actually trim -- just remember length w/out whitespace
    int length = leftmargin;
    while (length != 0 && keyword[length-1] == ' ')
      length--;

    // for blank keyword send it on to be read or throw it out
    if (length == 0) {
      if (warningflag)
        return TRUE;
      else if (cds_seqflag) {
        read_cds(buf + leftmarginstart + leftmargin);
        return TRUE;
      }
      else {
        finish_parse(buf + leftmarginstart + leftmargin);
        return TRUE;
      }
    }

    // for new keyword reset keyword index
    current_keyword_index = find_index(keyword, length);
 
    // for bad keyword
    if (current_keyword_index == -1) 
      warningflag =  TRUE; 
    // for good keyword
    else {    
      extra_exon_line = warningflag = joinflag = cdsflag = FALSE;
      finish_parse(buf + leftmarginstart + leftmargin);
    }
    return TRUE;
  } 
  // otherwise, there's nothing to do, so just return
  return TRUE;
}


// Look up a feature keyword in mainkeywords.  Only the first `length`
// characters are compared, so the keyword matches any table entry that
// begins with it.  Returns the index of the first match, or -1 if none.
int GeneSequence::find_index(char *keyword, int length)
{
  int idx = 0;

  while (idx < numberofkeywords) {
    const bool same = (strncmp(keyword, mainkeywords[idx], length) == 0);
    if (same)
      return idx;
    ++idx;
  }

  // no table entry matched
  return -1;
}


// Parse a location of one of these forms from the start of buf:
//     N..M     <N..M     N..>M     <N..>M     N     <N
// (either '<' or '>' is accepted in each marker position).
//
// On success returns TRUE with *start / *stop set (*stop is 0 for the
// single-value forms) and *c1 / *c2 holding the marker character found in
// front of the start / stop value, or 0 when there was none.
// On failure returns FALSE and all four outputs are 0.  The "N^M" form is
// recognised and deliberately rejected.
//
// Parsing is done into locals so that a failed pattern attempt can no
// longer leave a stray character (for example the first digit of the
// number) behind in *c1 or *c2.
bool GeneSequence::read_cvalues(const char *buf, long *start, long *stop, 
                                char *c1, char *c2) {

  long from = 0, to = 0;
  char lead = 0, trail = 0;     // optional '<' / '>' markers
  char dot1 = 0, dot2 = 0;      // the two separator characters
  bool matched = FALSE;         // some range pattern consumed the input
  bool ok = FALSE;

  // N..M
  if (sscanf(buf, "%ld%c%c%ld", &from, &dot1, &dot2, &to) == 4) {
    matched = TRUE;
    lead = trail = 0;
    ok = (dot1 == '.' && dot2 == '.');
  }
  // <N..M
  else if (sscanf(buf, "%c%ld%c%c%ld", &lead, &from, &dot1, &dot2, &to) == 5) {
    matched = TRUE;
    trail = 0;
    ok = (lead == '<' || lead == '>') && (dot1 == '.' && dot2 == '.');
  }
  // N..>M
  else if (sscanf(buf, "%ld%c%c%c%ld", &from, &dot1, &dot2, &trail, &to) == 5) {
    matched = TRUE;
    lead = 0;
    ok = (trail == '<' || trail == '>') && (dot1 == '.' && dot2 == '.');
  }
  // <N..>M
  else if (sscanf(buf, "%c%ld%c%c%c%ld", &lead, &from, &dot1, &dot2,
                  &trail, &to) == 6) {
    matched = TRUE;
    ok = (lead == '<' || lead == '>') && (trail == '<' || trail == '>')
         && (dot1 == '.' && dot2 == '.');
  }

  if (!matched) {

    // here's a weird case: "N^M" is not a usable range
    if (sscanf(buf, "%ld%c%ld", &from, &dot1, &to) == 3 && dot1 == '^') {
      *start = *stop = 0;
      *c1 = *c2 = 0;
      return FALSE;
    }

    // it's also possible that it's a single value
    from = to = 0;
    lead = trail = 0;
    if (sscanf(buf, "%ld", &from) == 1) {
      ok = TRUE;
    }
    else if (sscanf(buf, "%c%ld", &lead, &from) == 2) {
      ok = (lead == '<' || lead == '>');
    }
  }

  if (!ok) {
    // no valid values read in
    *start = *stop = 0;
    *c1 = *c2 = 0;
    return FALSE;
  }

  *start = from;
  *stop = to;
  *c1 = lead;
  *c2 = trail;
  return TRUE;
}
  

// Parse "N<c><c>M" (any two characters between the numbers) or a single
// "N" from the start of buf.  Returns TRUE when either form was read; for
// the single-value form *stop is 0.  Returns FALSE when no number could be
// read.
bool GeneSequence::read_values(const char *buf, long *start, long *stop)
{
  // range form first
  const int range_fields = sscanf(buf, "%ld%*c%*c%ld", start, stop);
  if (range_fields == 2)
    return TRUE;

  // the failed attempt may have written *start: blank both fields again
  *start = *stop = 0;

  // single value form
  const int single_fields = sscanf(buf, "%ld", start);
  if (single_fields != 1)
    return FALSE;

  return TRUE;
}
  

void GeneSequence::finish_parse(const char *buf) {

    long start, stop;
    char c1, c2;
    c1 = c2 = 0;
    bool flag = FALSE;

  switch(current_keyword_index) {
  
  case EXON: 

    if (!extra_exon_line) {

      // read in the values for the exon start/stop
      flag = read_cvalues(buf, &start, &stop, &c1, &c2);
      extra_exon_line = TRUE;

      // there's still another possibility if flag is false
      // check to see whether there's a complement
      if (!flag && strncmp (buf, "complement(", 11) == 0)
        flag = read_cvalues(buf+11, &start, &stop, &c1, &c2);

    }
    
    // for appropriate lines create the region
    if (flag) {
      Region *reg;
      reg = add_region(start, stop, REGION_CEXON); 
      
      // note uncertainties
      if (c1 == '<') reg->set_start_inexact();
      if (c2 == '>') reg->set_stop_inexact();
    }

    else if (strncmp (buf, "/evidence=experimental", 22) == 0 &&
	     get_region_num())
      set_last_exon_experimental();
    
    break;

  case INTRON:

    // read in the values for the exon start/stop
    flag = read_cvalues(buf, &start, &stop, &c1, &c2);

    // there's still another possibility if flag is false
    // check to see whether there's a complement
    if (!flag && strncmp (buf, "complement(", 11) == 0)
      flag = read_cvalues(buf+11, &start, &stop, &c1, &c2);
        
    // for appropriate lines create the region
    if (flag) {
      Region *reg;
//       if (get_region_num()) {  // Add implied exons.
// 	reg=get_region(get_region_num());
// 	if ((reg->type==REGION_INTRON)&&(reg->stop+1<start))
// 	  add_region(reg->stop+1,start-1,REGION_CEXON);
//       } else if (start>1) add_region(1,start-1,REGION_CEXON); 
//       // Implied exon at beginning?
      reg = add_region(start, stop, REGION_INTRON); 

      // note uncertainties
      if (c1 == '<') reg->set_start_inexact();
      if (c2 == '>') reg->set_stop_inexact();
    }

    break;

  case SOURCEfeature:

    // parse the organism source 
    if (strncmp (buf, "/organism", 9) == 0) 
      osource += buf + 10;

    // parse the chromosome source
    else if (strncmp (buf, "/chromosome", 11) == 0) 
      csource += buf + 12;

    // read in the values for the source start/stop
    else {  
      flag = read_values(buf, &start, &stop);
    
      // for appropriate lines note points
      if (flag) 
	isource = stop - start + 1;
    }
    break;

  case CDSfeature:   

    // read in the values for the CDS 
    if (!cdsflag) {

      // get stuff ready to pass to parse_join()
      long vals[2*CDSmax][2];
      char tempbuf[maxbuf], c[2*CDSmax][2];
  
      // can't pass a constant to a non-constant function
      // get rid of the "join(" at the beginning...  (or "complement")
      if (!joinflag && strncmp (buf, "join", 4) == 0)
        strcpy(tempbuf, buf+5);
      else if (!joinflag && strncmp (buf, "complement", 10) == 0)
        strcpy(tempbuf, buf+11);
      else
        strcpy(tempbuf, buf);

      int counter = parse_join(tempbuf, vals, c);
      if (!counter)
        cdsflag = TRUE;
      
      // we need to check the accuracy of the CDS join numbers.  somtimes
      // there are genes with 2 different versions of the same CDS entry.
      // we'll ignore any overlapping join sections for now
      for (int j=0; j < counter; j++) {
	for (int k=0; k < CDSnumber; k++) { 
	  
	  // illegal overlaps
	  if ((vals[j][0] <= CDS[k]->val2 && vals[j][1] >= CDS[k]->val2) ||
	      (vals[j][0] <= CDS[k]->val1 && vals[j][1] >= CDS[k]->val1) ||
	      (vals[j][0] >= CDS[k]->val1 && vals[j][1] <= CDS[k]->val2)) {
	    
	    cerr << "Overlapping CDS join areas: (at gene locus)" << locus 
		 << endl; 
	    extra_cds_flag = TRUE;
	    break;	    
	  }
	}
      }
      
      // read in the values
      for (int i=0; i < counter; i++) {
	
	if (CDSnumber == CDSmax) {
	  
	  // create new arrays
	  int newmax = 2*CDSmax;
	  ValuePair **newCDS = new (ValuePair*)[newmax];
	  
	  // copy the old stuff into the new arrays
	  for (int i = 0; i < CDSnumber; i++) 
	    newCDS[i] = CDS[i];
	  delete CDS;
	  CDS = newCDS;
	  CDSmax = newmax; 
	}
	
	CDS[CDSnumber] = new ValuePair(vals[i][0], vals[i][1]); 
	CDSnumber++;
      }
      
      joinflag = TRUE;    
    }

    else {    // if seqflag == TRUE
      
      if (extra_cds_flag)
	break;

      // product line
      if (strncmp (buf, "/product", 8) == 0) 
        CDSproduct += buf + 9;

      // translation line
      else if (strncmp (buf, "/translation", 12) == 0) {
        cds_seqflag = TRUE;
        read_cds(buf + 14);  // read the sequence in -- skip header
      }
    }
    break;

  case MRNA:

    // read in the values for the mRNA start/stop (unless we're in the
    //  middle of a join statement
    if (!joinflag)
      flag = read_cvalues(buf, &start, &stop, &c1, &c2);

    // for appropriate lines note points
    if (flag) {

      if (mRNAnumber == maxmRNA) {

	// create new arrays
	int newmax = 2*maxmRNA;
	ValuePair **newmRNA = new (ValuePair*)[newmax];
	ValuePair **newmRNAinexact = new (ValuePair*)[newmax];
	
	// copy the old stuff into the new arrays
	for (int i = 0; i < mRNAnumber; i++) {
	  newmRNA[i] = mRNA[i];
	  newmRNAinexact[i] = mRNAinexact[i]; 
	}
	delete mRNA;
	delete mRNAinexact;
	mRNA = newmRNA;
	mRNAinexact = newmRNAinexact;
	maxmRNA = newmax; 
      }
      
      mRNA[mRNAnumber] = new ValuePair(start, stop); 
      // note uncertainties
      mRNAinexact[mRNAnumber] = new ValuePair(c1, c2);
      mRNAnumber++;
    }

    else {

      // see if it's a join statement
      if (joinflag || strncmp(buf, "join(", 5) == 0) {

        // get stuff ready to pass to parse_join()
        long vals[2*maxmRNA][2];
	char tempbuf[maxbuf], c[2*maxmRNA][2];
        // can't pass a constant to a non-constant function
        if (joinflag)
  	  strcpy(tempbuf, buf);
        else
     	  strcpy(tempbuf, buf+5);

	int counter = parse_join(tempbuf, vals, c);

        // read in the values
        for (int i=0; i < counter; i++) {

	  if (mRNAnumber == maxmRNA) {

	    // create new arrays
	    int newmax = 2*maxmRNA;
	    ValuePair **newmRNA = new (ValuePair*)[newmax];
	    ValuePair **newmRNAinexact = new (ValuePair*)[newmax];
	    
	    // copy the old stuff into the new arrays
	    for (int i = 0; i < mRNAnumber; i++) {
	      newmRNA[i] = mRNA[i];
	      newmRNAinexact[i] = mRNAinexact[i]; 
	    }
	    delete mRNA;
	    delete mRNAinexact;
	    mRNA = newmRNA;
	    mRNAinexact = newmRNAinexact;
	    maxmRNA = newmax; 	    
	  }
	  
          mRNA[mRNAnumber] = new ValuePair(vals[i][0], vals[i][1]); 
          // note uncertainties
          mRNAinexact[mRNAnumber] = new ValuePair(c[i][0], c[i][1]);
          mRNAnumber++;
        }
	joinflag = TRUE;
      }
    }
    break;
    
  case VARIATION:
    
    break;
    
  case TATA_SIGNAL:

    // read in the values for the TATA_signal start/stop
    flag = read_values(buf, &start, &stop);
    
    // for appropriate lines note points
    if (flag) {
      TATA_signal[0] = start; 
      if (stop)
        TATA_signal[1] = stop;
    }
    break;

  case POLYA_SIGNAL:

    // read in the values for the polyA_signal start/stop
    flag = read_values(buf, &start, &stop);
    
    // for appropriate lines note points
    if (flag) {
      polyA_signal[0] = start; 
      if (stop)
        polyA_signal[1] = stop;
    }
    break;

  case POLYA_SITE:

    // read in the values for the polyA_site start/stop
    flag = read_values(buf, &start, &stop);
    
    // for appropriate lines note points
    if (flag) {
      polyA_site[0] = start; 
      if (stop)
        polyA_site[1] = stop;
    }
    break;

  case PRIM_TRANSCRIPT:

    // read in the values for the prim_transcript start/stop
    flag = read_cvalues(buf, &start, &stop, &c1, &c2);
    
    // for appropriate lines note points
    if (flag) {
      prim_transcript[0] = start; 
      if (stop)
        prim_transcript[1] = stop;
      
      // note uncertainties
      if (c1) 
	ptranscript_inexact[0] = c1;
      if (c2)
	ptranscript_inexact[1] = c2;
    }
    break;

  case ALLELE:

    // read in the values for the allele start/stop
    flag = read_values(buf, &start, &stop);
    
    // for appropriate lines note points
    if (flag) {

      if (allelenumber == maxallele) {

	  // create new arrays
	  int newmax = 2*maxallele;
	  ValuePair **newallele = new (ValuePair*)[newmax];
	  
	  // copy the old stuff into the new arrays
	  for (int i = 0; i < allelenumber; i++) 
	    newallele[i] = allele[i];
	  delete allele;
	  allele = newallele;
	  maxallele = newmax; 
      }

      if (stop)
	allele[allelenumber] = new ValuePair(start, stop); 
      else
	allele[allelenumber] = new ValuePair(start); 
      allelenumber++;
    }
    break;
    
  case REPEAT_REGION:
    Annotated_repeats = TRUE;
    break;

  case PROMOTER:
    // read in the values for the promoter start/stop
    flag = read_values(buf, &start, &stop);
    
    // for appropriate lines note points
    if (flag)
      {
	promoter[0] = start;
	promoter[1] = stop;
      }
    break;
    
  default:    

     break;
  }
}


// Return a buffer of `capacity` chars that starts with the first `used`
// chars of `old`; `old` is released with delete[].
static char *gseq_regrow_chars(char *old, int used, int capacity) {
  char *bigger = new char[capacity];
  if (old && used > 0)
    memcpy(bigger, old, used);
  delete[] old;
  return bigger;
}

// Append the characters of buf to CDSsequence.  A double quote ends the
// translation: cds_seqflag is cleared and the rest of the line is ignored.
// Carriage returns are dropped.  The buffer grows in steps of 300 chars.
//
// The buffer is released with delete[] (it is allocated with new[]), and a
// buffer that is missing or already full is grown before the write instead
// of being written through.
void GeneSequence::read_cds(const char *buf) {

  for (int i = 0; buf[i] != '\0'; i++) {

    const char c = buf[i];

    // '"' denotes end of CDS sequence
    if (c == '"') {
      cds_seqflag = FALSE;
      return;
    }

    if (c == '\r')
      continue;

    // guarantee room for this character
    if (CDSsequence == NULL || cds_seqlength >= cds_seqmemory) {
      cds_seqmemory = cds_seqlength + 300;
      CDSsequence = gseq_regrow_chars(CDSsequence, cds_seqlength,
                                      cds_seqmemory);
    }

    CDSsequence[cds_seqlength] = c;
    cds_seqlength++;

    // as before, grow as soon as the buffer becomes full so that one free
    // slot always remains after a write
    if (cds_seqlength == cds_seqmemory) {
      cds_seqmemory += 300;
      CDSsequence = gseq_regrow_chars(CDSsequence, cds_seqlength,
                                      cds_seqmemory);
    }
  }
}


// Parse the coordinate list of a join "statement".  buf is split in place
// at ',' and ')' (strtok modifies it), and every piece that read_cvalues()
// accepts is stored:
//     vals[n][0] / vals[n][1]  start / stop of the n-th range
//     c[n][0]    / c[n][1]     inexactness markers of the n-th range
// Returns the number of ranges stored.  The caller must supply arrays large
// enough for every range on the line; no size is passed in.
//
// The separator set is now a const string: binding a string literal to a
// plain char* is not valid in current C++.
int GeneSequence::parse_join(char *buf, long vals[][2], char c[][2]) {

  const char *separators = ",)";
  long start, stop;
  char c1, c2;
  int counter = 0;

  for (char *token = strtok(buf, separators);
       token != NULL;
       token = strtok(NULL, separators)) {

    // start every piece from a clean slate
    start = stop = 0;
    c1 = c2 = 0;

    if (!read_cvalues(token, &start, &stop, &c1, &c2))
      continue;

    // record start/stop values
    vals[counter][0] = start;
    vals[counter][1] = stop;
    
    // record uncertainties
    c[counter][0] = c1;
    c[counter][1] = c2;

    counter++;
  }

  // let them know how many values we read in
  return counter;
}

void GeneSequence::add_reading_frames() {
  // we take advantage of the initilization of all of our 
  // fregions to FRAME_ANY in this routine
  int i,j;

  int CDS_idx = 0;
  ReadingFrame current = FRAME_0;
  int delta;
  bool never_seen = TRUE;

  // assume that all exons are experimental unless we see otherwise
  All_exons_experimental = TRUE;

  // if there are no regions given, make it all one big exon
  if (get_region_num() == 0 && !CDSnumber) 
    add_region(1, get_length(), REGION_CEXON);
  int EXONSsofar[1000][2], EXONcount = 0;
  int INTRONSsofar[1000][2], INTRONcount = 0;
  for (i=0; i<1000; ++i) EXONSsofar[i][0] = EXONSsofar[i][1] = INTRONSsofar[i][0] = INTRONSsofar[i][1] = 0;

  for (i=1; i<= get_region_num(); i++) {
    Region *r = get_region(i);
    if (r->type == REGION_CEXON) {
      //      cout << "**Exon: " << r->start << ", " << r->stop << endl;
      EXONSsofar[EXONcount][0] = r->start;
      EXONSsofar[EXONcount][1] = r->stop;
      EXONcount++;
    }
    if (r->type == REGION_INTRON) {
      INTRONSsofar[INTRONcount][0] = r->start;
      INTRONSsofar[INTRONcount][1] = r->stop;
      INTRONcount++;
    }
  }
  int *endptrs = new int[seqlen+1], *endptrEnds = new int[seqlen+1];
  for (i=0; i<=seqlen; ++i) endptrs[i]=endptrEnds[i]=0;

  for (i=0; i<EXONcount; ++i) {
    //   cout << "Exon: " << EXONSsofar[i][0] << " , " << EXONSsofar[i][1] << endl;
    if (EXONSsofar[i][0] > seqlen || EXONSsofar[i][1] > seqlen || EXONSsofar[i][1] <= EXONSsofar[i][0]) {
      //      cout << "WARNING: bad exon" << endl;
      continue;
    }
    endptrs[EXONSsofar[i][0]]    = REGION_CEXON;
    endptrEnds[EXONSsofar[i][0]] = EXONSsofar[i][1];
  }
  for (i=0; i<INTRONcount; ++i) {
    //    cout << "Intron: " << INTRONSsofar[i][0] << " , " << INTRONSsofar[i][1] << endl;
    if (INTRONSsofar[i][0] > seqlen || INTRONSsofar[i][1] > seqlen || INTRONSsofar[i][1] <= INTRONSsofar[i][0]) {
      //      cout << "WARNING: bad intron" << endl;
      continue;
    }
    endptrs[INTRONSsofar[i][0]]    = REGION_INTRON;
    endptrEnds[INTRONSsofar[i][0]] = INTRONSsofar[i][1];
  }
  int lastCDSend = 0;
  
  for (i=0; i<CDSnumber; ++i) {
    if (i && CDS[i]->val1 <= lastCDSend) {
      for (j=i; j<CDSnumber-1; ++j) {
	CDS[j]->val1 = CDS[j+1]->val1;
	CDS[j]->val2 = CDS[j+1]->val2;
      }
      CDSnumber--;
      i--;
    }
    else lastCDSend = CDS[i]->val2;
  }

  for (i=0; i<CDSnumber; ++i) {
    assert(CDS[i]->val1 <= seqlen && CDS[i]->val2 <= seqlen);
    //        cout << "CDS: " << CDS[i]->val1 << " , " << CDS[i]->val2 << endl;
    if (endptrs[CDS[i]->val1]) continue; 
    endptrs[CDS[i]->val1]    = REGION_CEXON;
    endptrEnds[CDS[i]->val1] = CDS[i]->val2;
  }
  int newmax = 2*region_max, newRegCount=0;
  Region **newregs = new (Region*)[newmax];

  for (i=0; i<seqlen; ++i) {
    
    switch(endptrs[i]) {

    case 0: continue;
    case REGION_CEXON:
      //      cout << "CEXON at : " << i << " , " << endptrEnds[i] << endl;
      newregs[newRegCount++] = new Region(i, endptrEnds[i], REGION_CEXON);
      for (j=i+1; j<seqlen; ++j) if (endptrs[j]) break;

      //      cout << " Next region begins at " << j << endl;
      if (j>=seqlen )         { i = j;   break; }
      if (j <= endptrEnds[i]) {
	i = j+1; 
	break;
      }

      if (endptrs[j] == REGION_CEXON || endptrs[j] == REGION_NCEXON)
	newregs[newRegCount++] = new Region(endptrEnds[i]+1, j-1, REGION_INTRON);

      else if (endptrs[j] == REGION_INTRON)
	newregs[newRegCount++] = new Region(endptrEnds[i]+1, j-1, REGION_NCEXON);
      
      break;
    case REGION_INTRON:
      //      cout << "INTRON at : " << i << " , " << endptrEnds[i] << endl;
      newregs[newRegCount++] = new Region(i, endptrEnds[i], REGION_INTRON);

      for (j=i+1; j<seqlen; ++j) if (endptrs[j]) break;

      if (j>=seqlen) { i = j; break; }
      if (j <= endptrEnds[i]) break;

      if      (endptrs[j] == REGION_INTRON) newregs[newRegCount++] = new Region(endptrEnds[i]+1, j-1, REGION_CEXON);
      else if (endptrs[j] == REGION_CEXON)  newregs[newRegCount++] = new Region(endptrEnds[i]+1, j-1, REGION_NCEXON);
      
      break;
    default:
      assert(0);
    }
  }
  for (i=1; i<newRegCount; ++i) {
    Region *newr = newregs[i];
    Region *prevr = newregs[i-1];

    if (newr->start > prevr->stop+1) {
      if ( (newr->type == REGION_CEXON || newr->type == REGION_NCEXON) &&
	   (prevr->type == REGION_CEXON || prevr->type == REGION_NCEXON) ) {
	Region *tempr = new Region(prevr->stop+1, newr->start-1, REGION_INTRON);
	for (j = newRegCount; j > i; --j) newregs[j] = newregs[j-1];
	newregs[i] = tempr;
	newRegCount++;
      }
    }
  }

  delete[] endptrs;
  delete[] endptrEnds;
  delete[] regions;
  regions = newregs;
  region_max = newmax;
  region_num = newRegCount;

  for (i=0; i<region_num; ++i)
    if (regions[i]->start > regions[i]->stop) {
      delete regions[i];
      for (j=i; j<region_num; ++j) regions[j] = regions[j+1];
      region_num--;
      i--;
    }

  // we need to compute differently depending on whether or not there is
  //  any CDS information for this gene...
  if (CDSnumber) {

    for (i = 1; i <= get_region_num(); i++) {

      Region *r = get_region(i);
      //      cout << "Examining region: " << r->start << " , " << r->stop << endl;
      //      cout << "CDSidx = " << CDS_idx << endl;
      //      cout << "CDS[CDSidx] : " << CDS[CDS_idx]->val1 << " , " << CDS[CDS_idx]->val2 << endl;
      // check to see whether the exons are experimental or not
      if (r->type == REGION_CEXON) { 
	if (r->experimental == FALSE)
	  All_exons_experimental = FALSE;
        never_seen = FALSE;
      }

      // check for an extra (non)coding regions
      if (CDS_idx >= CDSnumber) {
	if (r->type == REGION_CEXON)
	  r->type = REGION_NCEXON;
	r->frame = FRAME_ANY;
	continue;
      }

      // we don't need to assign frames to introns
      if (r->type == REGION_INTRON) {
        r->frame = FRAME_ANY;
        continue;
      }

      // check for noncoding regions
      if (CDS[CDS_idx]->val1 > r->start) {
	//	cout << "CDS[CDSidx]->start > " << r->start << endl;
        if (CDS[CDS_idx]->val1 > r->stop) { // CDS starts beyond the region.
          r->type = REGION_NCEXON;
          r->frame = FRAME_ANY;
          continue;
        } 
	//	cout << "  splitting region " << i-1 << " , " << CDS[CDS_idx]->val1 << endl;
        split_region(i-1, REGION_NCEXON, REGION_CEXON, CDS[CDS_idx]->val1);

        r = get_region(i);               // CDS starts after region starts
        r->frame = FRAME_ANY;
        r = get_region(++i); 
        r->frame = current;

	if (CDS[CDS_idx]->val2 < r->stop) {
	  //	  cout << " Now " << CDS[CDS_idx]->val2 << " < " << r->stop << endl;
	  //	  cout << "  splitting region " << i-1 << " , " << CDS[CDS_idx]->val2 << endl;
	  split_region(i-1, REGION_CEXON, REGION_NCEXON, CDS[CDS_idx]->val2+1);
	  r = get_region(i);
	  r->frame = current;
	  r = get_region(++i);
	  r->frame = FRAME_ANY;
	  CDS_idx++;
	}
	delta = CDS[CDS_idx]->val2 - CDS[CDS_idx]->val1 + 1;
	current = ReadingFrame((int(current) + delta) % 3);
        CDS_idx++;
      }
      else if (CDS[CDS_idx]->val2 < r->stop) { // CDS starts on or before region and 
	                                    // stops before end, but stops after region starts
  
        split_region(i-1, REGION_CEXON, REGION_NCEXON, 
                     CDS[CDS_idx]->val2+1);

        r = get_region(i); 
        r->frame = current;
        delta = CDS[CDS_idx]->val2 - CDS[CDS_idx]->val1 + 1;
        current = ReadingFrame((int(current) + delta) % 3);
        CDS_idx++;
        r = get_region(++i);
        r->frame = FRAME_ANY;
      }
      else if (CDS[CDS_idx]->val2 == r->stop && CDS[CDS_idx]->val1 == r->start) { // CDS==region
        r->frame = current;
        // the 1st frame we assign is FRAME_0
        delta = CDS[CDS_idx]->val2 - CDS[CDS_idx]->val1 + 1;
        current = ReadingFrame((int(current) + delta) % 3);
        CDS_idx++;
      }
    } // end of loop over fregions of this sequence
  }  // end if (CDSnumber)

  else {

    for (i = 1; i <= get_region_num(); i++) {

      Region *r = get_region(i);
      
      if (r->type == REGION_INTRON)
        r->frame = FRAME_ANY;

      else {

	// check to see whether the exons are experimental or not
	if (r->experimental == FALSE) 
	  All_exons_experimental = FALSE;
        never_seen = FALSE;

        r->frame = current;
        // the 1st frame we assign is FRAME_0
	delta = r->stop - r->start + 1;
        current = ReadingFrame((int(current) + delta) % 3);
      }
    }
  }

  // in the case of no exons
  if (never_seen)
    All_exons_experimental = FALSE;
}

// Write the header portion of this gene sequence to the given stream:
//   1. the base-class (Sequence) header,
//   2. the feature summary produced by unparse_features(),
//   3. one line per region (kind, coordinates, and frame unless it is
//      FRAME_ANY),
//   4. the marker count and the two TRUE/FALSE summary flags.
void GeneSequence::unparse_header(ostream& cout) {

  Sequence::unparse_header(cout);
  unparse_features(cout);

  // regions are numbered from 1 up to and including get_region_num()
  int idx = 1;
  while (idx <= get_region_num()) {
    Region *region = get_region(idx);

    // choose the label for this kind of region; anything that is not an
    //  intron, coding exon or noncoding exon is reported as unknown
    const char *label = "REGION UNKNOWN  ";
    if (region->type == REGION_INTRON)
      label = "INTRON  ";
    else if (region->type == REGION_CEXON)
      label = "CODING EXON  ";
    else if (region->type == REGION_NCEXON)
      label = "NONCODING EXON  ";
    cout << label;

    // coordinates, with "<" / ">" in front of an inexact start / stop
    const char *start_mark = region->is_start_inexact() ? "<" : "";
    const char *stop_mark = region->is_stop_inexact() ? ">" : "";
    cout << start_mark << (region->start) << ".."
         << stop_mark << (region->stop);

    // the frame is only shown when it is something other than FRAME_ANY
    if (!(region->frame == FRAME_ANY))
      cout << "     \tFRAME  " << (int(region->frame));
    cout << "\n";

    idx++;
  }

  // summary lines: marker count, then the two flags
  const char *repeats_text = Annotated_repeats ? "TRUE" : "FALSE";
  const char *experimental_text = All_exons_experimental ? "TRUE" : "FALSE";

  cout << "INTRON MARKERS: " << get_marker_num() << "\n\n";
  cout << "Annotated Repeats? \t" << repeats_text << "\n";
  cout << "Experimental Exons?\t" << experimental_text << "\n\n";
}

// Write the annotated features of this gene sequence to the given stream,
// one labelled line per feature.  Single-valued features (source, TATA
// signal, polyA site/signal, primary transcript) are only written when
// their first value is non-zero; a second coordinate is only written
// (after "..") when it is non-zero as well.  The mRNA, allele and CDS
// tables are written in full, followed by the CDS product and the CDS
// sequence wrapped at 65 characters per line.
void GeneSequence::unparse_features(ostream& cout) {

  int n;   // shared index for every table walked below

  cout << "\n";

  if (isource) {
    cout << "SOURCE:\t\tlength: " << isource;
    cout << "\n\t\torganism: " << osource;
    cout << "\n\t\tchromosome: " << csource;
    cout << "\n";
  }

  if (TATA_signal[0]) {
    cout << "TATA_SIGNAL:     ";
    cout << TATA_signal[0];
    if (TATA_signal[1]) {
      cout << "..";
      cout << TATA_signal[1];
    }
    cout << "\n";
  }

  if (polyA_site[0]) {
    cout << "POLYA_SITE:      ";
    cout << polyA_site[0];
    if (polyA_site[1]) {
      cout << "..";
      cout << polyA_site[1];
    }
    cout << "\n";
  }

  if (polyA_signal[0]) {
    cout << "POLYA_SIGNAL:    ";
    cout << polyA_signal[0];
    if (polyA_signal[1]) {
      cout << "..";
      cout << polyA_signal[1];
    }
    cout << "\n";
  }

  // each primary-transcript coordinate may be preceded by its inexactness
  //  marker, which is written only when it is non-zero
  if (prim_transcript[0]) {
    cout << "PRIM_TRANSCRIPT: ";
    if (ptranscript_inexact[0])
      cout << ptranscript_inexact[0];
    cout << prim_transcript[0];

    if (prim_transcript[1]) {
      cout << "..";
      if (ptranscript_inexact[1])
        cout << ptranscript_inexact[1];
      cout << prim_transcript[1];
    }
    cout << "\n";
  }

  // mRNA entries, with the same optional inexactness markers
  n = 0;
  while (n < mRNAnumber) {
    cout << "mRNA:            ";
    if (mRNAinexact[n]->cval1)
      cout << mRNAinexact[n]->cval1;
    cout << mRNA[n]->val1;

    if (mRNA[n]->val2) {
      cout << "..";
      if (mRNAinexact[n]->cval2)
        cout << mRNAinexact[n]->cval2;
      cout << mRNA[n]->val2;
    }
    cout << "\n";
    n++;
  }

  // allele entries
  n = 0;
  while (n < allelenumber) {
    cout << "ALLELE:          ";
    cout << allele[n]->val1;
    if (allele[n]->val2) {
      cout << "..";
      cout << allele[n]->val2;
    }
    cout << "\n";
    n++;
  }

  // CDS entries always show both coordinates
  for (n = 0; n < CDSnumber; n++)
    cout << "CDS:          " << CDS[n]->val1 << ".." << CDS[n]->val2 << "\n";

  cout << "CDS PRODUCT:    " << CDSproduct << "\n";
  cout << "CDS SEQUENCE:    (length = " << cds_seqlength << ")";

  // dump the CDS sequence, starting a new tab-indented line before the
  //  first character and again after every 65 characters
  int column = 0;
  for (n = 0; n < cds_seqlength; n++) {
    if (column == 0)
      cout << "\n\t";
    cout << CDSsequence[n];
    if (++column == 65)
      column = 0;
  }

  cout << "\n";
}


// Write one line of sequence starting at position `from` (delegated to
// Sequence::unparse_line), followed by an extra line flagging any markers
// that fall within the next 60 positions.  A marker of type MARK_ISTART
// is drawn as '\', any other marker as '/'.  The extra line is written
// only if at least one marker was found.
void GeneSequence::unparse_line(ostream& cout, int from) {

  // start with 76 blanks, NUL-terminated
  char marker_line[100];
  memset(marker_line, ' ', 76);
  marker_line[76] = 0;

  // walk the markers that follow `from`, stopping at the first one that
  //  is missing or lies 60 or more positions past `from`
  int last_pos = from;
  for (;;) {
    Marker *m = get_marker_after(last_pos);
    if (!m || m->pos - from >= 60)
      break;

    last_pos = m->pos;

    // column = 10 leading columns + offset + one more column for every
    //  10 positions -- presumably this mirrors the layout written by
    //  Sequence::unparse_line (TODO confirm)
    int offset = m->pos - from - 1;
    char glyph = (m->type == MARK_ISTART) ? '\\' : '/';
    marker_line[10 + offset + offset / 10] = glyph;
  }

  // the ordinary sequence line always comes first
  Sequence::unparse_line(cout, from);

  // last_pos only moves off `from` when some marker was recorded
  if (last_pos != from)
    cout << marker_line << "\n";
}

// End of code















