// This is a C++ code file

/*** Created by Eric Banks -- March '98 ***/

#include <ctype.h>
#include <string.h>
#include <String.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream.h>
#include <fstream.h>
#include "fseq.h"
//#include "common.h"

// FilterSequence implementation

// Construct a FilterSequence and immediately parse one entry from the
// given stream.
//
// The members are initialised here directly: writing
// `FilterSequence::FilterSequence();` as a statement does not run the
// default constructor on *this*, so the flags and the codon table would
// otherwise still be unset when parse() runs its checks.
FilterSequence::FilterSequence(istream& cin) {
  CDS_accurate = stop_in_exon = region_accurate = FALSE;
  gc_percent = 0.0;
  numberof_loci = 0;
  initialize_PT();

  FilterSequence::parse(cin);
}

// Default constructor: clears the three accuracy flags, the overall GC
// percentage and the locus count, then fills in the codon table.
FilterSequence::FilterSequence() {
  region_accurate = FALSE;
  stop_in_exon = FALSE;
  CDS_accurate = FALSE;

  numberof_loci = 0;
  gc_percent = 0.0;

  initialize_PT();
}


// Reset the object: first the base-class state, then the filter-specific
// flags, GC percentage and locus count.
void FilterSequence::initialize(void) {

  GeneSequence::initialize();

  region_accurate = FALSE;
  stop_in_exon = FALSE;
  CDS_accurate = FALSE;

  numberof_loci = 0;
  gc_percent = 0.0;
}


// Fill ProteinTable[first][second][third] with the one-letter amino-acid
// code for every codon; stop codons are stored as 'x'.  The 64 entries
// are grouped by how much the third base matters.
void FilterSequence::initialize_PT() {

  // short names for the four bases
  const int A = BASE_A, C = BASE_C, G = BASE_G, T = BASE_T;

  const int any_base[4]   = { T, C, A, G };
  const int pyrimidine[2] = { T, C };
  const int purine[2]     = { A, G };
  int n;

  // codon families where the third base is irrelevant
  for (n = 0; n < 4; n++) {
    const int b = any_base[n];
    ProteinTable[G][C][b] = 'A';   // Alanine
    ProteinTable[C][G][b] = 'R';   // Arginine
    ProteinTable[G][G][b] = 'G';   // Glycine
    ProteinTable[C][T][b] = 'L';   // Leucine
    ProteinTable[C][C][b] = 'P';   // Proline
    ProteinTable[T][C][b] = 'S';   // Serine
    ProteinTable[A][C][b] = 'T';   // Threonine
    ProteinTable[G][T][b] = 'V';   // Valine
  }

  // codons whose third base is T or C
  for (n = 0; n < 2; n++) {
    const int y = pyrimidine[n];
    ProteinTable[A][A][y] = 'N';   // Asparagine
    ProteinTable[G][A][y] = 'D';   // Aspartic Acid
    ProteinTable[T][G][y] = 'C';   // Cysteine
    ProteinTable[C][A][y] = 'H';   // Histidine
    ProteinTable[A][T][y] = 'I';   // Isoleucine
    ProteinTable[T][T][y] = 'F';   // Phenylalanine
    ProteinTable[A][G][y] = 'S';   // Serine
    ProteinTable[T][A][y] = 'Y';   // Tyrosine
  }

  // codons whose third base is A or G
  for (n = 0; n < 2; n++) {
    const int r = purine[n];
    ProteinTable[A][G][r] = 'R';   // Arginine
    ProteinTable[G][A][r] = 'E';   // Glutamic Acid
    ProteinTable[C][A][r] = 'Q';   // Glutamine
    ProteinTable[T][T][r] = 'L';   // Leucine
    ProteinTable[A][A][r] = 'K';   // Lysine
    ProteinTable[T][A][r] = 'x';   // Stop Codon
  }

  // the remaining single codons
  ProteinTable[A][T][A] = 'I';     // Isoleucine
  ProteinTable[A][T][G] = 'M';     // Methionine
  ProteinTable[T][G][G] = 'W';     // Tryptophan
  ProteinTable[T][G][A] = 'x';     // Stop Codon

}


// Destructor: performs no work of its own.
FilterSequence::~FilterSequence() {
}


// Read one entry through the base-class parser and then compute every
// derived property the filter rules look at.
void FilterSequence::parse(istream& cin) {

  GeneSequence::parse(cin);

  if (!CDSnumber) {
    // no CDS field: just look for stop codons inside the coding exons
    FilterSequence::check_stops();
  }
  else {
    // a CDS field exists: verify the translation against the sequence
    FilterSequence::check_cds();
  }

  // region layout
  FilterSequence::check_regions();

  // GC content: whole sequence, then the exon and intron variants
  FilterSequence::check_gc();
  FilterSequence::check_gc_exon();
  FilterSequence::check_gc_intron();

  // shortest coding exon / intron
  FilterSequence::check_minimum_exon_length();
  FilterSequence::check_minimum_intron_length();
}


// Verify that the annotated protein (CDSsequence) is what the coding
// intervals in CDS[] translate to.  Sets CDS_accurate; on a successful
// match stop_in_exon is cleared as well.  On a codon mismatch the stop
// check is run instead.
void FilterSequence::check_cds() {

  int interval = 0;                             // current CDS interval
  long position = (long) CDS [interval]->val1;  // next base to read
  int last_base = CDS [interval]->val2;         // end of current interval

  int codon[3];

  for (int residue = 0; residue < cds_seqlength; residue++) {

    const char expected = CDSsequence[residue];

    // collect the three bases of this codon, hopping to the next
    // interval whenever the current one is exhausted
    for (int slot = 0; slot < 3; slot++) {

      if (position > last_base) {

        interval++;

        // ran out of intervals before the protein was finished
        if (interval == CDSnumber) {
          CDS_accurate = FALSE;
          return;
        }

        position = (long) CDS [interval]->val1;
        last_base = CDS [interval]->val2;
      }

      codon[slot] = get_nucleotide (position);
      position++;
    }

    // a codon that does not translate to the expected residue ends the
    // check; fall back to scanning for stop codons
    const bool codon_ok = PT_lookup(expected, codon);
    if (!codon_ok) {
      CDS_accurate = FALSE;
      check_stops();
      return;
    }
  }

  // every residue matched, so there can be no stop inside the exons
  CDS_accurate = TRUE;
  stop_in_exon = FALSE;
}

void FilterSequence::check_regions(void) {
  // First, check that the regions are consecutive and of the right type.

  int num=get_region_num();
  if (num<3) {
    region_accurate=FALSE;
    return;
  }

  Region* reg=get_region(1);
  if ((reg->type!=REGION_NOTHING)||(reg->start!=1)) {
    region_accurate=FALSE;
    return;
  }

  reg=get_region(2);
  if ((reg->type!=REGION_NCEXON)&&(reg->type!=REGION_CEXON)) {
    region_accurate=FALSE;
    return;
  }

  int i;
  for(i=2;i<=num;i++)
    if (((get_region(i-1)->stop+1)!=(get_region(i)->start))||
	((get_region(i-1)->type)==(get_region(i)->type))||
	((i!=num)&&(get_region(i)->type==REGION_NOTHING))) {
      region_accurate=FALSE;
      return;
    }

  reg=get_region(num);
  if ((reg->type!=REGION_NOTHING)||(reg->stop!=get_length())) {
    region_accurate=FALSE;
    return;
  }

  i=2;
  while((get_region(i)->type==REGION_NCEXON)||
	(get_region(i)->type==REGION_INTRON)) i++;
  if (i==num) {
    region_accurate=FALSE;
    return;
  }

  while((get_region(i)->type==REGION_CEXON)||
	(get_region(i)->type==REGION_INTRON)) i++;

  while((get_region(i)->type==REGION_NCEXON)||
	(get_region(i)->type==REGION_INTRON)) i++;

  if (i!=num) {
    region_accurate=FALSE;
    return;
  }

  region_accurate=TRUE;
}

// Translate a DNA string codon by codon into a newly allocated,
// NUL-terminated protein string.  Trailing bases that do not form a
// complete codon are ignored.
//
// The result is always allocated with new[] (also for inputs shorter than
// one codon, which yield an empty string), so the caller can release it
// with delete[] unconditionally.
char* FilterSequence::DNAToProtein(const char* DNA) {
  const int len = (int) strlen(DNA);
  const int codons = len / 3;

  char* protein = new char[codons + 1];
  for (int c = 0; c < codons; c++)
    protein[c] = PT_lookup(DNA + 3 * c);

  // terminate after the last translated codon; using (len-2)/3 here
  // would overwrite the final residue whenever len%3 is 0 or 1
  protein[codons] = '\0';
  return protein;
}

// Translate the three characters at vals[0..2] into an amino-acid letter
// using ProteinTable.  Returns ' ' if any of them maps to BASE_UNKNOWN.
char FilterSequence::PT_lookup(const char* vals) {
  int base[3];
  int n;

  for (n = 0; n < 3; n++)
    base[n] = char2base(vals[n]);

  for (n = 0; n < 3; n++) {
    if (base[n] == BASE_UNKNOWN)
      return ' ';
  }

  return ProteinTable[base[0]][base[1]][base[2]];
}

// Check whether the codon given as three base indices translates to
// cds_val.  Any index above 3 is rejected before the table is consulted.
bool FilterSequence::PT_lookup(char cds_val, int vals[3]) {

  // an index above 3 is not one of the four table bases
  for (int n = 0; n < 3; n++) {
    if (vals[n] > 3)
      return FALSE;
  }

  if (ProteinTable[vals[0]][vals[1]][vals[2]] != cds_val)
    return FALSE;

  return TRUE;
}


void FilterSequence::check_stops(void) { 

  int counter = 0, total = 0, triplet = 0;
  long index;
  char buf[3];

  // first, determine how much we'll need to read in.
  // we do this so that we can ignore the final stop codon which is allowed
  for (int i = 1; i <= get_region_num(); i++) {

    Region *r = get_region(i);
    if (r->type == REGION_CEXON)
      total += r->stop - r->start + 1;
  }

  // now, do the actual checking
  for (int i = 1; i <= get_region_num(); i++) {

    Region *r = get_region(i);

    // only work with coding exons
    if (r->type != REGION_CEXON)
      continue;

    for (index = r->start; index <= r->stop; index++) {

      buf[triplet++] = base2char (this->get_nucleotide(index));
      counter++;

      // when we have a full triplet, check it for a stop
      if (triplet == 3) {
	if (buf[0] == 't' && ((buf[1] == 'a' && (buf[2] == 'a' || 
						 buf[2] == 'g'))
			      || (buf[1] == 'g' && buf[2] == 'a'))) {

	  // we allow the last triplet to be a stop
	  if (counter == total) {
	    stop_in_exon = FALSE;
	    return;
	  }
	}
	triplet = 0;
      }
    }
  }

  // if we got this far, then there are no stops
  stop_in_exon = FALSE;
}


// Compute the percentage of C and G bases over the whole sequence
// (positions 1..get_length()) and store it in gc_percent.  An empty
// sequence yields 0.0 instead of the NaN a 0/0 division would produce.
void FilterSequence::check_gc(void) {

  const long length = get_length();
  long gc_count = 0;

  for (long i = 1; i <= length; i++) {
    Nucleotide n = this->get_nucleotide(i);
    if (n == BASE_C || n == BASE_G)
      gc_count++;
  }

  if (length <= 0) {
    gc_percent = 0.0;
    return;
  }

  gc_percent = 100.0 * ((double)gc_count / (double)length);
}


void FilterSequence::check_gc_exon(void) {

  long gc_count = 0;
  int count=0;
  Nucleotide n;
  Region *r;
  for (int i=1; i<=get_region_num(); i++) {
    r=get_region(i);
    if (r->type == REGION_INTRON)
      continue;
    for (int j=r->start+1;(j<=r->stop-1)&&(j<=get_length());j++) {
      n=this->get_nucleotide(j);
      count++;
      if ((n==BASE_C) || (n== BASE_G))
        gc_count++;
    }
  }
  gc_exon_percent = 100.0 * ((double)gc_count / (double)count);
}


void FilterSequence::check_gc_intron(void) {

  long gc_count = 0;
  int count=0;
  Nucleotide n;
  Region *r;
  for (int i=1; i<=get_region_num(); i++) {
    r=get_region(i);
    if (r->type != REGION_INTRON)
      continue;
    for (int j = r->start+1; j <= r->stop-1; j++){
      n=this->get_nucleotide(j);
      count++;
      if ((n==BASE_C) || (n== BASE_G))
        gc_count++;
    }
  }
  gc_intron_percent = 100.0 * ((double)gc_count / (double)count);
}



void FilterSequence::check_minimum_exon_length(void)
  {
  Region * r;
  int min=99999;
  if (get_region_num()<=2)
    {
    minimum_exon_length=0;
    return;
  }
  for (int i=2; i<get_region_num(); i++){
    r=get_region(i);
    if (r->type != REGION_CEXON)
      continue;
    if (r->stop - r->start < min)
      min=r->stop - r->start;
  }
  minimum_exon_length=min;
}

void FilterSequence::check_minimum_intron_length(void)
  {
  Region * r;
  int min=99999;
  if (get_region_num()<=2)
    {
      minimum_intron_length=0;
      return;
    }
  for (int i=2; i<get_region_num(); i++){
    r=get_region(i);
    if (r->type != REGION_INTRON)
      continue;
    if (r->stop - r->start < min)
      min=r->stop - r->start;
  }
  minimum_intron_length=min;
}

// Accessor: returns the stored minimum_exon_length (assigned by
// check_minimum_exon_length()).
double FilterSequence::Minimum_Exon_Length(void){
  return minimum_exon_length;
}

// Accessor: returns the stored minimum_intron_length (assigned by
// check_minimum_intron_length()).
double FilterSequence::Minimum_Intron_Length(void){
  return minimum_intron_length;

}

// Accessor: returns the CDS_accurate flag (assigned by check_cds()).
bool FilterSequence::CDS_Accurate(void) {
  return CDS_accurate;
}

// Accessor: returns the region_accurate flag (assigned by
// check_regions()).
bool FilterSequence::Region_Accurate(void) {
  return region_accurate;
}

// Accessor: returns the stop_in_exon flag (assigned by check_stops() and
// check_cds()).
bool FilterSequence::stops_In_Exon(void) {
  return stop_in_exon;
}


// Accessor: returns gc_percent, the whole-sequence GC percentage
// computed by check_gc().
double FilterSequence::GC_percentage(void) {
  return gc_percent;
}


// Accessor: returns gc_exon_percent as computed by check_gc_exon().
double FilterSequence::GC_exon_percentage(void) {
  return gc_exon_percent;
}

// Accessor: returns gc_intron_percent as computed by check_gc_intron().
double FilterSequence::GC_intron_percentage(void) {
  return gc_intron_percent;
}

// Write the base-class header followed by two lines reporting whether
// the CDS and the region layout were found to be accurate.
void FilterSequence::unparse_header(ostream& cout) {

  GeneSequence::unparse_header(cout);

  // validity of the CDS
  cout << "Is CDS accurate?    "
       << (CDS_Accurate() ? "TRUE\n" : "FALSE\n");

  // validity of the region layout
  cout << "Are regions accurate?    "
       << (Region_Accurate() ? "TRUE\n" : "FALSE\n");
}


// Run the filter rules in `ifile` over every entry read from `cin` and
// write the loci that pass to `ofile`.  Does nothing if the output file
// cannot be opened.
void FilterSequence::Filter_Database(char *ifile, istream& cin, char *ofile) {

  ofstream outfile(ofile, ios::out);
  if (!outfile)
    return;

  // parse, filter, parse, filter ... until the parser reports the end
  for (this->parse(cin); !end_of_database; this->parse(cin))
    filter_function(ifile, outfile);

  outfile.close();
}

// Run the filter rules in `ifile` over every entry of the database file
// `datafile` and write the loci that pass to `ofile`.
//
// Returns without doing anything if either the output file or the
// database file cannot be opened; parsing from a stream that never
// opened is not attempted.
void FilterSequence::Filter_Database(char *ifile, char *datafile, char *ofile) {

  ofstream outfile(ofile, ios::out);
  if (!outfile)
    return;

  ifstream fdata(datafile);
  if (!fdata) {
    outfile.close();
    return;
  }

  this->parse(fdata);

  while (!end_of_database) {
    filter_function(ifile, outfile);
    this->parse(fdata);
  }

  outfile.close();
}


void FilterSequence::Filter_N_Entries(char *ifile, istream& cin, char *ofile, int N) {

  ofstream outfile(ofile, ios::out);
  if (!outfile)
    return;

  for (int i=0; i < N; i++) {
    this->parse(cin);
    filter_function(ifile, outfile);
  }

  outfile.close();
}

void FilterSequence::Filter_N_Entries(char *ifile, char *dfile, char *ofile, int N) {

  ofstream outfile(ofile, ios::out);
  if (!outfile)
    return;
  ifstream finput(dfile);
  
  for (int i=0; i < N; i++) {
    this->parse(finput);
    filter_function(ifile, outfile);
  }

  outfile.close();
}


//  NOTE: it's really not so efficient of me to keep opening and closing
//   the input filter file for each gene, but it really keeps the code
//   clean and short.  if time becomes an issue later, then this can be
//   changed around, but for now i keep it.

// Apply every rule line of the filter file `file` to the entry currently
// held by this object.  If all rules pass, append "<counter>.<locus>" to
// `cout`; the counter is shared by all calls and starts at 1.
// If the filter file cannot be opened nothing is written.
void FilterSequence::filter_function(char *file, ostream& cout) {

  const int maxlen = 200;

  // two bytes of headroom beyond what getline() may store, in case
  // filter() extends the line in place
  char buf[maxlen + 2];
  static int counter = 1;

  ifstream infile(file, ios::in);
  if (!infile)
    return;

  // read in all lines of the file
  while (infile.getline(buf, maxlen)) {

    // one failing rule rejects the entry
    if (!filter(buf)) {
      infile.close();
      return;
    }
  }
  infile.close();

  // it has passed the filter...
  // add its locus to the output file
  cout << counter << "." << locus;
  counter++;
}


// If `line` starts with `label`, return a pointer to the first character
// after the label and any following blanks/tabs; otherwise return 0.
static char *fs_field_value(char *line, const char *label) {
  const size_t n = strlen(label);
  if (strncmp(line, label, n) != 0)
    return 0;

  char *p = line + n;
  while (*p == ' ' || *p == '\t')
    p++;
  return p;
}

// True if a Yes/No rule value starts with "Yes".
static bool fs_says_yes(const char *value) {
  return strncmp(value, "Yes", 3) == 0;
}

// Evaluate a numeric rule value against `actual`.
//   "*" or "<op> *"   -> wildcard, always true
//   "<number>"        -> actual > number
//   "< <number>"      -> actual < number
//   "<c> <number>"    -> any other leading character acts like '>'
// Anything that contains no number is rejected (false).
static bool fs_threshold_test(const char *value, double actual) {

  if (value[0] == '*')
    return true;
  // third character, but never read past the end of the string
  if (value[0] != '\0' && value[1] != '\0' && value[2] == '*')
    return true;

  char op = '>';
  long limit = 0;

  // try a bare number first; "%c %ld" would split "50" into '5' and 0
  if (sscanf(value, "%ld", &limit) != 1) {
    if (value[0] == '\0' || sscanf(value + 1, "%ld", &limit) != 1)
      return false;
    op = value[0];
  }

  if (op == '<')
    return actual < (double) limit;
  return actual > (double) limit;
}

// Apply one rule line from a filter file to the current entry.
//
// `buf` has the form "<Field name>: <value>".  Text rules (Locus,
// Organism) are matched with filter_test(); Yes/No rules only constrain
// the entry when the value is "Yes"; numeric rules accept "*", a bare
// number (meaning "greater than") or "<"/">" followed by a number.
// Returns TRUE when the entry satisfies the rule.  An unknown field is
// reported on cerr and treated as satisfied.
bool FilterSequence::filter(char *buf) {

  char *value;

  if ((value = fs_field_value(buf, "Locus:")) != 0)
    return ( filter_test(value, locus));

  if ((value = fs_field_value(buf, "Organism:")) != 0) {

    // we'll just look for an occurence of the given string, so wrap it
    // in *'s -- in a private copy, so the caller's buffer is neither
    // overwritten nor overrun
    const size_t n = strlen(value);
    char *pattern = new char[n + 3];
    pattern[0] = '*';
    strcpy(pattern + 1, value);
    pattern[n + 1] = '*';
    pattern[n + 2] = '\0';

    const bool matched = filter_test(pattern, organism);
    delete[] pattern;
    return matched;
  }

  // ---- Yes/No rules ----

  if ((value = fs_field_value(buf, "CDS accurate:")) != 0) {
    if (fs_says_yes(value))
      return (CDS_Accurate());
    return TRUE;
  }

  if ((value = fs_field_value(buf, "Region accurate:")) != 0) {
    if (fs_says_yes(value))
      return (Region_Accurate());
    return TRUE;
  }

  if ((value = fs_field_value(buf, "No stops in Exon:")) != 0) {
    if (fs_says_yes(value))
      return (!stops_In_Exon());
    return TRUE;
  }

  if ((value = fs_field_value(buf, "No Annotated Repeats:")) != 0) {
    if (fs_says_yes(value))
      return (!Annotated_repeats);
    return TRUE;
  }

  if ((value = fs_field_value(buf, "Contains an Intron:")) != 0) {
    if (fs_says_yes(value))
      return (get_marker_num());
    return TRUE;
  }

  if ((value = fs_field_value(buf, "Experimental Exons:")) != 0) {
    if (fs_says_yes(value))
      return (All_exons_experimental);
    return TRUE;
  }

  if ((value = fs_field_value(buf, "No Unknown Basepairs:")) != 0) {
    if (fs_says_yes(value))
      return (!base_count.contains("other"));
    return TRUE;
  }

  if ((value = fs_field_value(buf, "Type = DNA:")) != 0) {
    if (fs_says_yes(value))
      return (seq_type.contains("DNA"));
    return TRUE;
  }

  // ---- numeric rules ----

  if ((value = fs_field_value(buf, "CDS length:")) != 0)
    return fs_threshold_test(value, (double) cds_seqlength);

  if ((value = fs_field_value(buf, "Number of regions:")) != 0)
    return fs_threshold_test(value, (double) get_region_num());

  if ((value = fs_field_value(buf, "Sequence Length:")) != 0)
    return fs_threshold_test(value, (double) get_length());

  if ((value = fs_field_value(buf, "GC percentage:")) != 0)
    return fs_threshold_test(value, GC_percentage());

  if ((value = fs_field_value(buf, "GC Exon Percentage:")) != 0)
    return fs_threshold_test(value, GC_exon_percentage());

  if ((value = fs_field_value(buf, "GC Intron Percentage:")) != 0)
    return fs_threshold_test(value, GC_intron_percentage());

  if ((value = fs_field_value(buf, "Minimum Exon Length:")) != 0)
    return fs_threshold_test(value, Minimum_Exon_Length());

  if ((value = fs_field_value(buf, "Minimum Intron Length:")) != 0)
    return fs_threshold_test(value, Minimum_Intron_Length());

  cerr << "Bad field: " << buf << "\n";
  return TRUE;
}


bool FilterSequence::filter_test(char *s_arg1, String s_arg2) {

  // NOTE: assumes s_arg2 is one of the Strings created when database was read
  //       in by seq.cc or gseq.cc (they begin with \t and end with \n).

  int index1 = 0, index2 = 0, i = 0, extra = 0;
  bool begin_asterisk = FALSE, end_asterisk = FALSE;
  char *token;

  // convert the 2 Strings to normal strings without *'s
  char arg1[200], arg2[200];

  if (s_arg1[0] == '*') {
    // if it's the only char then return TRUE
    if (s_arg1[1] == '\0')
      return TRUE;
    begin_asterisk = TRUE;
    extra = 1;
  }
  while (s_arg1[i+extra] != '\0' && s_arg1[i+extra] != '*') {
    arg1[i] = s_arg1[i+extra];
    i++;
  }
  if (s_arg1[i+extra] == '*')
    end_asterisk = TRUE;
  arg1[i] = '\0';

  i = 1;
  while (s_arg2[i] != '\n') {
    arg2[i-1] = s_arg2[i];
    i++;
  }
  arg2[i] = '\0';

  // if there's a beginning asterisk, start searching at the first occurence
  //  of arg1 in arg2
  if (begin_asterisk) {
    token = strstr(arg2, arg1);
    if (!token)
      return FALSE;
    index2 = token - arg2;
  }

  while (arg1[index1] != '\0') {

    // if they don't match...
    if (arg1[index1] != arg2[index2]) 
      return FALSE;
    
    index1++;   
    index2++;
  }
 
  // is it an exact match or not?
  if (arg2[index2] == '\0' || end_asterisk)
    return TRUE;
  else
    return FALSE;
}



// Write the entries of the database `seqfile` to `outfile` in the order
// in which their loci are listed in `infile`.
//
// `infile` holds lines of the form "<number>. <locus>"; other lines are
// ignored.  For every listed locus the database is scanned from the
// start and the matching entry (from its LOCUS line up to the next
// non-matching LOCUS line) is copied.  A progress line per copied entry
// is written to the global cout.
void FilterSequence::SortDatabase(char *infile, char *outfile, char *seqfile) {

  const int maxlen = 200;
  char buf[maxlen], temploc[maxlen];
  int number = 0, memory = 100;   // number: last entry number read

  ifstream ifile(infile, ios::in);
  if (!ifile)
    return;

  numberof_loci = 0;
  Loci = new char *[memory];

  // read in all lines of the file
  while (ifile.getline(buf, maxlen)) {

    // allocate more memory if we don't have enough
    if (numberof_loci == memory) {
      memory += 100;
      char **newLoci = new char *[memory];
      for (int i=0; i < numberof_loci; i++)
        newLoci[i] = Loci[i];
      delete[] Loci;
      Loci = newLoci;
    }

    // if it's a good valid line, copy it into the Loci array
    if (sscanf (buf, "%d%*c %s", &number, temploc) == 2) {

      Loci[numberof_loci] = new char[strlen(temploc) + 1];
      strcpy(Loci[numberof_loci], temploc);
      numberof_loci++;
    }
  }

  ifile.close();

  if (numberof_loci != number)
    cerr << "Not all entries read from locus file???\nLast entry: "
      << number << "\tNumber of entries: " << numberof_loci << "\n";

  // now get the database output file ready
  ofstream ofile(outfile, ios::out);

  if (ofile) {
    for (int i=0; i<numberof_loci; ++i) {

      // rescan the database from the top for every locus
      ifstream seqf(seqfile);
      if (!seqf)
        break;

      bool copy_flag = FALSE;

      // only lines that were actually read are processed
      while (seqf.getline(buf, maxlen)) {

        if (strncmp(buf, "LOCUS", 5) != 0) {
          if (copy_flag)
            ofile << buf << endl;
          continue;
        }

        // the locus name starts at column 12; shorter lines have none
        temploc[0] = '\0';
        if (strlen(buf) > 12)
          sscanf(buf+12, "%s", temploc);

        if (strcmp(temploc, Loci[i]) == 0) {
          cout << i << ". " << temploc << endl;
          copy_flag = TRUE;
          ofile << buf << endl;
        }
        else if (copy_flag)
          break;          // reached the entry after the wanted one
      }
    }

    ofile.close();
  }

  // clean up (the strings were allocated with new[])
  for (int i=0; i < numberof_loci; i++)
    delete[] Loci[i];
  delete[] Loci;
  Loci = 0;
}


// Translate a list of accession numbers into locus names.
//
// `infile` holds lines of the form "<number>. <accession>".  The database
// `seqfile` is scanned once; whenever an ACCESSION line matches a listed
// accession, the name from the most recent LOCUS line is remembered for
// it.  `outfile` then receives "<index>.\t<locus>" for every accession
// that was found; accessions without a match are reported on cerr and
// left out, since there is no locus name to write for them.
// Progress information is written to the global cout.
void FilterSequence::AccessionToLocus(char *infile, char *outfile, char *seqfile) {

  const int maxlen = 200;
  char buf[maxlen], temploc[maxlen];
  char lastLocus[maxlen];
  int number = 0, memory = 100;   // number: last entry number read

  lastLocus[0] = '\0';

  ifstream seqf(seqfile);

  ifstream ifile(infile, ios::in);
  if (!ifile)
    return;

  numberof_loci = 0;
  Loci = new char *[memory];

  // read in all lines of the file
  while (ifile.getline(buf, maxlen)) {

    // allocate more memory if we don't have enough
    if (numberof_loci == memory) {
      memory += 100;
      char **newLoci = new char *[memory];
      for (int i=0; i < numberof_loci; i++)
        newLoci[i] = Loci[i];
      delete[] Loci;
      Loci = newLoci;
    }

    // if it's a good valid line, copy it into the Loci array
    if (sscanf (buf, "%d%*c %s", &number, temploc) == 2) {

      Loci[numberof_loci] = new char[strlen(temploc) + 1];
      strcpy(Loci[numberof_loci], temploc);
      numberof_loci++;
    }
  }

  ifile.close();

  if (numberof_loci != number)
    cerr << "Not all entries read from locus file???\nLast entry: "
      << number << "\tNumber of entries: " << numberof_loci << "\n";

  // now get the database output file ready
  ofstream ofile(outfile, ios::out);
  if (!ofile) {
    for (int i=0; i < numberof_loci; i++)
      delete[] Loci[i];
    delete[] Loci;
    Loci = 0;
    return;
  }

  // one result slot per accession; 0 means "not found"
  char **trueLoci = new char *[numberof_loci + 1];
  found_this = new bool[numberof_loci + 1];
  for (int i=0; i < numberof_loci; i++) {
    trueLoci[i] = 0;
    found_this[i] = FALSE;
  }

  int seqno = 0;

  // only lines that were actually read are processed
  while (seqf.getline(buf, maxlen)) {

    // if it's a locus, remember its name (it starts at column 12)
    if (!strncmp(buf, "LOCUS", 5)) {
      lastLocus[0] = '\0';
      if (strlen(buf) > 12)
        sscanf(buf+12, "%s", lastLocus);
      ++seqno;
      if (!(seqno%100)) cout << endl << "******************" << seqno << endl;
    }

    if (!strncmp(buf, "ACCESSION", 9)) {
      temploc[0] = '\0';
      if (strlen(buf) > 12)
        sscanf(buf+12, "%s", temploc);

      for (int i=0; i < numberof_loci; i++) {

        if (strcmp(temploc, Loci[i]) == 0) {
          found_this[i] = TRUE;

          // a repeated match replaces the earlier name
          delete[] trueLoci[i];
          trueLoci[i] = new char[strlen(lastLocus) + 1];
          strcpy(trueLoci[i], lastLocus);

          cout << temploc << "\t->\t" << lastLocus << "\t Found !" << endl;
        }
      }
    }
  }

  for (int i=1; i<=numberof_loci; ++i) {
    if (trueLoci[i-1])
      ofile << i << "." << "\t" << trueLoci[i-1] << endl;
    else
      cerr << "No locus found for accession: " << Loci[i-1] << "\n";
  }

  ofile.close();

  // clean up
  for (int i=0; i < numberof_loci; i++) {
    delete[] trueLoci[i];
    delete[] Loci[i];
  }
  delete[] trueLoci;
  delete[] found_this;
  found_this = 0;
  delete[] Loci;
  Loci = 0;
}

// Extract the entries whose loci are listed in `infile` from the database
// read from `cin` and write them, in database order, to `outfile`.
//
// `infile` holds lines of the form "<number>. <locus>".  An entry is
// copied from its LOCUS line up to (not including) the next LOCUS line.
// Reading stops at the end of the input or as soon as every listed locus
// has been copied completely.
void FilterSequence::Create_Database(char *infile, char *outfile, istream& cin) {

  const int maxlen = 1000;
  char buf[maxlen], temploc[maxlen];
  int number = 0, memory = 100;   // number: last entry number read

  ifstream ifile(infile, ios::in);
  if (!ifile)
    return;

  numberof_loci = 0;
  Loci = new char *[memory];

  // read in all lines of the file
  while (ifile.getline(buf, maxlen)) {

    // allocate more memory if we don't have enough
    if (numberof_loci == memory) {
      memory += 100;
      char **newLoci = new char *[memory];
      for (int i=0; i < numberof_loci; i++)
        newLoci[i] = Loci[i];
      delete[] Loci;
      Loci = newLoci;
    }

    // if it's a good valid line, copy it into the Loci array
    if (sscanf (buf, "%d%*c %s", &number, temploc) == 2) {

      Loci[numberof_loci] = new char[strlen(temploc) + 1];
      strcpy(Loci[numberof_loci], temploc);
      numberof_loci++;
    }
  }

  ifile.close();

  if (numberof_loci != number)
    cerr << "Not all entries read from locus file???\nLast entry: "
      << number << "\tNumber of entries: " << numberof_loci << "\n";

  // now get the database output file ready
  ofstream ofile(outfile, ios::out);
  if (!ofile) {
    for (int i=0; i < numberof_loci; i++)
      delete[] Loci[i];
    delete[] Loci;
    Loci = 0;
    return;
  }

  int loci_left = numberof_loci;   // listed entries not yet fully copied
  found_this = new bool[numberof_loci + 1];
  for (int i=0; i < numberof_loci; i++)
    found_this[i] = FALSE;
  bool copy_flag = FALSE;

  // only lines that were actually read are processed, so nothing stale
  // is written once the input runs out
  while (cin.getline(buf, maxlen)) {

    if (!strncmp(buf, "LOCUS", 5)) {

      // the locus name starts at column 12; shorter lines have none
      temploc[0] = '\0';
      if (strlen(buf) > 12)
        sscanf(buf+12, "%s", temploc);

      // a new LOCUS line ends the entry that was being copied
      if (copy_flag)
        loci_left--;

      if (found_good_locus(temploc)) {
        copy_flag = TRUE;
        ofile << buf << endl;
      }
      else
        copy_flag = FALSE;
    }

    // not a locus line -- do we pipe it to the output file?
    else if (copy_flag)
      ofile << buf << endl;

    if (!loci_left)
      break;
  }

  ofile.close();

  // clean up (the strings were allocated with new[])
  for (int i=0; i < numberof_loci; i++)
    delete[] Loci[i];
  delete[] found_this;
  found_this = 0;
  delete[] Loci;
  Loci = 0;
}

// Look for temploc among the listed loci that have not been matched yet.
// The first such entry is marked in found_this and TRUE is returned;
// FALSE if there is none.
bool FilterSequence::found_good_locus(char *temploc) {

  int slot = 0;

  while (slot < numberof_loci) {

    const bool unused = !found_this[slot];
    if (unused && strcmp(temploc, Loci[slot]) == 0) {
      found_this[slot] = TRUE;
      return TRUE;
    }

    slot++;
  }

  return FALSE;
}


// Compare the protein sequence stored in the file `cds_file` with
// CDSsequence.  Carriage returns and newlines in the file are ignored.
//
// Returns TRUE only if the file holds exactly cds_seqlength characters
// and every one of them equals the corresponding CDSsequence character.
// Returns FALSE if there is no CDS sequence or the file cannot be opened.
bool FilterSequence::CDS_matches(char *cds_file) {

  if (!cds_seqlength)
    return FALSE;

  ifstream file(cds_file, ios::in);
  if (!file)
    return FALSE;

  char ch;
  int counter = 0;

  // a character is examined only when the read succeeded, so the last
  // character is not processed a second time at end of file
  while (file.get(ch)) {

    if (ch == '\r' || ch == '\n')
      continue;

    // is the sequence too long?
    if (counter == cds_seqlength) {
      file.close();
      return FALSE;
    }

    // does the sequence match?
    if (ch != CDSsequence[counter]) {
      file.close();
      return FALSE;
    }

    counter++;
  }

  file.close();

  // a file that ended early is not a match either
  return (counter == cds_seqlength) ? TRUE : FALSE;
}

// End of code











