// This is a C++ code file

/**** This code was rewritten by Eric Banks -- June '98  ****/

#include <ctype.h>
#include <String.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream.h>
#include <fstream.h>
#include "seq.h"
#include "common.h"

// Returns the complementary base: A <-> T and C <-> G.  BASE_UNKNOWN maps to
// itself.  Any other value is a programming error and trips the assertion;
// BASE_UNKNOWN is returned afterwards so the function still has a defined
// result when assertions are compiled out (NDEBUG), instead of running off
// the end of a value-returning function.
Nucleotide nuclComplement(Nucleotide n) {
  switch (n) {
  case BASE_UNKNOWN: return BASE_UNKNOWN;
  case BASE_A:       return BASE_T;
  case BASE_C:       return BASE_G;
  case BASE_G:       return BASE_C;
  case BASE_T:       return BASE_A;
  default:
    break;
  }

  assert(FALSE);
  return BASE_UNKNOWN;
}

      
// Translates one sequence character into its Nucleotide code.
// a/c/g/t are accepted in either case; every other character yields
// BASE_UNKNOWN.
Nucleotide c2b(char ch)
{
  if (ch == 'a' || ch == 'A')
    return BASE_A;

  if (ch == 'c' || ch == 'C')
    return BASE_C;

  if (ch == 'g' || ch == 'G')
    return BASE_G;

  if (ch == 't' || ch == 'T')
    return BASE_T;

  return BASE_UNKNOWN;
}

// Member-function spelling of c2b(): forwards the character unchanged and
// returns whatever c2b() yields.
Nucleotide Sequence::char2base(char ch)
{
  return c2b(ch);
}

// Translates a Nucleotide code into its lower-case letter.
// Anything other than the four known bases is rendered as 'n'.
char b2c(Nucleotide base) {
  if (base == BASE_A)
    return 'a';

  if (base == BASE_C)
    return 'c';

  if (base == BASE_G)
    return 'g';

  if (base == BASE_T)
    return 't';

  return 'n';
}

// Member-function spelling of b2c(): forwards the base unchanged and returns
// whatever b2c() yields.
char Sequence::base2char(Nucleotide base)
{
  return b2c(base);
}

// Constructs a Sequence and immediately parses one record from the stream.
//
// The previous body wrote `Sequence::Sequence();`, which does not initialize
// *this (at best it builds and discards a separate temporary).  parse() then
// read indeterminate I_Exist / dbaseparserflag values and the keyword table
// used by find_index() was never filled in.  The same state that the default
// constructor establishes is therefore set up here directly.  A delegating
// constructor is deliberately not used because this file is written against
// a pre-C++11 toolchain.
//
// NOTE: keep this initialization in sync with Sequence::Sequence().
Sequence::Sequence(istream &cin) {

  I_Exist = FALSE;

  seqlen = buflen = current_keyword_index = 0;
  seq = NULL;

  end_of_database = FALSE;
  warningflag = FALSE;
  origin = FALSE;
  features = FALSE;
  dbaseparserflag = FALSE;

  // define our main keywords
  mainkeywords[0] = "LOCUS      ";
  mainkeywords[1] = "DEFINITION ";
  mainkeywords[2] = "ACCESSION  ";
  mainkeywords[3] = "KEYWORDS   ";
  mainkeywords[4] = "SOURCE     ";
  mainkeywords[5] = "  ORGANISM ";
  mainkeywords[6] = "REFERENCE  ";
  mainkeywords[7] = "  AUTHORS  ";
  mainkeywords[8] = "  TITLE    ";
  mainkeywords[9] = "  JOURNAL  ";
  mainkeywords[10] = "  MEDLINE  ";
  mainkeywords[11] = "COMMENT    ";
  mainkeywords[12] = "FEATURES   ";
  mainkeywords[13] = "BASE COUNT ";
  mainkeywords[14] = "ORIGIN     ";

  // the object is now in a well-defined empty state, so parsing is safe
  parse(cin);
}

// Default constructor: produces an empty sequence (no buffer, zero lengths,
// all state flags cleared) and fills the table of keyword strings that
// find_index() matches against.
Sequence::Sequence() {

  // no sequence data is held yet
  seq = NULL;
  seqlen = 0;
  buflen = 0;
  current_keyword_index = 0;

  // every state flag starts out cleared
  I_Exist = FALSE;
  dbaseparserflag = FALSE;
  end_of_database = FALSE;
  warningflag = FALSE;
  features = FALSE;
  origin = FALSE;

  // keyword strings, in the order of the indices used by finish_parse()
  mainkeywords[0]  = "LOCUS      ";
  mainkeywords[1]  = "DEFINITION ";
  mainkeywords[2]  = "ACCESSION  ";
  mainkeywords[3]  = "KEYWORDS   ";
  mainkeywords[4]  = "SOURCE     ";
  mainkeywords[5]  = "  ORGANISM ";
  mainkeywords[6]  = "REFERENCE  ";
  mainkeywords[7]  = "  AUTHORS  ";
  mainkeywords[8]  = "  TITLE    ";
  mainkeywords[9]  = "  JOURNAL  ";
  mainkeywords[10] = "  MEDLINE  ";
  mainkeywords[11] = "COMMENT    ";
  mainkeywords[12] = "FEATURES   ";
  mainkeywords[13] = "BASE COUNT ";
  mainkeywords[14] = "ORIGIN     ";
}

// Destructor: hands the nucleotide buffer back through destroy().
Sequence::~Sequence() {
  this->destroy();
}

void Sequence::destroy() {
  if (seq) 
    delete[] seq;
  seq = NULL;
}

// Resets the per-record state before a new record is read: marks the object
// as existing, clears counters, flags and the buffer pointer, and empties
// every text field.
// Note that `seq` is only set to NULL here, not freed; callers in this file
// invoke destroy() beforehand when a buffer may be held.
void Sequence::initialize() {

  I_Exist = TRUE;

  // no sequence data yet
  seq = NULL;
  seqlen = 0;
  buflen = 0;
  current_keyword_index = 0;

  // parser state flags
  end_of_database = FALSE;
  warningflag = FALSE;
  origin = FALSE;
  features = FALSE;

  // header fields start out empty
  locus = "";
  seq_type = "";
  order = "";
  date = "";
  definition = "";
  accession = "";
  keywords = "";
  source = "";
  organism = "";
  reference = "";
  authors = "";
  title = "";
  journal = "";
  medline = "";
  comment = "";
  base_count = "";
}


// Number of nucleotides currently stored.
long Sequence::get_length() const {
  const long count = seqlen;
  return count;
}


// Returns the nucleotide at position idx, where positions run from 1 to
// seqlen (NOT 0 to seqlen-1).  An out-of-range idx trips the assertion.
Nucleotide Sequence::get(long idx) const {

  assert(1 <= idx && idx <= seqlen);

  // translate the 1-based position into the 0-based storage offset
  const long offset = idx - 1;
  return seq[offset];
}


// Alias for get(): same 1-based position, same range assertion.
Nucleotide Sequence::get_nucleotide(long idx) const {
  const Nucleotide base = get(idx);
  return base;
}


// Raw array-style access to the stored nucleotides.  Unlike get(), the index
// is used as-is as a 0-based storage offset and is not range-checked.
Nucleotide& Sequence::operator[](long idx) {
  return *(seq + idx);
}


void Sequence::parse(istream& cin) {
  cout << "parsing" << endl;

  char buf[maxbuf];

  if (I_Exist && !dbaseparserflag)  
    destroy();

  // check to see whether a dbaseparser has already initialized us
  if (!dbaseparserflag)
    initialize();
  // reset the flag
  else
    dbaseparserflag = FALSE;
  
  // check first if it's a "FAST-A" format...
  // peek() will let you look at the first char without moving position
  // FAST-A _always_ begins with a '>'
  char cinpeek = cin.peek();

  if (cinpeek == '>') {
    cout << "to parse fasta" << endl;
    parse_fasta(cin);
    return;
  }
  else {
    cout << "peek: " << (char)cinpeek << endl;
    cout << "not fasta !" << endl;
  }

  do {
    cin.getline(buf, maxbuf);    // read lines while the buffer is good
  }                              //  and while in the middle of a sequence
    while (cin.good() && parse_line(buf));
  
  if (!cin.good()) {
    end_of_database = TRUE;
    return;
  }
}

// Reads one line from the stream into buf, stopping at '\n' or '\r' (the
// terminator is consumed, not stored), at a read failure/EOF, or when buf is
// full.  buf is always NUL-terminated and never receives more than maxbuf
// bytes including the terminator.
//
// Fixes over the previous version:
//  * the terminator used to be written at buf[maxbuf] (one past the end)
//    when a line filled the buffer;
//  * the character read at the moment the buffer filled up was silently
//    discarded; now nothing is read once the buffer is full, so the rest of
//    an over-long line is returned intact by the next call.
//
// Each stored character is also echoed to the global cout, as before.
// NOTE(review): the echo looks like leftover debugging output -- confirm
// before removing.
void mygetline(istream &cin, char *buf, int maxbuf) {
  if (buf == NULL || maxbuf <= 0)
    return;

  int used = 0;

  // leave room for the terminating NUL
  while (used < maxbuf - 1) {
    char c = '\0';
    cin.get(c);

    if (!cin.good() || c == '\n' || c == '\r')
      break;

    cout << c;
    buf[used++] = c;
  }

  buf[used] = '\0';
}

void Sequence::parse_fasta(istream &cin) {
  cout << "parsing fasta" << endl;

  char buf[maxbuf];
  
  cin.get();     // the '>'
  
//   cin.getline(buf, maxbuf);
  mygetline(cin, buf, maxbuf);

  locus = "";
  //  locus += '\t';
  locus += buf;
  //  locus += '\n';
  
  cout << "locus: " << locus << endl;
  do {
    if (cin.peek() == '>')       // new sequence...
      return;
    mygetline(cin, buf, maxbuf);
    
//     cin.getline(buf, maxbuf);    // read lines while the buffer is good
    fasta(buf);
  }                              
  while (cin.good());
  
  if (!cin.good()) {
    end_of_database = TRUE;
    return;
  }
}
void Sequence::fasta(const char *buf) {
  
  int length = strlen(buf);
  
  if (buflen < seqlen + length) {
    Nucleotide *newseq;
    long newlen = seqlen + length;
    
    newseq = new Nucleotide[newlen];
    memcpy(newseq, seq, seqlen*sizeof(Nucleotide));
    delete seq;
    seq = newseq;
    buflen = newlen;
  }
  
  for (int i = 0; i < length; i++)
    if (isalpha(buf[i])) {
      seq[seqlen] = char2base(buf[i]);
      seqlen++;
    }
    else
      return;
}



// Processes one line of a keyword-margin record.
//
// Returns FALSE when reading of the current record should stop (null buffer
// or a line starting with "//"), TRUE otherwise.  The first `leftmargin`
// columns hold the keyword; the text after the margin is handed to
// finish_parse().
//
// Fix over the previous version: lines shorter than the margin are now
// handled explicitly.  Previously strncpy() padded the keyword with NULs that
// the blank-trimming loop did not remove, so a keyword written without
// trailing padding (e.g. a bare "ORIGIN") was never recognized, and in the
// one case that could still match, `buf + leftmargin` pointed past the
// string terminator.
bool Sequence::parse_line(const char *buf) {

  // stop reading for end of sequence or bad lines
  if (!buf) return FALSE;
  if (buf[0] == '/' && buf[1] == '/') return FALSE;

  // if we're in the middle of reading the origin, keep going
  if (origin) {
    finish_parse(buf);
    return TRUE;
  }

  // if we're in the middle of reading the features, just quit...
  //  subclasses will deal with reading this stuff in...
  //  but we have to check for when features ends and basecount begins
  if (features && buf[0] == ' ')
    return TRUE;

  // number of margin characters actually present on this line
  int marginlen = 0;
  while (marginlen < leftmargin && buf[marginlen] != '\0')
    marginlen++;

  // copy the keyword from the margin (always NUL-terminated)
  char keyword[leftmargin + 1];
  memcpy(keyword, buf, marginlen);
  keyword[marginlen] = '\0';

  // "trim" the whitespace off the end of the keyword string
  //   note: we don't actually trim -- just remember length w/out whitespace
  int length = marginlen;
  while (length != 0 && keyword[length-1] == ' ')
    length--;

  // text following the margin; for a short line this is the terminator
  const char *rest = buf + marginlen;

  // for blank keyword send it on to be read or throw it out
  if (length == 0) {
    if (marginlen < leftmargin) {
      // empty or short all-blank line: not a continuation line; treated as
      //  an unrecognized keyword, exactly as before
      current_keyword_index = -1;
      warningflag = TRUE;
      return TRUE;
    }

    if (!warningflag)
      finish_parse(rest);
    return TRUE;
  }

  // for new keyword reset keyword index
  current_keyword_index = find_index(keyword, length);

  // for bad keyword
  if (current_keyword_index == -1)
    warningflag = TRUE;
  else {
    warningflag = FALSE;
    finish_parse(rest);
  }

  return TRUE;
}


// Looks up a keyword in mainkeywords, comparing only its first `length`
// characters.  Returns the index of the first match, or -1 if none matches.
int Sequence::find_index(char *keyword, int length) {

  int slot = 0;

  while (slot < numberofkeywords) {
    if (!strncmp(keyword, mainkeywords[slot], length))
      return slot;
    ++slot;
  }

  // nothing matched
  return -1;
}


// Starts a new record from a LOCUS line that the caller has already taken
// off the input stream: resets the object, parses the line, and leaves
// dbaseparserflag set so the following parse() does not re-initialize.
void Sequence::addLocus(const char *buf) {

  initialize();

  // tells parse() that initialization has already happened
  dbaseparserflag = TRUE;

  // index 0 is the "LOCUS" keyword
  current_keyword_index = 0;

  Sequence::parse_line(buf);
}


void Sequence::finish_parse(const char *buf) {

  // this procedure determines which variable we are writing to and then
  //   just adds the buffer to the variable in most cases.  

  switch(current_keyword_index) {
  
  case 0:      // keyword == "LOCUS"

     // here we break up the buffer into separate variables

     char str1[20], str2[20], str3[20], str4[20], tmpstr[5];
     long num;

     sscanf(buf, "%s%ld%s%s%s%s", str1, &num, tmpstr, str2, str3, str4);

     locus = "\t";  locus += str1;  locus += "\n";
     seq_type = "\t";  seq_type += str2;  seq_type += "\n";
     order = "\t";  order += str3;  order += "\n";
     date = "\t";  date += str4;  date += "\n";

     buflen = num + 60;	// allow for slight inaccuracies
     seq = new Nucleotide[buflen];
     break;

  case DEFINITION:

     definition += "\t";
     definition += buf;
     definition += "\n";
     break;

  case ACCESSION:

     accession += "\t";
     accession += buf;
     accession += "\n";
     break;

  case KEYWORDS:

     keywords += "\t";
     keywords += buf;
     keywords += "\n";
     break;

  case SOURCE:

     source += "\t";
     source += buf;
     source += "\n";
     break;

  case ORGANISM:

     organism += "\t";
     organism += buf;
     organism += "\n";
     break;

  case REFERENCE:

     reference += "\t";
     reference += buf;
     reference += "\n";
     break;

  case AUTHORS:

     authors += "\t";
     authors += buf;
     authors += "\n";
     break;

  case TITLE:

     title += "\t";
     title += buf;
     title += "\n";
     break;

  case JOURNAL:

     journal += "\t";
     journal += buf;
     journal += "\n";
     break;

  case MEDLINE:

     medline += "\t";
     medline += buf;
     medline += "\n";
     break;

  case COMMENT:

     comment += "\t";
     comment += buf;
     comment += "\n";
     break;

  case FEATS:

     features = TRUE;
     break;

  case BASE_COUNT:

     // we're done with features so set the features flag to false
     features = FALSE;

     base_count += "\t";
     base_count += buf;
     base_count += "\n";     
     break;

  case ORIGIN:

     origin = TRUE;

     // set current_keyword_index so that it hits default next time around
     current_keyword_index = numberofkeywords;
     break;

  default:

    // for bad value
    if (!origin) 
      return;

    // now read in the gene sequence...  it's pretty straightforward 
    //   look at the format of the database file to understand more clearly

    long tmpnum;
    if (sscanf(buf, "%ld", &tmpnum) && (tmpnum % 60) == 1) {
      if (buflen < seqlen + 60) {
        Nucleotide *newseq;
        long newlen = seqlen+60;

        newseq = new Nucleotide[newlen];
        memcpy(newseq, seq, seqlen*sizeof(Nucleotide));
        delete seq;
        seq = newseq;
        buflen = newlen;
      }

      for (int y = 0; y < 6; y++)
        for (int z = 0; z < 10; z++)
	  if (isalpha(buf[10+y*11+z])) {
	    seq[seqlen] = char2base(buf[10+y*11+z]);
  	    seqlen++;
	  }
  	  else
	    return;
      return;
    }
  }
  return;
}

// Writes the whole record: the header fields first, then the sequence in
// rows of 60 bases.
void Sequence::unparse(ostream& cout) {
  unparse_header(cout);

  int row_start = 0;
  while (row_start < seqlen) {
    unparse_line(cout, row_start);
    row_start += 60;
  }
}

// Writes every header field as a label line followed by the stored text,
// and finishes with the sequence length.
void Sequence::unparse_header(ostream& cout) {

  cout << "LOCUS\n"      << locus
       << "TYPE\n"       << seq_type
       << "ORDER\n"      << order
       << "DATE\n"       << date
       << "DEFINITION\n" << definition
       << "ACCESSION\n"  << accession
       << "KEYWORDS\n"   << keywords
       << "SOURCE \n"    << source
       << "ORGANISM\n"   << organism
       << "REFERENCE\n"  << reference
       << "AUTHORS\n"    << authors
       << "TITLE\n"      << title
       << "JOURNAL\n"    << journal
       << "MEDLINE\n"    << medline
       << "COMMENT\n"    << comment
       << "BASE COUNT\n" << base_count;

  cout << "LENGTH\t" << seqlen << "\n";
}

// Writes one row of sequence output: the 1-based position of the first base
// right-aligned in 9 columns, then up to 60 bases starting at 0-based offset
// `from`, in groups of ten each preceded by a single space.  The stream's
// field width is put back to its previous value afterwards.
void Sequence::unparse_line(ostream& cout, int from) {
  const int saved_width = cout.width();

  // position label
  cout.width(9);
  cout << from+1;

  // one past the last base to print on this row
  long stop = from + 60;
  if (seqlen < stop)
    stop = seqlen;

  for (long pos = from; pos < stop; ++pos) {
    // every tenth base opens a new group
    if ((pos - from) % 10 == 0)
      cout << " ";
    cout << base2char(seq[pos]);
  }
  cout << "\n";

  cout.width(saved_width);
}

istream& operator>>(istream& cin, Sequence& seq) {
  cout << "parsing begins" << endl;
  seq.parse(cin);
  return cin;
}

ostream& operator<<(ostream& cout, Sequence& seq) {
  seq.unparse(cout);
  return cout;
}


void Sequence::find_Nth_gene(istream& cin, int number) {

  // to prevent memory leaks
  if (I_Exist)
    destroy();

  char buf[maxbuf];
  int genecounter = 0;

  do {
    cin.getline(buf, maxbuf);  // read in lines until EOF or number is reached
    if (foundALocus(buf))      
      genecounter++;
  }
  while (cin.good() && genecounter != number);

  // return if the gene number doesn't exist
  if (!cin.good()) 
    exit(-1);

  // initialize the Sequence
  addLocus(buf);              // initialize the locus since we've removed it
  cin >> *this;               //   from the input stream...
}


// File-name convenience overload: opens `ifile` and runs the stream version
// of find_Nth_gene() on it, which performs the same steps this function used
// to spell out itself (release old data, scan for the number-th LOCUS line,
// exit(-1) if the stream stops being good, otherwise load the record).
void Sequence::find_Nth_gene(char *ifile, int number) {

  ifstream finput(ifile);

  // qualified call: always use this class's own stream implementation
  Sequence::find_Nth_gene(finput, number);
}


void Sequence::find_locus(istream& cin, const char *locus) {

  // to prevent memory leaks
  if (I_Exist)
    destroy();

  char buf[maxbuf];

  do {
    cin.getline(buf, maxbuf);   // read in lines until EOF or locus is found
  }
  while (cin.good() && !foundTheLocus(buf, locus));

  // return if the locus doesn't exist
  if (!cin.good()) 
    exit(-1);

  // initialize the Sequence
  addLocus(buf);           // initialize the locus since we've removed it
  cin >> *this;            //   from the input stream...
}


// File-name convenience overload: opens `ifile` and runs the stream version
// of find_locus() on it, which performs the same steps this function used to
// spell out itself (release old data, scan for the named LOCUS line,
// exit(-1) if the stream stops being good, otherwise load the record).
void Sequence::find_locus(const char *ifile, const char *locus) {

  ifstream finput(ifile);

  // qualified call: always use this class's own stream implementation
  Sequence::find_locus(finput, locus);
}


// Returns TRUE when `buf` is a LOCUS line whose name -- the first
// whitespace-delimited word at or after column 12 -- equals `locus`.
//
// Fixes over the previous version:
//  * a blank line left the scratch buffer unwritten, so it was compared
//    while still uninitialized;
//  * an unbounded "%s" copied words into a 200-byte buffer;
//  * `buf + 12` was used even when the line was shorter than 12 characters.
// The line is now examined in place, without a scratch buffer.
bool Sequence::foundTheLocus(const char *buf, const char *locus) {

  if (!buf || !locus) return FALSE;

  // the first word on the line must start with "LOCUS"
  const char *word = buf;
  while (isspace((unsigned char) *word))
    ++word;
  if (strncmp(word, "LOCUS", 5) != 0)
    return FALSE;

  // the locus name is looked for from column 12 onwards
  const size_t namecol = 12;
  if (strlen(buf) <= namecol)
    return FALSE;

  const char *name = buf + namecol;
  while (isspace((unsigned char) *name))
    ++name;

  // length of the name word
  size_t namelen = 0;
  while (name[namelen] != '\0' && !isspace((unsigned char) name[namelen]))
    ++namelen;

  // see if it's the correct locus
  if (namelen != 0 &&
      strlen(locus) == namelen &&
      strncmp(name, locus, namelen) == 0)
    return TRUE;

  return FALSE;
}


// Returns TRUE when the first whitespace-delimited word of `buf` starts with
// "LOCUS".
//
// Fix over the previous version: for an empty or all-blank line sscanf()
// stored nothing, so the comparison ran on an uninitialized scratch buffer
// (which could still hold "LOCUS" from an earlier call and miscount the
// blank line).  The line is now examined in place instead.
bool Sequence::foundALocus(const char *buf) {

  if (!buf) return FALSE;

  // skip leading whitespace, as the old "%s" conversion did
  const char *word = buf;
  while (isspace((unsigned char) *word))
    ++word;

  // see if it's a LOCUS line
  if (strncmp(word, "LOCUS", 5) == 0)
    return TRUE;

  return FALSE;
}

// Copies the sequence into an int array using 1-based positions:
// seqInt[0] is set to 0 and seqInt[i] receives the i-th nucleotide.
// The caller must supply room for get_length()+1 ints.
// Returns the number of nucleotides copied.
int seq2int(Sequence *seq, int *seqInt) {
  const int total = seq->get_length();

  seqInt[0] = 0;

  int copied = 0;
  while (copied < total) {
    seqInt[copied + 1] = (int)seq->seq[copied];
    ++copied;
  }

  return copied;
}


// Overwrites the first `newseql` nucleotides with the values from `newnuc`
// and shrinks the sequence to that length.  Returns the new length.
//
// `newnuc` is read at indices 1..newseql, as before.
// NOTE(review): this presumes callers pass a 1-based array (element 0
// unused), mirroring what seq2int() produces -- confirm against callers.
//
// Fix over the previous version: the internal buffer is 0-based (see get()
// and seq2int()), but the old loop wrote to seq[1..newseql].  That left
// seq[0] stale, shifted every base by one position and, for
// newseql == seqlen, wrote one element past the stored sequence.
int Sequence::modify_sequence(Nucleotide *newnuc, int newseql) {
  assert(newseql >= 0 && newseql <= seqlen);

  for (int i = 1; i <= newseql; ++i)
    seq[i-1] = newnuc[i];      // 1-based input -> 0-based storage

  return seqlen = newseql;
}



// End of code











