// This is a C++ header file

#ifndef __SEQ_H__
#define __SEQ_H__

// Upper-case aliases for the built-in bool literals.
#define FALSE false
#define TRUE true
#include <iostream.h>
#include <String.h>

// Binary encoding of the nucleotides.
// The four known bases occupy the values 0..3; BASE_UNKNOWN takes the
// next value (4).
enum Nucleotide {
  BASE_A       = 0,
  BASE_C       = 1,
  BASE_G       = 2,
  BASE_T       = 3,
  BASE_UNKNOWN = 4
};

// Identifiers for the main record keywords, numbered consecutively from 0.
// There are 15 enumerators (LOCUS .. ORIGIN).
// NOTE(review): presumably these index Sequence::mainkeywords, whose first
// dimension is also 15 -- confirm against the implementation file.
enum MAINKEYWORDS {
  LOCUS      = 0,
  DEFINITION = 1,
  ACCESSION  = 2,
  KEYWORDS   = 3,
  SOURCE     = 4,
  ORGANISM   = 5,
  REFERENCE  = 6,
  AUTHORS    = 7,
  TITLE      = 8,
  JOURNAL    = 9,
  MEDLINE    = 10,
  COMMENT    = 11,
  FEATS      = 12,
  BASE_COUNT = 13,
  ORIGIN     = 14
};

// Free helper functions on single nucleotides; their definitions are not
// part of this header.

// Maps a nucleotide to its complement.  Sequence::reverse_complement()
// applies it to every base.  How BASE_UNKNOWN is handled is not visible here.
Nucleotide nuclComplement(Nucleotide);
// NOTE(review): presumably converts a character to its Nucleotide code
// (same signature as Sequence::char2base) -- confirm against the definition.
Nucleotide c2b(char ch);
// NOTE(review): presumably the inverse of c2b, Nucleotide code to character
// (same signature as Sequence::base2char) -- confirm against the definition.
char b2c(Nucleotide base);

// Generic DNA/RNA sequence
class Sequence {

 public:
  // Construct and load from a stream
  Sequence(istream& cin);
  
  // Generic constructor/destructor
  Sequence();
  virtual ~Sequence();

  // Use these for an array-like indexing of the sequence
  long get_length() const;
  Nucleotide get_nucleotide(long idx) const;  // these 2 are identical
  Nucleotide get(long idx) const;
  
  // Operator aliases
  friend istream& operator>>(istream& cin, Sequence& seq);
  friend ostream& operator<<(ostream& cout, Sequence& seq);

  // Useful static stuff
  static Nucleotide char2base(char ch);
  static char base2char(Nucleotide base);

  // database parsing stuff
  // finds nth gene or specific locus string
  void find_Nth_gene(istream&, int);  
  void find_Nth_gene(char*, int);  
  void find_locus(istream&, const char *);
  void find_locus(const char *, const char *);
  const char* get_locus() { 
    int pos = 0;
    for (int i = 0; i < (int)locus.length(); i++) {
      if (locus.at(i,1) == " " || locus.at(i,1) == "\t") 
	pos++;
      if (locus.at(i,1) == "\n")
	locus.at(i,1) = "\0";
    }
    return locus.chars() + pos;
    
  }

  void reverse_complement() {
    Nucleotide *tempSeq = new Nucleotide[seqlen];
    int i;
    for (i=0; i<seqlen; ++i)
      tempSeq[i] = seq[i];
    for (i=0; i<seqlen; ++i)
      seq[i] = nuclComplement(tempSeq[seqlen-1-i]);
    delete[] tempSeq;
  }

  // Use these for an array-like indexing of the sequence
  Nucleotide& operator[](long idx);
  
  bool end_of_database;  // a flag

  int modify_sequence(Nucleotide *newseq, int newlength);
 protected:
  
  // Use for reading/writing a sequence from a stream
  virtual void parse(istream& cin);
  void parse_fasta(istream& cin);
  void fasta(const char *buf);
  virtual void unparse(ostream& cout);

  // gene finders
  bool foundTheLocus(const char*, const char*);  
  bool foundALocus(const char*);  
  void addLocus(const char *buf);

  // do nothing constructor
  Sequence(int) {};

  // the real constructor and destructor
  virtual void initialize();
  virtual void destroy();

  // Override this in subclasses to handle specific information
  virtual bool parse_line(const char *buf);
  int find_index(char *keyword, int length);
  void finish_parse(const char *buf);

  // Override this to add stuff to the output
  virtual void unparse_header(ostream& cout);
  virtual void unparse_line(ostream& cout, int from);

  // protected variables
  String locus, source, seq_type, base_count, organism;
  bool I_Exist, features; 

  long seqlen;
 private:
  Nucleotide *seq;
  long buflen;
  bool origin, dbaseparserflag;

  // header strings
  String order, date, definition, accession, keywords,
    reference, authors, title, journal, medline, comment; 


  // added to help read in multiple lines for one field...

   // length of left margin in input file  
  static const int leftmargin = 12;
  static const int maxbuf = 50000;

   // number of possible keywords
  static const int numberofkeywords = 15;

   // possible keywords         // make sure beginning whitespace is included
  char mainkeywords[numberofkeywords][leftmargin];

   // flag for bad lines
  bool warningflag;

   // index to current keyword being read
  int current_keyword_index;
  friend int seq2int(Sequence*, int *);
  
};

// Writes an integer representation of *seq into the caller-supplied array
// seqInt (declared a friend of Sequence, so it can read the private buffer).
//  - Does NOT allocate memory for seqInt; the caller provides the storage.
//  - Returns the sequence length.
//  - seqInt[0] is arbitrary (in fact it is 0).
// NOTE(review): presumably the bases land in seqInt[1 .. length], so the
// array would need at least length + 1 entries -- confirm against the
// definition.
int seq2int(Sequence *seq, int *seqInt);

#endif // __SEQ_H__

// End of header file




















