// This is a C++ header file

#ifndef __GSEQ_H__
#define __GSEQ_H__

#include <iostream.h>
#include <String.h>
#include "seq.h"


// GeneSequence interface
  
// Classification of a stretch of a gene sequence.
// NOTE(review): CEXON / NCEXON presumably mean coding / non-coding exon --
// confirm against the parser.
enum RegionType {
  REGION_ANY     = 0,
  REGION_INTRON  = 1,
  REGION_CEXON   = 2,
  REGION_NCEXON  = 3,
  REGION_NOTHING = 4
};

// Bit values OR-ed into Region::flags.  Each one is a distinct bit.
enum RegionFlags {
  START_INEXACT = 1,
  STOP_INEXACT  = 2
};

// Reading frame attached to a Region.
enum ReadingFrame {
  FRAME_0   = 0,
  FRAME_1   = 1,
  FRAME_2   = 2,
  FRAME_ANY = 3
};

// A Region is merely a subsection of a gene sequence encoding either an
//   intron or an exon.  A region consists of a start and stop point as
//   well as a region_type and flags indicating uncertainties.

class Region {

public:

  long start, stop;     // first and last position; length() treats both as inclusive
  RegionType type;
  int flags;            // bitwise OR of RegionFlags values
  ReadingFrame frame;
  bool experimental;

  // Builds a region covering [rstart, rstop] of the given type.
  // No flags are set, the region is not experimental, and the reading frame
  // starts out as FRAME_ANY (it used to be left uninitialized, so reading it
  // before an explicit assignment produced an indeterminate value).
  Region(long rstart, long rstop, RegionType rtype)
    : start(rstart),
      stop(rstop),
      type(rtype),
      flags(0),
      frame(FRAME_ANY),
      experimental(false)
  {
  }

  // Number of positions covered, counting both end points.
  // The result is narrowed from long to int, as in the original interface.
  int length() const { return stop - start + 1; }

  // if we're not sure of the exact start or stop place...
  void set_start_inexact() { flags |= START_INEXACT; }
  void set_stop_inexact() { flags |= STOP_INEXACT; }

  // True when the corresponding set_*_inexact() bit is present in flags.
  bool is_start_inexact() const { return (flags & START_INEXACT) != 0; }
  bool is_stop_inexact() const { return (flags & STOP_INEXACT) != 0; }
};

// Kind of position a Marker denotes.  MARK_ANY is also the default filter
// value used by GeneSequence::get_marker_after().
enum MarkerType { MARK_ANY, MARK_ISTART, MARK_ISTOP };

// Bit values OR-ed into Marker::flags.
// MARK_INEXACT must be non-zero: with the implicit value 0 the bit could
// never be set, so Marker::is_inexact() always reported false.
enum MarkerFlags { MARK_INEXACT = 1 };

// A Marker refers to a place in a gene sequence which either begins or ends
//   an intron.  A marker consists of a marker point as well as a
//   marker_type and flags indicating uncertainties.

class Marker {

public:

  long pos;
  MarkerType type;
  int flags;            // bitwise OR of MarkerFlags values

  // Builds a marker of the given type at mpos with no flags set.
  Marker(long mpos, MarkerType mtype)
    : pos(mpos),
      type(mtype),
      flags(0)
  {
  }

  // if we're not sure of the exact start or stop place...
  void set_inexact() { flags |= MARK_INEXACT; }

  // True once set_inexact() has been called on this marker.
  bool is_inexact() const { return (flags & MARK_INEXACT) != 0; }
};

// Feature kinds, numbered consecutively from 0.  The explicit values below
// are the same ones the compiler assigned implicitly before.
enum FeatureTypes {
  EXON            = 0,
  INTRON          = 1,
  SOURCEfeature   = 2,
  CDSfeature      = 3,
  MRNA            = 4,
  VARIATION       = 5,
  TATA_SIGNAL     = 6,
  POLYA_SIGNAL    = 7,
  POLYA_SITE      = 8,
  PRIM_TRANSCRIPT = 9,
  ALLELE          = 10,
  REPEAT_REGION   = 11,
  PROMOTER        = 12
};

// Holds either a pair of long values or a pair of char values, depending on
// which constructor was used.  The members not supplied by the chosen
// constructor are zero-initialized (they used to be left uninitialized).
//
// Note: a call with two plain int arguments is ambiguous between the
// (long, long) and (char, char) constructors; pass longs or chars explicitly.
class ValuePair {

public:
  long val1, val2;
  char cval1, cval2;

  // Pair of long values; the char members are set to '\0'.
  ValuePair(long v1, long v2)
    : val1(v1), val2(v2), cval1('\0'), cval2('\0')
  {
  }

  // Pair of char values; the long members are set to 0.
  ValuePair(char v1, char v2)
    : val1(0), val2(0), cval1(v1), cval2(v2)
  {
  }

  // Single long value; val2 is 0 and the char members are set to '\0'.
  // Deliberately left non-explicit so existing implicit conversions from
  // long keep compiling.
  ValuePair(long v)
    : val1(v), val2(0), cval1('\0'), cval2('\0')
  {
  }
};


// A DNA/RNA sequence with intron/exon information

class GeneSequence : public Sequence {

public:
  // Builds a sequence by loading it from the given input stream.
  GeneSequence(istream& in);

  // Default construction and (virtual) destruction.
  GeneSequence();
  virtual ~GeneSequence();

  // ---- Region access ----
  // Regions come directly from the 'intron' & 'exon' directives.
  int get_region_num() const;
  Region* get_region(int index);

  Region* firstCExon();
  Region* lastCExon();
  Region* nuc2reg(long pos);
  RegionType whatRegion(long pos);

  // ---- Marker access ----
  // A mark is the start of an intron or the end of an intron.  Markers
  // allow unification of the information provided by the regions.
  int get_marker_num() const;
  Marker* get_marker(int index);

  // ---- Feature accessors ----
  // NOTE(review): the meaning of the index argument `i` is not visible in
  // this header -- confirm against the implementation.
  int promoter_start(int i) const;
  int promoter_stop(int i) const;
  int promoter_len(int i) const;
  int TATA_start() const;
  int TATA_stop() const;
  int no_CDS() const;
  int CDS_start(int i) const;
  int CDS_stop(int i) const;

protected:

  // Overridable set-up / tear-down hooks.
  virtual void initialize();
  virtual void destroy();

  // Region and marker bookkeeping.
  Region* add_region(long from, long to, RegionType type);
  Region* insert_region(long from, long to, RegionType type, int region);
  void split_region(int region_num,
                    RegionType first,
                    RegionType second,
                    long location);
  Marker* add_marker(long pos, MarkerType type);
  Marker* get_marker_after(long pos, MarkerType type = MARK_ANY);
  void set_last_exon_experimental();
  void add_reading_frames();

  // Parsing helpers.
  virtual bool parse_line(const char* buf);
  int find_index(char* keyword, int length);
  void finish_parse(const char* buf);
  bool read_values(const char*, long*, long*);
  bool read_cvalues(const char*, long*, long*, char*, char*);
  void read_cds(const char* buf);
  int parse_join(char*, long[][2], char[][2]);

  // Output helpers.
  void unparse_header(ostream& out);
  void unparse_features(ostream& out);
  void unparse_line(ostream& out, int from);


  // ---- protected data ----
  Region** regions;
  ValuePair** CDS;
  char* CDSsequence;
  int cds_seqlength;
  int CDSnumber;
  bool Annotated_repeats;
  bool All_exons_experimental;


private:
  // Flags used while reading in fields.
  bool joinflag;
  bool cdsflag;
  bool cds_seqflag;
  bool extra_exon_line;
  bool extra_cds_flag;

  int region_num;
  int region_max;
  int last_exon;


  // *** feature holders ***
  int cds_seqmemory;
  int CDSmax;
  int maxmRNA;
  int maxallele;

  // The actual feature data.
  long prim_transcript[2];
  long polyA_site[2];
  long polyA_signal[2];
  long TATA_signal[2];
  long isource;
  long promoter[2];
  ValuePair** mRNA;
  ValuePair** allele;
  String osource;
  String csource;
  String CDSproduct;

  // Details of inexactness.
  ValuePair** mRNAinexact;
  char ptranscript_inexact[2];

  // Entry counts for features that can occur more than once.
  int mRNAnumber;
  int allelenumber;

  // replaces, one-ofs, and complements
  //  long polyA_signal_oneof[2][2], prim_transcript_oneof[2][2];
  //  long polyA_signal_complement[2][2];
  //  long allele_replace[2];
  //  char allele_creplace[5];

  int marker_num;
  int marker_max;
  Marker** markers;

  // Length of the left margin in the input file.
  static const int leftmarginstart = 5;
  static const int leftmargin = 16;
  static const int maxbuf = 50000;

  // Number of possible keywords.
  static const int numberofkeywords = 13;

  // The possible keywords, one fixed-width row per keyword.
  char mainkeywords[numberofkeywords][leftmargin];

  // Flag for bad lines.
  bool warningflag;

  // Index of the keyword currently being read.
  int current_keyword_index;
};

#endif // __GSEQ_H__

// End of header file

