#ifndef __REP_IDENTIFY_H__
#define __REP_IDENTIFY_H__

#include <iostream.h>
#include <String.h>
#include <fstream.h>
#include <unistd.h>
#include <errno.h>
#include "common.h"
#include "vector.h"
#include "fseq.h"
#include "rep_entry.h"
#include <sys/wait.h>

// Bit flags naming the kinds of region a repeat may overlap. The values are
// distinct powers of two so that several flags can be summed/or-ed into a
// single int mask.
enum RepeatType {
  REP_INTRON = 1 << 0,  // 1
  REP_CEXON  = 1 << 1,  // 2
  REP_NCEXON = 1 << 2,  // 4
  REP_ANY    = 1 << 3   // 8
};


class RepIdentify {

public:
  //arrays for information extracted from RepeatMasker list of repeats
  //locus contains all the locus names
  //repArray is an array of linked RepeatEntry pointers which contain 
  //the corresponding repeat information for each locus 
  String locus[10000];
  RepeatEntry **repArray;
  int entries;
  ifstream f1;
  String cachefile;


  RepIdentify() {
    entries = 0;
    repArray = new (RepeatEntry*) [10000];
    for (int i = 0; i < 10000; i++) {
      locus[i] = "";
      repArray[i] = NULL;
    }
    cachefile = "/data2/rosetta/tables/repeats_cache";
    f1.open(cachefile);
  }
  
  ~RepIdentify() {
    if (repArray) {
      for (int i = 0; i <= entries; i++) 
	if (repArray[i])
	  delete repArray[i];
      delete repArray;

    }
  }
  
  void rm_read(ifstream& f) {
    //takes in an input stream opened from a file containing a RepeatMasker
    //style file and parses it into two arrays for later access

    String entry;
    String token[15];
    for (int i = 0; i < 15; i++) 
      token[i] = "";
    int index = 0;
    while (readline(f,entry)) { 
      int j = split(entry,token,15,RXwhite);
      if (j < 9) {
	if (j == 2 && token[1] == "none"){
	  if (locus[index] == "") {
	    locus[index] = token[0];
	    repArray[index] = NULL;
	  }
	  else {
	    index = index + 1;
	    locus[index] = token[0];
	    repArray[index] = NULL;
	  }
	}
      	else 
	  cout << "error in RepeatMasker file format" << endl;
      }
      else { 
	String name = token[4];
	int begin = atoi(token[5]);
	int end = atoi(token[6]);
	String align = token[8];
	String repeat = token[9];
	RepeatEntry *curRepeat;
	if (locus[index] == "") {
	  locus[index] = name;
	  repArray[index] = new RepeatEntry(begin,end,align,repeat);
	  curRepeat = repArray[index];
	}
	else if (locus[index] == name) {
	  curRepeat->next = new RepeatEntry(begin,end,align,repeat);
	  curRepeat = curRepeat->next;
	}
	else {
	  index = index + 1;
	  locus[index] = name;
	  repArray[index] = new RepeatEntry(begin,end,align,repeat);
	  curRepeat = repArray[index];
	}
      }
    }
    entries = index;
  }

  int overlap_id(String repeat) {
    // removed repeats that only overlapped by one bp
    // 1 - intron, 2 - coding, 4 - noncoding, 8 - unclassified
    if (repeat == "GC_rich")
      return (REP_NCEXON + REP_INTRON + REP_ANY);
    else if (repeat == "AT_rich")
      return (REP_NCEXON + REP_ANY);
    else if (repeat == "L1PA12")
      return (REP_INTRON + REP_CEXON + REP_NCEXON);
    else if (repeat == "AluJo")
      return (REP_ANY + REP_INTRON + REP_CEXON + REP_NCEXON);
    else if (repeat == "AluJo/FLAM")
      return (REP_ANY + REP_NCEXON);
    else if (repeat == "AluSq")
      return (REP_INTRON + REP_NCEXON);
    else if (repeat == "AluSx")
      return (REP_CEXON + REP_ANY);
    else if (repeat == "AluY")
      return (REP_NCEXON + REP_INTRON);
    else if (repeat == "Tigger1")
      return (REP_ANY + REP_CEXON + REP_INTRON);
    else if (repeat == "L2")
      return (REP_INTRON + REP_NCEXON + REP_CEXON);
    else if (repeat == "FLAM_C")
      return (REP_INTRON + REP_NCEXON);
    else if (repeat == "(CAT)n")
      return (REP_CEXON + REP_ANY);
    else if (repeat == "MIR")
      return (REP_INTRON + REP_NCEXON + REP_ANY + REP_CEXON);
    else if (repeat == "(CGGG)n")
      return (REP_NCEXON + REP_CEXON);
    else if (repeat == "MER54B")
      return (REP_ANY + REP_NCEXON + REP_INTRON);
    else if (repeat == "MER88")
      return (REP_CEXON + REP_INTRON);
    else if (repeat == "MER52A")
      return (REP_INTRON + REP_CEXON);
    else if (repeat == "MER33")
      return (REP_INTRON + REP_NCEXON);
    else if (repeat == "Charlie1a")
      return (REP_NCEXON + REP_ANY);
    else if (repeat == "(TA)n")
      return (REP_NCEXON + REP_ANY);
    else if (repeat == "(CCCCG)n")
      return (REP_CEXON + REP_INTRON);
    else if (repeat == "L1M3e") 
      return (REP_CEXON + REP_INTRON + REP_NCEXON);
    else if (repeat == "MLT2D")
      return (REP_CEXON + REP_NCEXON);
    else return 0;
  }

  void rm_parse(ifstream& f, String name, vector<int>& begin, vector<int>& end,
		vector<String>& repeat, vector<int>& region,
		vector<int>& intscore, vector<int>& cexscore) {
    String entry;
    String token[15];
    for (int i = 0; i < 15; i++) 
      token[i] = "";
    ofstream fo(cachefile,ios::app);
    readline(f,entry); 
     int j = split(entry,token,15,RXwhite);
    if (j == 8 && entry.contains("There were no repetitive")) 
      fo << name << " none" << endl;
    readline(f,entry);
    while (readline(f,entry)) { 
      int j = split(entry,token,15,RXwhite);
      if (j < 9){ 
	 if (j == 8 && entry.contains("There were no repetitive")) 
	   fo << name << " none" << endl;
	 else cout << "error in RepeatMasker file format" << endl;
      }
      else { 
	fo << entry << endl;
	int start = atoi(token[5]);
	int stop = atoi(token[6]);
	String align = token[8];
	String rep = token[9];
	begin.push_back(start);
	end.push_back(stop);
	repeat.push_back(rep);
	region.push_back(overlap_id(rep));
	intscore.push_back(intron_score(rep));
	cexscore.push_back(cexon_score(rep));
      }
    }
    fo.close();
//    if (fork()) wait(NULL);
//    else if (execl("mv","mv",tempfile,cachefile,NULL) == -1)
//      cout << "error in mv call" << endl;
  }	

  void rep_find(FilterSequence* seq, vector<int>& begin, vector<int>& end, 
		vector<String>& repeat, vector<int>& region, 
		vector<int>& intscore, vector<int>& cexscore, int ismouse) { 
    // ismouse=1 flag sets repeat masking for rodents instead of humans
    rm_read(f1);
    String name = seq->get_locus();
    int i;
    for (i = 0; i <= entries; i++) 
      if (name == locus[i]) 
	break;
    if (name == locus[i]) {
      if (i <= entries) {
	RepeatEntry *rep = repArray[i];
	while (rep) {
	  begin.push_back(rep->begin);
	  end.push_back(rep->end);
	  repeat.push_back(rep->repeat);
	  region.push_back(overlap_id(rep->repeat));
	  intscore.push_back(intron_score(rep->repeat));
	  cexscore.push_back(cexon_score(rep->repeat));
	  rep = rep->next;
	}
      }
    } else {
      ofstream fo("/data2/rosetta/output/"+name);
      fo << ">"+name << endl;
      for (int j = 1; j <= seq->get_length(); j++) 
	fo << seq->base2char(seq->get(j));
      fo << endl;
      fo.close();
      
      String arg="/data2/rosetta/output/"+name;
      if (ismouse){
	if (fork()) wait(NULL);
	else if (execl("/data2/rosetta/RepeatMasker121298/RepeatMasker",
		       "RepeatMasker","-s","-rod",(const char*)arg,NULL)==-1) {
	  cout << "error in RepeatMasker call" << endl;
	  switch (errno) {
	  case E2BIG:
	    cout << "Arg list too long" << endl;
	    break;
	  case EACCES:
	    cout << "Permission denied or file not regular file" << endl;
	    break;
	  case EAGAIN:
	    cout << "Resource temporarily unavailable" << endl;
	    break;
	  case EBADF:
	    cout << "Bad file descriptor" << endl;
	    break;
	  case EBUSY: 
	    cout << "Resource busy" << endl;
	    break;
	  case ECHILD:
	    cout << "No child processes" << endl;
	    break;
	  case EDEADLK:
	    cout << "Resource deadlock avoided" << endl;
	    break;
	  case EDOM:
	    cout << "Domain error" << endl;
	    break;
	  case EEXIST:
	    cout << "File exists" << endl;
	    break;
	  case EFAULT: 
	    cout << "Bad address" << endl;
	    break;
	  case EPERM:
	    cout << "file system is mounted noexec" << endl;
	    break;
	  case ENOEXEC:
	    cout << "The magic number in the file is incorrect" << endl;
	    cout << "(exec format error)"  << endl;
	    break;
	  case ENAMETOOLONG:
	    cout << "filename is too long" << endl;
	    break;
	  case ENOENT:
	    cout << "file does not exist" << endl;
	    break;
	  case ENOMEM:
	    cout << "insufficient kernel memory was available" << endl;
	    break;
	  case ENOTDIR:
	    cout << "a component of the path prefix is not a directory" << endl;
	    break;
	  case ETXTBSY:
	    cout << "executable was open for writing by one or more processes.";
	    cout << " file name contains a circular reference" << endl;
	    break;
	  default:
	    cout << "error is?" << endl;
	  }
	}
      }
      else {
	if (fork()) wait(NULL);
	else if (execl("/data2/rosetta/RepeatMasker121298/RepeatMasker",
		       "RepeatMasker","-s",(const char*)arg,NULL)==-1) {
	  cout << "error in RepeatMasker call" << endl;
	  switch (errno) {
	  case E2BIG:
	    cout << "Arg list too long" << endl;
	    break;
	  case EACCES:
	    cout << "Permission denied or file not regular file" << endl;
	    break;
	  case EAGAIN:
	    cout << "Resource temporarily unavailable" << endl;
	    break;
	  case EBADF:
	    cout << "Bad file descriptor" << endl;
	    break;
	  case EBUSY: 
	    cout << "Resource busy" << endl;
	    break;
	  case ECHILD:
	    cout << "No child processes" << endl;
	    break;
	  case EDEADLK:
	    cout << "Resource deadlock avoided" << endl;
	    break;
	  case EDOM:
	    cout << "Domain error" << endl;
	    break;
	  case EEXIST:
	    cout << "File exists" << endl;
	    break;
	  case EFAULT: 
	    cout << "Bad address" << endl;
	    break;
	  case EPERM:
	    cout << "file system is mounted noexec" << endl;
	    break;
	  case ENOEXEC:
	    cout << "The magic number in the file is incorrect" << endl;
	    cout << "(exec format error)"  << endl;
	    break;
	  case ENAMETOOLONG:
	    cout << "filename is too long" << endl;
	    break;
	  case ENOENT:
	    cout << "file does not exist" << endl;
	    break;
	  case ENOMEM:
	    cout << "insufficient kernel memory was available" << endl;
	    break;
	  case ENOTDIR:
	    cout << "a component of the path prefix is not a directory" << endl;
	    break;
	  case ETXTBSY:
	    cout << "executable was open for writing by one or more processes.";
	    cout << " file name contains a circular reference" << endl;
	    break;
	  default:
	    cout << "error is?" << endl;
	  }
	}
      }
      ifstream f("/data2/rosetta/output/"+name+".out");
      rm_parse(f,name,begin,end,repeat,region,intscore,cexscore);
      
      String arg2;
      if (fork()) wait(NULL);
      else if (execlp("rm","rm","-f",(const char*)arg,NULL) == -1) 
	cout << "error in rm" << endl;
      arg2 = arg + ".cat";
      if (fork()) wait(NULL);
      else if (execlp("rm","rm","-f",(const char*)arg2,NULL) == -1)
	cout << "error in rm" << endl;
      arg2 = arg + ".masked";
      if (fork()) wait(NULL);
      else if (execlp("rm","rm","-f",(const char*)arg2,NULL) == -1)
	cout << "error in rm" << endl;
      arg2 = arg + ".masked.log";
      if (fork()) wait(NULL);
      else if (execlp("rm","rm","-f",(const char*)arg2,NULL) == -1)
	cout << "error in rm" << endl;
      arg2 = arg + ".out";
      if (fork()) wait(NULL);
      else if (execlp("rm","rm","-f",(const char*)arg2,NULL) == -1)
	cout << "error in rm" << endl;
      arg2 = arg + ".tbl";
      if (fork()) wait(NULL);
      else if (execlp("rm","rm","-f",(const char*)arg2,NULL) == -1)
	cout << "error in rm" << endl;
      arg2 = arg + ".stderr";
      if (fork()) wait(NULL);
      else if (execlp("rm","rm","-f",(const char*)arg2,NULL) == -1)
	cout << "error in rm" << endl;
    }
  }
  
  int intron_score(String rep) {
  // if repeat rep is any one of several repeats found to occur reasonably;
  // significantly in coding exons, return score 0. otherwise;
  // 1 indicates likely intron (or really not an exon);

  if (rep.before(")").length() == 4) {
    if (rep == "(CAA)n") return 1;
    else if (rep == "(TAA)n") return 1;
    else if (rep == "(TAG)n") return 1;
    else return 0;
  }
  else if (rep == "(CCAA)n") return 0;
  else if (rep == "(CCCA)n") return 0;
  else if (rep == "purine-rich") return 0;
  else if (rep == "L1M3e") return 0;
  else if (rep == "L1PA12") return 0;
  else if (rep == "MER52A") return 0;
  else if (rep == "MER88") return 0;
  else if (rep == "MLT2D") return 0;
  else if (rep == "Tigger1") return 0;
  else if (rep == "GC_rich") return 0;
  else if (rep == "(GGA)n") return 0;
  else if (rep == "LTR3") return 0;
  else if (rep == "L1M3e") return 0;
  else if (rep == "(CAG)n") return 0;
  else if (rep == "(CGG)n") return 0;
  else if (rep == "L1M4c") return 0;
  else if (rep == "polypurine") return 0;
  else if (rep == "(CCCCG)n") return 0;
  else if (rep == "(GGA)n") return 0;
  else if (rep == "(CAT)n") return 0;
  else if (rep == "(TGG)n") return 0;
  else if (rep == "MLT2D") return 0;
  else if (rep == "SUBTEL_sat") return 0;
  else if (rep == "G-rich") return 0;
  else if (rep == "GA-rich") return 0;
  else if (rep == "(ATG)n") return 0;
  else if (rep == "(CCCCAG)n") return 0;
  
  
  

  else return 1;
  }

  int cexon_score(String rep) {
    // if repeat rep is (CAG)n very likely to be coding exon -return 1 - 
    // otherwise return 0
    if (rep == "(CAG)n")
      return 1;
    else return 0;
  }
};

#endif



 
