#include "modules.h"

// Cutoff handed to Dictionary::Segments() by findAltSS() (presumably the
// minimum protein segment length in residues -- confirm against Dictionary).
// findAltSS() also uses three times this value as the minimum exon length and
// the minimum overlap it requires between segments and exons.
#define PROTEIN_CUTOFF 15

char* sequenceToChar(FilterSequence *m_seq, bool complement) {
  // Renders m_seq as a newly allocated, NUL-terminated string of lowercase
  // base letters.  Positions 1..get_length() of the sequence map to string
  // indices 0..get_length()-1.
  //   complement = FALSE : a/c/g/t are emitted as they are.
  //   complement = TRUE  : each base is swapped for its complement
  //                        (a<->t, c<->g); the order is NOT reversed.
  // Any base code other than BASE_A/BASE_C/BASE_G/BASE_T becomes 'n'.
  // The buffer comes from new[]; the caller must release it with delete[].

  const int length = m_seq->get_length();
  char *text = new char[length + 1];

  const bool useComplement = (complement == TRUE);

  for (int pos = 1; pos <= length; ++pos) {
    char letter;
    switch (m_seq->get(pos)) {
    case BASE_A: letter = useComplement ? 't' : 'a'; break;
    case BASE_C: letter = useComplement ? 'g' : 'c'; break;
    case BASE_G: letter = useComplement ? 'c' : 'g'; break;
    case BASE_T: letter = useComplement ? 'a' : 't'; break;
    default:     letter = 'n';                       break;
    }
    text[pos - 1] = letter;
  }

  text[length] = '\0';
  return text;
}


void findAltSS(){
  int i;

  ofstream infoOut("/data2/rosetta/output/altSpSitesMulti_b.txt");
  ofstream info3Out("/data2/rosetta/output/tripleSpSitesMulti.txt");

  ofstream statOut("/data2/rosetta/output/altSpSitesMulti_b.dat");
  ofstream stat3Out("/data2/rosetta/output/tripleSpSitesMulti.dat");
  
  ifstream database("/data2/rosetta/databases/multi.seq");

  Dictionary *owl;
  char errMsg[128]; errMsg[0]='\0';

  owl=new Dictionary ("/data2/rosetta/dictionaries/dbOWL",errMsg);
  if (strlen(errMsg)) {
    cout << "Dictionary error: " << errMsg << endl;
    exit(0);
  }

  FilterSequence *seq = new FilterSequence();
  
  int seqnum = 0;
  while (database.good()) {
    
    database >> *seq;
    int seql = seq->get_length();
    cout << "Sequence : " << ++seqnum << "\t" << seq->get_locus() << endl;

    char *seqText, *proteinText, *proteinNam, *temp;
    seqText = sequenceToChar(seq, false);
    
    long unsigned int *protein, *proteinMaxSeglen;
    
    vector<String> names;
    ivector<long unsigned int> proteinSegpos;
    ivector<long unsigned int> proteinSeglen;
    ivector<long unsigned int> proteinSegacc;
    ivector<long unsigned int> proteinImage;
    
    proteinText = new char[seql+3];
    arrayInit(' ',proteinText,seql+3);
    strcpy(proteinText,                seq->DNAToProtein(seqText));
    strcpy(proteinText+(seql/3)+1,     seq->DNAToProtein(seqText+1));
    strcpy(proteinText+(2*(seql/3))+2, seq->DNAToProtein(seqText+2));

    protein           = new (long unsigned int)[seql+3]; arrayZero(protein,seql+3);
    int proteinCutoff = PROTEIN_CUTOFF;
    
    cout << "Finding protein segments..." << endl;
    owl->Segments(proteinText,seql+3,proteinCutoff,proteinSegpos, proteinImage, proteinSeglen, proteinSegacc);
    
    
    int segcount = proteinSegpos.size();

    int *pSegposArray = new int[segcount]; arrayZero(pSegposArray,segcount);
    int *pSeglenArray = new int[segcount]; arrayZero(pSeglenArray,segcount);
    int *pSegaccArray = new int[segcount]; arrayZero(pSegaccArray,segcount);
    int *pImageArray  = new int[segcount]; arrayZero(pImageArray, segcount);
    
    for (i=0; i<segcount; ++i) {
      pSegposArray[i] = proteinSegpos[i];
      pSeglenArray[i] = proteinSeglen[i];
      pSegaccArray[i] = proteinSegacc[i];
      pImageArray[i]  = proteinImage[i];
    }
    
    orWrapper(protein, pSegposArray, pSeglenArray, segcount);

    int numtwo = 0, frametwo = 0, framethree = 0, numthree = 0;
    
    cout << "Checking for alternative splice sites" << endl;
    int entries = 1, entries3 = 1;
    for (int regionum=1; regionum <= seq->get_region_num(); regionum++) {
      Region *r = seq->get_region(regionum);
      if (r->type==REGION_CEXON && r->length()>=3*proteinCutoff) {
	frametwo=0; framethree = 0;
	for (int proteinPiece=0; proteinPiece < segcount; proteinPiece++) {
	  for (int proteinPiece2=0; proteinPiece2 < segcount; proteinPiece2++) {
	    long int start1 = (pSegposArray[proteinPiece] % ((seql/3)+1))*3, start2 = (pSegposArray[proteinPiece2] % ((seql/3)+1))*3;
	    long int stop1  = start1 + pSeglenArray[proteinPiece]*3,         stop2  = start2 + pSeglenArray[proteinPiece2]*3;
	    int rlen   = r->length();
	    
	    if ( min(stop1, r->stop) - max(start1, r->start) >= 3*proteinCutoff &&
		 min(stop2, r->stop) - max(start2, r->start) >= 3*proteinCutoff &&
		 min(stop1, stop2)   - max(start1, start2)   >= 3*proteinCutoff &&
		 (floor(pSegposArray[proteinPiece]/((seql/3)+1)) != floor(pSegposArray[proteinPiece2]/((seql/3)+1)))) {
	      for (int proteinPiece3=0; proteinPiece3 < segcount; proteinPiece3++) {
		long int start3 = (pSegposArray[proteinPiece3] % ((seql/3)+1))*3;
		long int stop3  = start3 + pSeglenArray[proteinPiece3]*3;
		
		if ( min(stop3, r->stop) - max(start3, r->start) >= 3*proteinCutoff &&
		     min(stop3, stop2)   - max(start3, start2)   >= 3*proteinCutoff &&
		     min(stop3, stop1)   - max(start3, start1)   >= 3*proteinCutoff &&
		     (floor(pSegposArray[proteinPiece3]/((seql/3)+1)) != floor(pSegposArray[proteinPiece2]/((seql/3)+1))) &&
		     (floor(pSegposArray[proteinPiece3]/((seql/3)+1)) != floor(pSegposArray[proteinPiece] /((seql/3)+1)))) {
		  if (!framethree) {
		    framethree=1;
		    stat3Out << entries3 << "\t" << r->start << "\t" << r->stop << "\t" << start1 << "\t" << stop1 << "\t"
			     << start2   << "\t" << stop2    << "\t" << start3  << "\t" << stop3  << "\t" 
			     << max(start1, max(start3, start2)) << "\t" << min(stop1, min(stop3, stop2)) << endl;
		  }
		}
	      }
	      if (!frametwo) {
		statOut << entries << "\t" << r->start << "\t" << r->stop << "\t" << start1 << "\t" << stop1 << "\t"
			<< start2  << "\t" << stop2    << "\t" << max(start1, start2) << "\t" << min(stop1, stop2) << endl;
		frametwo=1;
	      }
	    }
	  }
	}
	if (frametwo) {
	  numtwo++; entries++;
	  infoOut << entries << ". Sequence where exon with two frames occurs: " << seq->get_locus() << endl;
	  infoOut << "Exon positions: " << r->start << "," << r->stop << endl;
	}
	if (framethree) {
	  numthree++; entries3++;
	  info3Out << entries << ". Sequence where exon with three frames occurs: " << seq->get_locus() << endl;
	  info3Out << "Exon positions: " << r->start << "," << r->stop << endl;
	}
      }
    }
    cout << "Total: " << numtwo << endl;
    
    delete[] pSegposArray;
    delete[] pSeglenArray;
    delete[] pSegaccArray;
    delete[] pImageArray;
    delete[] protein;
    delete[] proteinText;
  }

}
// Program entry point: announces the run on cout and invokes findAltSS().
// The command-line arguments are accepted but not used.
int main(int argc, char *argv[]) {

  cout << "Finding alternative splice sites" << endl;

  findAltSS();

  return 0;
}









