#include <time.h>
#include <sys/time.h>

#include "internals.h"
#include "dictionary.h"

// Wall-clock time (seconds) sampled during static initialisation; the ELAPSED
// macro reports run time relative to this value.
const unsigned long int start_time = time (NULL);

// Alphabet selector: args_ok sets it to AA_ALPHABET or DNA_ALPHABET.
char alphabet;

// Dictionary path prefix and a pointer into it.  main passes both to args_ok
// BY VALUE, so these globals themselves are not assigned anywhere in this file.
char *stem;
char *stemptr;

// One flag byte per tuple value (DNA_MAX_TUP + 1 entries), allocated during
// static initialisation; memalloc checks that the allocation succeeded.
char *tuple = (char *) malloc (DNA_MAX_TUP + 1);

// Size in bytes of the main work buffer (computed by args_ok) and the buffer
// itself (allocated by memalloc, released by clear).
unsigned long int availmem;
unsigned long int *memchunk;

// Number of sequences read so far, and the longest sequence length seen
// (sequencing bumps the latter by one once all the input has been read).
unsigned long int numseq = 0;
unsigned long int maxlen = 0;

// The source database followed by the six dictionary files.
FILE *srcfile;
FILE *acc_codes;
FILE *acc_order;
FILE *seq_index;
FILE *seq_lookup;
FILE *tup_index;
FILE *tup_lookup;

// Program banner: title block.
static const char LINE0[] = "======================================\n";
static const char LINE1[] = "Dictionary Builder v5.05, January 1999\n";
static const char LINE2[] = "Valentin I. Spitkovsky (val@theory.lcs.mit.edu)\n";

// Program banner: affiliation block.
static const char CLINE1[] = "MIT Laboratory for Computer Science\n";
static const char CLINE2[] = "Theory of Computation: Computational Biology\n";
static const char CLINE3[] = "Gene Recognition Group\n";
static const char CLINE4[] = "======================\n";

// Usage text; printed after "usage: <program name>" when the arguments are rejected.
static const char USAGE0[] = " -mM -aa/-dna <dictionary prefix> <source database>\n\n";
static const char USAGE1[] = " note: -mM limits memory usage to M megabytes (M > 51 Mb)\n";
static const char USAGE2[] = "       the -aa/-dna option specifies the alphabet of the source database\n";
static const char USAGE3[] = "       the former selects the set of twenty amino acids\n";
static const char USAGE4[] = "       the latter selects the set of four nucleotides\n\n";

// Prints a greeting: a blank line, the three title lines, a blank line, and
// the four affiliation lines, all on standard output.
// NOTE: the expansion already ends with a semicolon, which is why the macro is
// invoked bare (without a trailing `;`) in main.
#define GREETING \
  printf ("\n%s%s%s\n%s%s%s%s\n", \
          LINE0, LINE1, LINE2, CLINE1, CLINE2, CLINE3, CLINE4);

// Calculates the time, in whole seconds, elapsed since the program began
// executing (i.e. since start_time was sampled).
// The result is cast to unsigned long int because every user prints it with
// "%lu"; without the cast the type is whatever `time_t - unsigned long` yields,
// which need not be unsigned long on every platform.
#define ELAPSED ((unsigned long int) (time (0) - start_time))

// Runs one phase of the program and reports its outcome.
//   CALL - expression that evaluates to non-zero on success;
//   NAME - human-readable phase name used in the messages;
//   MSG  - non-zero to print "<NAME> failed." on stderr when CALL fails
//          (pass zero when the phase has already explained the failure itself).
// On failure the macro calls clear () and terminates the process with exit
// status 1; on success it prints the elapsed time and a completion line.
// The body is wrapped in do { } while (0) so that the macro behaves as a single
// statement (safe under an unbraced if/else); invoke it with a trailing `;`.
#define PHASE(CALL,NAME,MSG) \
  do \
    { \
      if (!(CALL)) \
        { \
          if (MSG) fprintf (stderr, "\n%s failed.\n\n", NAME); \
          clear (); exit (1); \
        } \
      printf ("%10lu sec        o %s completed.\n", ELAPSED, NAME); \
    } \
  while (0)

// Prints a message on stderr (preceded by a newline), optionally invokes
// perror to append the description of the current errno, and returns zero
// from the ENCLOSING function.
//   msg    - the text to print;
//   PERROR - non-zero to call perror (0) after the message.
// The body is wrapped in do { } while (0) so that the macro behaves as a single
// statement (safe under an unbraced if/else); invoke it with a trailing `;`.
#define RETZERO(msg,PERROR) \
  do \
    { \
      fprintf (stderr, "\n%s", (msg)); \
      if (PERROR) perror (0); \
      return (0); \
    } \
  while (0)

// Acquires the main work buffer (memchunk, availmem bytes).
// Returns one iff all okay: the two Dictionary tables and the tuple flags must
// already be non-null, and the work-buffer allocation itself must succeed.
// When one of the prerequisites is missing, no allocation is attempted.
char memalloc (void)
{
  if (!Dictionary::tuptr || !Dictionary::radix || !tuple)
    return (0);

  memchunk = (unsigned long int *) malloc (availmem);
  return (memchunk ? 1 : 0);
}

// Releases the resources held through the file-scope globals: every stream is
// handed to tryclose and the main work buffer to tryfree.  Always returns one,
// so it can be used as the CALL argument of PHASE.
// NOTE(review): tryclose/tryfree are defined outside this file; presumably they
// skip null arguments (main zeroes the streams before anything is opened) --
// confirm.  The `tuple` buffer is not released here.
char clear (void)
{
  tryclose (srcfile); // Closes all files.
  tryclose (acc_codes);
  tryclose (acc_order);
  tryclose (seq_index);
  tryclose (seq_lookup);
  tryclose (tup_index);
  tryclose (tup_lookup);

  tryfree (memchunk); // Frees all memory.

  return (1);
}

// Sorts out the command-line arguments and returns one iff all okay.
//
// Expected command line: <program> -mM -aa|-dna <dictionary prefix> <source database>
//   argc, argv - as received by main;
//   stem       - receives a freshly allocated copy of the dictionary prefix;
//   stemptr    - receives a pointer to the end of that copy.
// NOTE(review): stem and stemptr are passed BY VALUE, so the caller never sees
// the allocation made here; it is only used for the open_dict_files call below
// and is never freed.
//
// Side effects: sets availmem and alphabet, opens srcfile for reading and asks
// Dictionary::open_dict_files to open the six dictionary files ("w+").
// On a malformed command line the usage text is printed and zero is returned.
char args_ok (const int argc, const char **argv, char *stem, char *stemptr)
{
  // Declared before the first `goto usage' so that no jump bypasses an
  // initialisation.
  unsigned long int megabytes = 0;

  if (argc != 5) goto usage; // Make sure we have 4 arguments.

  if (argv[1][0] != '-' || argv[1][1] != 'm') goto usage;

  // Determine the buffer size.  An unparsable M is treated as zero, which the
  // iMAX below turns into the minimum workable size.
  if (sscanf (argv[1] + 2, "%lu", &megabytes) != 1)
    megabytes = 0;
  availmem = iMAX (megabytes << 20, __DICTIONARY_MEMORY + 5*(DNA_MAX_TUP + 1) + (SEQ_BUFFER_LEN << 2))
    - (DNA_MAX_TUP + 1 + __DICTIONARY_MEMORY);

  if (!strcmp (argv[2], "-aa")) // Select one of two alphabets.
    alphabet = AA_ALPHABET;
  else if (!strcmp (argv[2], "-dna"))
    alphabet = DNA_ALPHABET;
  else
    goto usage;

  { // Allocate memory for argument path (stem).
    const size_t stemlen = strlen (argv[3]);

    // The extra 16 bytes are presumably headroom for the file-name suffixes
    // that open_dict_files appends at stemptr -- TODO confirm.
    if (!(stem = (char *) malloc (stemlen + 16)))
      RETZERO ("Out of memory!\n\n", 0);

    memcpy (stem, argv[3], stemlen + 1); // Copy the terminating NUL as well.
    stemptr = stem + stemlen;
  }

  if (!(srcfile = fopen (argv[4], "r"))) // Open the source pairs file.
    {
      // snprintf truncates an over-long path instead of overrunning msg.
      char msg[1024];
      snprintf (msg, sizeof msg,
                "\nError opening %s, the source database, for reading.\n", argv[4]);
      RETZERO (msg, 0);
    }

  // Finally, open the dictionary files for writing.
  return (Dictionary::open_dict_files (stem, stemptr, "w+",
                                       acc_codes, acc_order,
                                       seq_index, seq_lookup,
                                       tup_index, tup_lookup));

usage:
  printf ("\nusage: %s%s%s%s%s%s",argv[0],USAGE0,USAGE1,USAGE2,USAGE3,USAGE4);
  return (0);
}

// Sequencing phase.  Reads whitespace-separated (accession code, sequence)
// pairs from srcfile and writes:
//   .acc.codes  - the accession codes, ACC_CODE_LEN bytes each;
//   .seq.index  - the running start offset of every sequence plus a final total;
//   .seq.lookup - the sequences themselves, translated through `filter';
//   .tup.index  - cumulative per-tuple counts of sequences containing the tuple;
//   .acc.order  - the accession codes in sorted order, each followed by its
//                 sequence number, and finally the alphabet byte.
// It also counts the sequences (numseq), records the longest length plus one
// (maxlen) and leaves the per-tuple counts in Dictionary::tuptr.
//
//   filter - character translation table (compared against dna_filter to pick
//            the tuple length and which tuple values are valid).
// Returns one iff all okay; on failure a message is printed and zero returned.
//
// NOTE(review): the literal 4s, `<< 2' and `<< 3' below assume a 4-byte
// unsigned long int (and ACC_CODE_LEN == 32, see the `44 * numseq' check).
// NOTE(review): both fscanf ("%s") calls are unbounded, and the result of the
// second one (the sequence) is not checked.
char sequencing (const char *filter)
{
  unsigned long int *list = memchunk;         // holds the string and later its tuples
  char *v = (char *)(list + SEQ_BUFFER_LEN);  // a buffer for .seq.lookup
  const unsigned long int TUP_LEN_SUB_1 = (filter == dna_filter ? DNA_TUP_LEN - 1 : AA_TUP_LEN - 1),
    vsize = availmem - (SEQ_BUFFER_LEN << 2), *slider0 = list + TUP_LEN_SUB_1;
  unsigned long int total_len = 0, vpos = 0, *tupindex = (unsigned long int *)Dictionary::tuptr;

  // Invalidate some tuples, based on the given alphabet.
  // tuple[t] != 0 means "do not count t"; zero means "countable".
  if (filter == dna_filter)
    {
      memset (tuple, 0, DNA_MAX_TUP);
      tuple[DNA_MAX_TUP] = 1;
    }
  else
    {
      memset (tuple, 1, DNA_MAX_TUP + 1);

      // Re-enable exactly the values built from four 5-bit fields, each < AA.
      for (int i = 0; i < AA; i++)
        {
          const unsigned long int sofari = i << 5;
          for (int j = 0; j < AA; j++)
            {
              const unsigned long int sofarj = (sofari | j) << 5;
              for (int k = 0; k < AA; k++)
                {
                  const unsigned long int sofark = (sofarj | k) << 5;
                  for (int l = 0; l < AA; l++)
                    tuple[sofark | l] = 0;
                }
            }
        }
    }

  memset (list, 0, ACC_CODE_LEN - 1);
  while (fscanf (srcfile, "%s", (char *)list) != EOF) // While still have source pairs.
    {
      // Fix the accession code: terminate it at BYTE ACC_CODE_LEN - 1.  (Indexing
      // `list' directly would zero an unsigned long far beyond the code instead.)
      ((char *)list)[ACC_CODE_LEN - 1] = 0;
      if (fwrite (list, 1, ACC_CODE_LEN, acc_codes) != ACC_CODE_LEN) // Write the accession code.
        RETZERO ("Could not write to a dictionary file (.acc.codes): ", 1);

      fscanf (srcfile, "%s", (char *)list);                          // Read the sequence.
      const unsigned long int seqlen = strlen ((char *)list);        // Measure its length.
      if (!fwrite (&total_len, 4, 1, seq_index))                     // Write the position.
        RETZERO ("Could not write to a dictionary file (.seq.index): ", 1);
      total_len += seqlen;                                           // Update the position.
      if (seqlen > maxlen) maxlen = seqlen;

      if (vsize < vpos + seqlen)                                     // Buffer full: flush it first.
        {
          if (fwrite (v, 1, vpos, seq_lookup) != vpos)
            RETZERO ("Could not write to a temporary dictionary file (.seq.lookup): ", 1);
          vpos = 0;
        }

      // Copy a nicified version of the sequence onto the buffer.  The index is
      // taken as unsigned char so that bytes >= 0x80 cannot index before the
      // table (assumes the table covers all byte values -- TODO confirm).
      for (char *slider = (char *)list; *slider; slider++)
        v[vpos++] = filter[(unsigned char)*slider];

      Dictionary::makeTupleList (alphabet, (const char *)(v + vpos - seqlen), seqlen, list);

      // Collect each distinct countable tuple once: list[0] is the running
      // count, list[1..list[0]] the tuples; tuple[] marks the ones already seen.
      const unsigned long *sent = list + seqlen; list[0] = 0;
      for (unsigned long int *slider = (unsigned long int *)slider0; slider < sent; slider++)
        if (!tuple[*slider])
          {
            tuple[list[++list[0]] = *slider] = 1;
            tupindex[*slider]++;                                     // Count up the frequency for each tuple.
          }

      const unsigned long int *sent2 = list + *list;
      for (unsigned long int *slider = list + 1; slider <= sent2; slider++)
        tuple[*slider] = 0;                                          // Clear the tuple structure...

      memset (list, 0, ACC_CODE_LEN - 1);
      numseq++;
    }
  maxlen++;

  if (!fwrite (&total_len, 4, 1, seq_index))                         // Write the terminating position.
    RETZERO ("Could not write to a dictionary file (.seq.index): ", 1);

  if (fwrite (v, 1, vpos, seq_lookup) != vpos)
    RETZERO ("Could not write to a temporary dictionary file (.seq.lookup): ", 1);

  { /* .tup.index */
    unsigned long int tup;

    // Prefix sums of the per-tuple counts, with the last total repeated once.
    for (list[0] = 0, tup = 1; tup <= DNA_MAX_TUP; tup++)
      list[tup] = list[tup-1] + tupindex[tup-1];
    list[tup] = list[tup-1];

    if (fwrite (list, 4, DNA_MAX_TUP + 2, tup_index) != DNA_MAX_TUP + 2)
      RETZERO ("Could not write to a dictionary file (.tup.index): ", 1);
  }

  { /* .acc.order */
    unsigned long int *order = memchunk, *radix = order + numseq,
      *score = radix + numseq, *acodes = score + numseq, seq;

    // Three words (order, radix, score) plus one 32-byte code per sequence.
    if (44 * numseq > availmem)
      RETZERO ("Need more memory: please up the M parameter...", 0);

    rewind (acc_codes);
    if (fread (acodes, ACC_CODE_LEN, numseq, acc_codes) != numseq)
      RETZERO ("Could not read from a dictionary file (.acc.codes): ", 1);

    // Sort the sequence numbers by accession code, one 4-character word of
    // the code per pass, from the last word (7) to the first (0).
    for (seq = 0; seq < numseq; seq++) order[seq] = seq;
    for (int j = 7; j >= 0; j--)
      {
        const unsigned long int *ptr = acodes + j;
        for (seq = 0; seq < numseq; seq++)
          {
            const char *morph = (char *)(ptr + (seq << 3));
            score[seq] = (morph[0] << 24) + (morph[1] << 16) + (morph[2] << 8) + morph[3];
          }
        Dictionary::radixSortFun (order, radix, numseq, score);
      }

    // Each code occupies eight words, so code number n starts at
    // acodes + (n << 3) -- the same stride used when computing the scores.
    for (seq = 0; seq < numseq; seq++)
      if (!(fwrite (acodes + (order[seq] << 3), 32, 1, acc_order) && fwrite (order + seq, 4, 1, acc_order)))
        RETZERO ("Could not write to a dictionary file (.acc.order): ", 1);
  }

  if (!fwrite (&alphabet, 1, 1, acc_order))
    RETZERO ("Could not write to a dictionary file (.acc.order): ", 1);

  return (1);
}

char tupling (void)
{
  unsigned long int *seqlen = memchunk, *list = memchunk + numseq, *freq = (unsigned long int *)Dictionary::tuptr, 
    *output = list + maxlen, tup = 1;  
  const unsigned long int space = (availmem >> 2) - maxlen - numseq, 
    *tstart = list + (alphabet == DNA_ALPHABET ? DNA_TUP_LEN - 1 : AA_TUP_LEN - 1);
  char *str = (char *)(list + maxlen) - maxlen;

  memset (tuple, 1, DNA_MAX_TUP + 1);

  fseek (seq_index, 4, SEEK_SET);
  if (fread (seqlen, 4, numseq, seq_index) != numseq)
    RETZERO ("Could not read from a dictionary file (.seq.index): ", 1);

  for (unsigned long int seq = numseq - 1; seq; seq--)
    seqlen[seq] -= seqlen[seq-1];

  freq[DNA_MAX_TUP] = space + 1;
  for (unsigned long int a = 0, b = 0; a < DNA_MAX_TUP; a = ++b)
    {
      for (unsigned long int sz = freq[a]; sz + freq[++b] <= space; sz += freq[b]);
      const unsigned long int span = b-- - a;
      rewind (seq_lookup);
      
      if (span > 1)
	{
	  unsigned long int total = 0;
	  memset (tuple + a, 0, span);

	  for (tup = a; tup <= b; tup++)
	    { const unsigned long int tmp = freq[tup]; freq[tup] = total; total += tmp; }

	  for (unsigned long int seq = 0; seq < numseq; seq++)
	    {
	      const unsigned long int len = seqlen[seq], *tsent = list + len;

	      if (fread (str, 1, len, seq_lookup) != len)
		RETZERO ("Could not read from a dictionary file (.seq.lookup): ", 1);

	      Dictionary::makeTupleList (alphabet, str, len, list); list[0] = 0;

	      for (unsigned long int *slider = (unsigned long int *)tstart; slider < tsent; slider++)
		if (!tuple[*slider]) tuple[list[++list[0]] = *slider] = 1;

	      const unsigned long int *lsent = list + *list;
	      for (unsigned long int *slider = list + 1; slider <= lsent; slider++)
		{ tuple[*slider] = 0; output[freq[*slider]++] = seq; }
	    }	  

	  if (fwrite (output, 4, total, tup_lookup) != total)
	    RETZERO ("Could not write to a dictionary file (.tup.lookup): ", 1);

	  memset (tuple + a, 1, span);
	}
      else
	for (unsigned long int seq = 0; seq < numseq; seq++)
	  {
	    const unsigned long int len = seqlen[seq], *tsent = list + len, tup = a;
	    unsigned long int *slider = (unsigned long int *)tstart;
	    
	    if (fread (str, 1, len, seq_lookup) != len)
	      RETZERO ("Could not read from a dictionary file (.seq.lookup): ", 1);
	    
	    Dictionary::makeTupleList (alphabet, str, len, list);
	    for (; *slider != tup && slider < tsent; slider++);
	    if (slider < tsent && !fwrite (&seq, 1, 4, seq_lookup))
	      RETZERO ("Could not write to a dictionary file (.tup.lookup): ", 1);
	  }
    }

  return (1);
}

// Program entry point: prints the banner, then runs the phases in order
// (argument checking, memory allocation, sequencing, tupling, clean-up).
// A failing phase terminates the process with exit status 1 (see PHASE).
int main (const int argc, const char **argv)
{
  // Mark every stream as "not open" before anything can fail.
  tup_lookup = 0;
  tup_index  = 0;
  seq_lookup = 0;
  seq_index  = 0;
  acc_order  = 0;
  acc_codes  = 0;
  srcfile    = 0;

  GREETING

  PHASE(args_ok (argc, argv, stem, stemptr), "Argument confirmation", 0);
  PHASE(memalloc (), "Memory allocation", 1);

  // The translation table follows the alphabet chosen on the command line.
  const char *chosen_filter = (alphabet == AA_ALPHABET ? aa_filter : dna_filter);
  PHASE(sequencing (chosen_filter), "Sequencing phase", 1);

  PHASE(tupling (), "Tupling phase", 1);
  PHASE(clear (), "Memory deallocation", 1);

  printf ("\n%10lu sec        o Success!  :)\n\n", ELAPSED);
  return (0);
}
