
static char const rcsid [] = "$Id: infocon.c,v 1.3 1999/08/09 20:34:18 florea Exp $";


/*****************************************************************************/
/*                                                                           */
/* Program: infocon (utility for locating regions of high info content)      */
/*                                                                           */
/* Author: Nikola Stojanovic                                                 */
/*                                                                           */
/* Revision:    26 JUL 97   Version 1.0                                      */
/*              22 JAN 98   Version 2.0                                      */
/*                                                                           */
/*   Given an alignment (file and the sequences) this program determines all */
/* full runs of columns with high information content inside it.             */
/*                                                                           */
/*****************************************************************************/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>
#include <math.h>
#include "ntl.h"

/*****************************************************************************/
/*                                                                           */
/* Definitions section                                                       */
/*                                                                           */
/*****************************************************************************/


/*****************************************************************************/
/* Definitions of the constants of the program unit                          */
/*****************************************************************************/


#define UNDEFINED                        31781
#define INFINITY                         31873
#define FP_SHIELD                        283

#define DEFAULT_PATH                     "."

#define DEFAULT_RANGE_SEQUENCE           "human"
#define DEFAULT_SEQUENCE                 NULL
#define DEFAULT_RANGE_START              0
#define DEFAULT_RANGE_STOP               0

#define DEFAULT_COUNTER_NUMBER           1

#define MAX_LONG_LEN                     9
#define MININT_VALUE                     -32000
#define MAXINT_VALUE                     32000

#define MAX_WHOLE                        3
#define MAX_FRAC                         6

#define DEFAULT_MINIMAL_LENGTH           6
#define DEFAULT_MINIMAL_ACTIVE           3

#define GENERAL_GAPS                     1
#define IGNORE_GAPS                      2
#define EXCLUDE_GAPS                     3

#define NO_AVERAGE                       0
#define TOTAL_AVERAGE                    1
#define ACTIVE_AVERAGE                   2
#define DEFAULT_AVERAGE_SUBTRACT         TOTAL_AVERAGE

#define DEFAULT_ADJUSTMENT               0.0
#define DEFAULT_GAP_TOLERANCE            EXCLUDE_GAPS

#define FLOAT_LN_LIMIT                   0.0001


/*****************************************************************************/
/* Variables used globally in the program unit                               */
/*****************************************************************************/


static double pa;
static double pc;
static double pg;
static double pt;
static double pgap;


/*****************************************************************************/
/* Prototypes of locally used functions of the program unit                  */
/*****************************************************************************/


int infocon_load_data (strlist_ptr paths, char *file_name, char *sequence,
                       long int start, long int stop, int average_subtract,
                       int active, int gaps, restricted_ptr *alignment,
                       long int *range_len, char **counter_seq, int *counter,
                       long int *track, float *average);
void infocon_output_header (char *file_name, char *sequence, long int start,
                            long int stop, char *counter_seq, int counter,
                            int average_subtract, float average,
                            float adjustment, int length, int active,
                            int gaps);
int infocon_process_data (long int range_len, char *counter_seq,
                          long int track, restricted_ptr alignment,
                          float adjustment, int length, int active, int gaps);
void infocon_full_runs (float *scores, long int range_len,
                        long int *intervals, long int **low_ends,
                        long int **high_ends);
bool infocon_is_long (char *string, long int *value);
bool infocon_is_int (char *string, int *value);
bool infocon_is_float (char *string, float *value);

/*****************************************************************************/
/*                                                                           */
/* Code section                                                              */
/*                                                                           */
/*****************************************************************************/


/*****************************************************************************/
/*                                                                           */
/* Procedure: main                                                           */
/*                                                                           */
/* "main" procedure of the program. Receives and analyses the command line   */
/*   parameters, sets the control variables of the program, checks their     */
/*   consistency and passes control to internal procedures which actually    */
/*   process the input data; returns 0 if everything is OK, non-zero status  */
/*   in case of any errors                                                   */

int main (int argc, char **argv)
{
 char *file_name, *sequence, *counter_seq, *save_count;
 bool range_set, counter_locked, average_locked; int average_subtract;
 bool factor_locked, strictness_set; long int start, stop, range_len, track;
 int counter, length, active, arg_count, gaps; float adjustment, average;
 strlist_ptr registered_paths, new_path, path_scan; restricted_ptr alignment;

 /* Initialize the settings and prepare for the processing of the arguments  */

 file_name = NULL;
 range_set = FALSE; start = stop = UNDEFINED; sequence = NULL;
 if (DEFAULT_PATH == NULL) registered_paths = NULL;
 else {
  registered_paths = (strlist_ptr) NTL0_ckalloc (sizeof (StrList_Struct));
  registered_paths -> string = DEFAULT_PATH;
  registered_paths -> next = NULL;
 }
 counter_locked = FALSE; counter_seq = NULL; counter = UNDEFINED;
 average_locked = FALSE; average_subtract = UNDEFINED;
 factor_locked = FALSE; adjustment = 0.0;
 length = UNDEFINED; active = UNDEFINED;
 strictness_set = FALSE; gaps = UNDEFINED;

 if (argc < 2) {
  fprintf (stderr, "usage: %s <alignment_file>\n", argv [0]);
  fprintf (stderr, "               [-I <directory_path>]*\n");
  fprintf (stderr, "               [-r <from> <to> [-s <sequence>]]\n");
  fprintf (stderr,
                 "               [-c <counter_sequence> | -n <seq_number>]\n");
  fprintf (stderr, "               [-m | -b | -o]\n");
  fprintf (stderr, "               [-d <decrease_factor>]\n");
  fprintf (stderr, "               [-l <min_length_of_region]\n");
  fprintf (stderr, "               [-a <min_active_sequences>]\n");
  fprintf (stderr, "               [-g | -i | -x]\n");
  exit (1);
 }
 else {                           /* Some parameters provided - process them */

  /* Proceed to extract the command line parameters and create the settings  */

  arg_count = 1; while (arg_count < argc) {

   if (argv [arg_count] [0] != '-') {       /* Not an "-" option - file name */
    if (file_name != NULL) {
     fprintf (stderr, "Repeated alignment file name ('%s').\n",
                      argv [arg_count]);
     exit (1);
    }
    else { file_name = NTL0_strsave (argv [arg_count]); arg_count++; }
   }
   else if (!strcmp (argv [arg_count], "-I")) {        /* New "include" path */
    arg_count++;
    if (arg_count == argc) {
     fprintf (stderr, "Missing directory path for inclusion.\n"); exit (1);
    }
    else {
     new_path = (strlist_ptr) NTL0_ckalloc (sizeof (StrList_Struct));
     new_path -> string = NTL0_strsave (argv [arg_count]);
     new_path -> next = NULL;

     if (registered_paths == NULL) registered_paths = new_path;
     else {
      path_scan = registered_paths;
      while (path_scan -> next != NULL) path_scan = path_scan -> next;
      path_scan -> next = new_path;
     }
     arg_count++;
    }
   }
   else if (!strcmp (argv [arg_count], "-r")) {             /* Range request */
    if (range_set) {
     fprintf (stderr,
              "Range for alignment loading already set to [%ld-%ld].\n",
              start, stop);
     exit (1);
    }
    else if (arg_count > argc - 3) {
     fprintf (stderr, "Incomplete range specification.\n"); exit (1);
    }
    else if ((!infocon_is_long (argv [arg_count + 1], &start)) ||
             (!infocon_is_long (argv [arg_count + 2], &stop))) {
     fprintf (stderr, "Illegal specification for the range.\n"); exit (1);
    }
    else if ((start < 0) || (stop < 0)) {
     fprintf (stderr, "Negative values not permitted for the range.\n");
     exit (1);
    }
    else if (start > stop) {
     fprintf (stderr, "Range start (%ld) greater than range end (%ld).\n",
                      start, stop);
     exit (1);
    }
    else { range_set = TRUE; arg_count += 3; }
   }
   else if (!strcmp (argv [arg_count], "-s")) {    /* Range sequence specif. */
    arg_count++;
    if (sequence != NULL) {
     fprintf (stderr, "Sequence for alignment range already set.\n");
     exit (1);
    }
    else { sequence = NTL0_strsave (argv [arg_count]); arg_count++; }
   }
   else if (!strcmp (argv [arg_count], "-c")) {     /* Counter sequence name */
    arg_count++;
    if (counter_locked) {
     fprintf (stderr, "Counter sequence in alignment already set.\n");
     exit (1);
    }
    else {
     counter_seq = NTL0_strsave (argv [arg_count]); counter_locked = TRUE;
     arg_count++;
    }
   }
   else if (!strcmp (argv [arg_count], "-n")) {   /* Counter sequence number */
    if (counter_locked) {
     fprintf (stderr, "Counter sequence in alignment already set.\n");
     exit (1);
    }
    else if (arg_count > argc - 2) {
     fprintf (stderr, "Incomplete sequence number specification.\n"); exit (1);
    }
    else if (!infocon_is_int (argv [arg_count + 1], &counter)) {
     fprintf (stderr, "Illegal sequence number (%s).\n", argv [arg_count + 1]);
     exit (1);
    }
    else if (counter < 0) {
     fprintf (stderr, "Negative values not permitted for sequence number.\n");
     exit (1);
    }
    else { counter_locked = TRUE; arg_count += 2; }
   }
   else if (!strcmp (argv [arg_count], "-m")) {     /* Subtract average info */
    if (average_locked) {
     fprintf (stderr, "Duplicate request for subtracting average info.\n");
     exit (1);
    }
    else {
     average_subtract = TOTAL_AVERAGE; average_locked = TRUE; arg_count++;
    }
   }
   else if (!strcmp (argv [arg_count], "-b")) {  /* Subtract bounded average */
    if (average_locked) {
     fprintf (stderr, "Duplicate request for subtracting average info.\n");
     exit (1);
    }
    else {
     average_subtract = ACTIVE_AVERAGE; average_locked = TRUE; arg_count++;
    }
   }
   else if (!strcmp (argv [arg_count], "-o")) { /* Not subtract average info */
    if (average_locked) {
     fprintf (stderr, "Duplicate request for subtracting average info.\n");
     exit (1);
    }
    else { average_subtract = NO_AVERAGE; average_locked = TRUE; arg_count++; }
   }
   else if (!strcmp (argv [arg_count], "-d")) {      /* Subtract factor req. */
    if (factor_locked) {
     fprintf (stderr, "Duplicate subtract amount request.\n"); exit (1);
    }
    else if (arg_count > argc - 2) {
     fprintf (stderr, "Incomplete subtract amount specification.\n"); exit (1);
    }
    else if (!infocon_is_float (argv [arg_count + 1], &adjustment)) {
     fprintf (stderr, "Illegal amount for subtracting (%s).\n",
                      argv [arg_count + 1]);
     exit (1);
    }
    else { factor_locked = TRUE; arg_count += 2; }
   }
   else if (!strcmp (argv [arg_count], "-l")) {     /* Region length request */
    if (length != UNDEFINED) {
     fprintf (stderr, "Minimal length of full intervals already set to %d.\n",
              length);
     exit (1);
    }
    else if (arg_count > argc - 2) {
     fprintf (stderr, "Incomplete length specification.\n"); exit (1);
    }
    else if (!infocon_is_int (argv [arg_count + 1], &length)) {
     fprintf (stderr, "Illegal minimal region length (%s).\n",
                      argv [arg_count + 1]);
     exit (1);
    }
    else if (length <= 0) {
     fprintf (stderr, "Positive length required for full intervals.\n");
     exit (1);
    }
    else arg_count += 2;
   }
   else if (!strcmp (argv [arg_count], "-a")) {  /* Active sequences request */
    if (active != UNDEFINED) {
     fprintf (stderr,
              "Minimal number of active sequences already set to %d.\n",
              active);
     exit (1);
    }
    else if (arg_count > argc - 2) {
     fprintf (stderr, "Incomplete active sequences specification.\n");
     exit (1);
    }
    else if (!infocon_is_int (argv [arg_count + 1], &active)) {
     fprintf (stderr, "Illegal minimal number of active sequences (%s).\n",
                      argv [arg_count + 1]);
     exit (1);
    }
    else if (active <= 0) {
     fprintf (stderr, "Positive number of active sequences required.\n");
     exit (1);
    }
    else arg_count += 2;
   }
   else if (!strcmp (argv [arg_count], "-g")) {            /* Count the gaps */
    if (strictness_set) {
     fprintf (stderr, "Kind of requested regions already set.\n"); exit (1);
    }
    else { gaps = GENERAL_GAPS; strictness_set = TRUE; arg_count++; }
   }
   else if (!strcmp (argv [arg_count], "-i")) {           /* Ignore the gaps */
    if (strictness_set) {
     fprintf (stderr, "Kind of requested regions already set.\n"); exit (1);
    }
    else { gaps = IGNORE_GAPS; strictness_set = TRUE; arg_count++; }
   }
   else if (!strcmp (argv [arg_count], "-x")) {         /* Prohibit the gaps */
    if (strictness_set) {
     fprintf (stderr, "Kind of requested regions already set.\n"); exit (1);
    }
    else { gaps = EXCLUDE_GAPS; strictness_set = TRUE; arg_count++; }
   }
   else {                                              /* Unknown "-" option */
    fprintf (stderr, "usage: %s <alignment_file>\n", argv [0]);
    fprintf (stderr, "               [-I <directory_path>]*\n");
    fprintf (stderr, "               [-r <from> <to> [-s <sequence>]]\n");
    fprintf (stderr,
                 "               [-c <counter_sequence> | -n <seq_number>]\n");
    fprintf (stderr, "               [-m | -b | -o]\n");
    fprintf (stderr, "               [-d <decrease_factor>]\n");
    fprintf (stderr, "               [-l <min_length_of_region]\n");
    fprintf (stderr, "               [-a <min_active_sequences>]\n");
    fprintf (stderr, "               [-g | -i | -x]\n");
    exit (1);
   }
  }
  /* Check whether all necessary data have been collected and consistent     */

  if (file_name == NULL) {
   fprintf (stderr, "Must have an alignment to process.\n"); exit (1);
  }
  else if ((range_set) && (sequence == NULL)) { /* Range in default sequence */
   sequence = NTL0_strsave (DEFAULT_RANGE_SEQUENCE);
  }
  else if (!range_set) {  /* Range for loading not provided - assume default */
   if (sequence != NULL) {
    fprintf (stderr, "Sequence '%s' provided without range.\n", sequence);
    exit (1);
   }
   else {
    sequence = NTL0_strsave (DEFAULT_SEQUENCE);
    start = DEFAULT_RANGE_START; stop = DEFAULT_RANGE_STOP;
   }
  }
  /* Set up to defaults all values not explicitely assigned in command line  */

  if (!counter_locked) counter = DEFAULT_COUNTER_NUMBER;
  if (!average_locked) average_subtract = DEFAULT_AVERAGE_SUBTRACT;
  if (!factor_locked) adjustment = DEFAULT_ADJUSTMENT;
  if (length == UNDEFINED) length = DEFAULT_MINIMAL_LENGTH;
  if (active == UNDEFINED) active = DEFAULT_MINIMAL_ACTIVE;
  if (!strictness_set) gaps = DEFAULT_GAP_TOLERANCE;

  save_count = NTL0_strsave (counter_seq);

  /* Proceed to collect (load) the data to be processed by the algorithm     */

  if (infocon_load_data (registered_paths, file_name, sequence, start, stop,
                         average_subtract, active, gaps, &alignment,
                     &range_len, &counter_seq, &counter, &track, &average) < 0)
   exit (1);

  infocon_output_header (file_name, sequence, start, stop, save_count, counter,
                         average_subtract, average, adjustment, length, active,
                         gaps);

  adjustment += average;

  if (infocon_process_data (range_len, counter_seq, track, alignment,
                            adjustment, length, active, gaps) < 0)
   exit (1);

  exit (0);
 }
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: infocon_load_data                                              */
/*                                                                           */
/* Procedure collects alignment data with respect to provided settings and   */
/*   returns them within its reference parameters; returns 0 if everything   */
/*   is OK, negative status otherwise                                        */

int infocon_load_data (strlist_ptr paths, char *file_name, char *sequence,
                       long int start, long int stop, int average_subtract,
                       int active, int gaps, restricted_ptr *alignment,
                       long int *range_len, char **counter_seq, int *counter,
                       long int *track, float *average)
{
 strlist_ptr current_path; char *new_path;
 line_ptr line_al, last_line; long int al_scan, col_count;
 bool found; int index, count, na, ng, nt, nc, no, ngap, total, col_scan;
 header_ptr file_info; align_ptr packed; unpacked_ptr expanded;
 double fa, fc, fg, ft, fgap, da, dc, dg, dt, dgap; errind err_stat;

 /* Load the alignment into an internal structure first                      */

 if ((paths == NULL) ||
     (file_name [0] == '/') || (file_name [0] == '~')) {
  if ((err_stat = NTL2_Load_AlignFile (file_name, &file_info)) != NULL) {
   fprintf (stderr, "Can't open alignment file '%s' (%s).\n",
                    file_name, err_stat -> message);
   return -1;
  }
  else if (file_info == NULL) {        /* File found, but nothing retrieved */
   fprintf (stderr, "No alignment to process.\n"); return -2;
  }
 }
 else {                     /* Directory path(s) provided, try them in order */
  current_path = paths; file_info = NULL;
  while ((file_info == NULL) && (current_path != NULL)) {
   new_path = NTL0_ckalloc ((strlen (current_path -> string) +
                             strlen (file_name) + 2) * sizeof (char));
   strcpy (new_path, current_path -> string);
   if (new_path [strlen (current_path -> string) - 1] != '/')
    strcat (new_path, "/");
   strcat (new_path, file_name);

   if ((err_stat = NTL2_Load_AlignFile (new_path, &file_info)) != NULL) {
    file_info = NULL; current_path = current_path -> next;
   }
   free (new_path);
  }
  if (err_stat != NULL) {                      /* File not found on any path */
   fprintf (stderr, "Can't open alignment file '%s' (%s).\n",
                    file_name, err_stat -> message);
   return -3;
  }
  else if (file_info == NULL) {         /* File found, but nothing retrieved */
   fprintf (stderr, "No alignment to process.\n"); return -4;
  }
 }
 /* Now proceed to partially unpack the information from the loaded file     */

 if ((err_stat = NTL1_Load_Alignment (1, file_info, &packed)) != NULL) {
  fprintf (stderr, "Can't extract the alignment from the file (%s).\n",
                   err_stat -> message);
  return -5;
 }
 else {
  if ((err_stat = NTL2_Unpack_Alignment (1, file_info, packed, paths,
                                         &expanded)) != NULL) {
   fprintf (stderr, "Can't expand the alignment (%s).\n", err_stat -> message);
   return -6;
  }
  else {                          /* Expansion of the full alignment done OK */

   /* Count the occurences of relevant characters in sequences, where active */

   total = na = nc = ng = nt = ngap = 0;
   for (al_scan = 0; al_scan < expanded -> size; al_scan++) {
    for (col_scan = 0; col_scan < expanded -> dimension; col_scan++) {
     if ((expanded -> texts) [col_scan] [al_scan] == 'A') {
      na++; total++;
     }
     else if ((expanded -> texts) [col_scan] [al_scan] == 'C') {
      nc++; total++;
     }
     else if ((expanded -> texts) [col_scan] [al_scan] == 'G') {
      ng++; total++;
     }
     else if ((expanded -> texts) [col_scan] [al_scan] == 'T') {
      nt++; total++;
     }
     else if ((expanded -> texts) [col_scan] [al_scan] == GAP_SYMBOL) {
      if (((expanded -> starts) [col_scan] <= al_scan) &&
          ((expanded -> stops) [col_scan] >= al_scan))
       ngap++;
     }
    }
   }
   if (gaps == GENERAL_GAPS) {
    pa = ((double) na) / ((double) (total + ngap));
    pc = ((double) nc) / ((double) (total + ngap));
    pg = ((double) ng) / ((double) (total + ngap));
    pt = ((double) nt) / ((double) (total + ngap));
    pgap = ((double) ngap) / ((double) (total + ngap));
   }
   else {                           /* Gaps either to be ignored or excluded */
    pa = ((double) na) / ((double) total);
    pc = ((double) nc) / ((double) total);
    pg = ((double) ng) / ((double) total);
    pt = ((double) nt) / ((double) total);
    pgap = 1.0;
   }

   /* If subtraction of the average info is requested, calculate it          */

   if (average_subtract != NO_AVERAGE) {
    *average = 0.0; col_count = 0;
    for (al_scan = 0; al_scan < expanded -> size; al_scan++) {
     total = na = nc = ng = nt = no = ngap = 0;
     for (col_scan = 0; col_scan < expanded -> dimension; col_scan++) {
      if (((expanded -> segment_code) [col_scan] == VALID_SEGMENT) &&
          (al_scan >= (expanded -> starts) [col_scan]) &&
          (al_scan <= (expanded -> stops) [col_scan])) {
       total++;
       if ((expanded -> texts) [col_scan] [al_scan] == GAP_SYMBOL)
        ngap++;
       else if ((expanded -> texts) [col_scan] [al_scan] == 'A') na++;
       else if ((expanded -> texts) [col_scan] [al_scan] == 'C') nc++;
       else if ((expanded -> texts) [col_scan] [al_scan] == 'G') ng++;
       else if ((expanded -> texts) [col_scan] [al_scan] == 'T') nt++;
       else no++;
      }
     }
     if ((average_subtract != ACTIVE_AVERAGE) || (total >= active)) {
      if ((no == 0) && ((ngap == 0) || (gaps != EXCLUDE_GAPS))) {
       if (gaps == IGNORE_GAPS) { total -= ngap; ngap = 0; }

       if (total == 0) {
        fprintf (stderr, "Illegal column in the alignment (%ld).\n", al_scan);
        return -7;
       }
       else {             /* Calculate the information content of the column */
        fa = ((double) na) / ((double) total);
        fc = ((double) nc) / ((double) total);
        fg = ((double) ng) / ((double) total);
        ft = ((double) nt) / ((double) total);
        fgap = ((double) ngap) / ((double) total);

        da = fa / pa; dg = fg / pg; dt = ft / pt; dc = fc / pc;
        dgap = fgap / pgap;

        if (da > FLOAT_LN_LIMIT) da = log (da); else da = 0.0;
        if (dg > FLOAT_LN_LIMIT) dg = log (dg); else dg = 0.0;
        if (dt > FLOAT_LN_LIMIT) dt = log (dt); else dt = 0.0;
        if (dc > FLOAT_LN_LIMIT) dc = log (dc); else dc = 0.0;

        if (dgap > FLOAT_LN_LIMIT) dgap = log (dgap); else dgap = 0.0;
        
        *average = (*average) +
           (float) (fa * da + fg * dg + ft * dt + fc * dc + fgap * dgap);
        col_count++;
       }
      }
     }
    }
    if (col_count > 0) *average = (*average) / ((float) col_count);
    else *average = 0.0;
   }
   else *average = 0.0;

   /* Check if a range within the alignment is requested, and cut it to size */

   if (sequence != NULL) {
    if ((err_stat = NTL1_Cut_Alignment (file_info, expanded, sequence,
                                        start, stop)) != NULL) {
     if (err_stat -> kind == WARNING) {
      fprintf (stderr, "WARNING:  %s.\n", err_stat -> message);
     }
     else {
      fprintf (stderr, "Can't get the range from the alignment (%s).\n",
                       err_stat -> message);
      return -8;
     }
    }
   }
  }
 }
 /* Locate the sequence in the alignment that sets the counter for output    */

 if (*counter != UNDEFINED) {            /* Selection by the sequence number */
  if (*counter > expanded -> dimension) {
   fprintf (stderr,
            "Counter sequence (%d) not in the alignment (of dimension %d).\n",
            *counter, expanded -> dimension);
   return -9;
  }
  else if ((expanded -> segment_code) [(*counter) - 1] != VALID_SEGMENT) {
   fprintf (stderr, "Counter sequence (%d) not active in the range.\n",
                    *counter);
   return -10;
  }
  else {                         /* Existing and valid sequence in the range */
   *counter_seq = (expanded -> texts) [(*counter) - 1];
   *range_len = expanded -> size;
   *track = (expanded -> begin) [(*counter) - 1];
  }
 }
 else if (*counter_seq == NULL) {
  fprintf (stderr, "Must have a way to locate the counter sequence.\n");
  return -11;
 }
 else {                                /* Counter sequence given by its name */
  index = 0; found = FALSE;
  while ((!found) && (index < expanded -> dimension)) {
   if ((expanded -> segment_code) [index] == VALID_SEGMENT) {
    if (!strcmp (((file_info -> sequences) [index]).seq_name, *counter_seq))
     found = TRUE;
    else index++;
   }
   else index++;
  }
  if (!found) {
   fprintf (stderr, "Counter sequence '%s' not found in the selected range.\n",
                    *counter_seq);
   return -12;
  }
  index++; *counter = index; found = FALSE;
  while ((!found) && (index < expanded -> dimension)) {
   if ((expanded -> segment_code) [index] == VALID_SEGMENT) {
    if (!strcmp (((file_info -> sequences) [index]).seq_name, *counter_seq))
     found = TRUE;
    else index++;
   }
   else index++;
  }
  if (found) {
   fprintf (stderr, "Ambiguos counter sequence '%s' in the selected range.\n",
            *counter_seq);
   return -13;
  }
  else {                        /* Unique sequence found with the given name */
   free (*counter_seq);
   *counter_seq = (expanded -> texts) [(*counter) - 1];
   *range_len = expanded -> size;
   *track = (expanded -> begin) [(*counter) - 1];
  }
 }
 /* Now copy expanded alignment into more compact structure for processing   */

 *alignment = (restricted_ptr) NTL0_ckalloc (sizeof (Restricted_Struct));

 count = 0;
 for (index = 0; index < expanded -> dimension; index++) {
  if ((expanded -> segment_code) [index] == VALID_SEGMENT) count++;
 }
 (*alignment) -> dimension = count;
 (*alignment) -> size = expanded -> size;

 (*alignment) -> texts = (char **) NTL0_ckalloc (count * sizeof (char *));
 (*alignment) -> begin = (long int *)
                         NTL0_ckalloc (count * sizeof (long int *));
 (*alignment) -> end = (long int *)
                       NTL0_ckalloc (count * sizeof (long int *));
 (*alignment) -> starts = (long int *)
                          NTL0_ckalloc (count * sizeof (long int *));
 (*alignment) -> stops = (long int *)
                         NTL0_ckalloc (count * sizeof (long int *));

 count = 0;
 for (index = 0; index < expanded -> dimension; index++) {
  if ((expanded -> segment_code) [index] == VALID_SEGMENT) {
   ((*alignment) -> texts) [count] = (expanded -> texts) [index];
   ((*alignment) -> begin) [count] = (expanded -> begin) [index];
   ((*alignment) -> end) [count] = (expanded -> end) [index];
   ((*alignment) -> starts) [count] = (expanded -> starts) [index];
   ((*alignment) -> stops) [count] = (expanded -> stops) [index];
   count++;
  }
 }
 /* Destroy the "file_info" structure, since its contents are not needed     */

 /* ### To be defined ###################################################### */

 /* Destroy "partially unpacked" structure used in the expansion - done      */

 if (packed -> begin != NULL) free (packed -> begin);
 if (packed -> end != NULL) free (packed -> end);

 if (packed -> lines != NULL) {
  for (index = 0; index < expanded -> dimension; index++) {
   line_al = (packed -> lines) [index]; while (line_al != NULL) {
    last_line = line_al; line_al = line_al -> next; free (last_line);
   }
  }
 }
 free (packed);

 /* Destroy the expanded alignment info now when the restricted copy made    */

 /* ### To be defined ###################################################### */

 return 0;
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: infocon_output_header                                          */
/*                                                                           */
/* Procedure writes a number of comment lines in the output - output type    */
/*   identification and data about the settings under which it was           */
/*   generated, alignment file name and requested range, counter data,       */
/*   adjustments done to the calculated information content, minimal number  */
/*   of active sequences required and the kind of requested regions          */

void infocon_output_header (char *file_name, char *sequence, long int start,
                            long int stop, char *counter_seq, int counter,
                            int average_subtract, float average,
                            float adjustment, int length, int active, int gaps)
{
 printf ("#:plain:\n\n");
 printf ("# Output of the high information content region location.\n");
 printf ("# Alignment file:  '%s'\n", file_name);
 if (sequence != NULL)
  printf ("#   restricted to the range [%ld,%ld] in \"%s\".\n",
          start, stop, sequence);
 if (counter_seq != NULL)
  printf ("# Counting done in '%s' (sequence %d in the alignment).\n",
          counter_seq, counter);
 else printf ("# Counting done in sequence %d of the alignment.\n", counter);
 if (average_subtract != NO_AVERAGE) {
  printf (
    "# Average information for the alignment subtracted from columns.\n");
  if (average_subtract == TOTAL_AVERAGE)
   printf ("#   Average information based on all alignment columns.\n");
  else {
   printf ("#   Average information based on columns meeting minimal active");
   printf (" sequences criteria.\n");
  }
  printf ("#     Calculated average information content: %5.3f\n", average);
 }
 if ((adjustment < -EPSILON) || (adjustment > EPSILON))
  printf ("# Column scores adjusted by subtracting: %5.3f\n", adjustment);
 printf ("# Minimal length of the region required:  %d\n", length);
 printf ("# Minimal number of active sequences required in regions:  %d\n",
         active);
 printf ("# Kind of regions requested:  ");
 if (gaps == GENERAL_GAPS)
  printf ("general (no special treatment of gaps).\n\n\n");
 else if (gaps == IGNORE_GAPS)
  printf (
    "ignored (gap at any position is treated as if it does not exist).\n\n\n");
 else
  printf ("strict (no gaps permitted within regions).\n\n\n");
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: infocon_process_data                                           */
/*                                                                           */

int infocon_process_data (long int range_len, char *counter_seq,
                          long int track, restricted_ptr alignment,
                          float adjustment, int length, int active, int gaps)
{
 long int lines_in_output, *low, *high, index, num_intervals, scanner;
 long int low_track, high_track, spell;
 char *consensus; int total, na, nc, ng, nt, no, ngap, row, gap_mode;
 float *scores; double fa, fc, fg, ft, fgap, da, dc, dg, dt, dgap;

 lines_in_output = 0;

 if (alignment -> dimension < active) {
  fprintf (stderr,
           "WARNING: The number of active sequences within the region (%d)\n",
           alignment -> dimension);
  fprintf (stderr, "         is less than minimum required (%d).\n", active);
 }
 else if (range_len < length) {
  fprintf (stderr, "WARNING: The total length of the segment (%ld)\n",
           range_len);
  fprintf (stderr, "         is less than minimal length of a region (%d).\n",
           length);
 }
 else {
  /* Scan the alignment to score all its columns first                       */

  scores = (float *) NTL0_ckalloc ((range_len + 1) * sizeof (float));

  for (index = 0; index < range_len; index++) {
   total = na = nc = ng = nt = no = ngap = 0;
   for (row = 0; row < alignment -> dimension; row++) {
    if (((alignment -> starts) [row] <= index) &&
        ((alignment -> stops) [row] >= index)) {
     total++;
     if ((alignment -> texts) [row] [index] == GAP_SYMBOL) ngap++;
     else if ((alignment -> texts) [row] [index] == 'A') na++;
     else if ((alignment -> texts) [row] [index] == 'C') nc++;
     else if ((alignment -> texts) [row] [index] == 'G') ng++;
     else if ((alignment -> texts) [row] [index] == 'T') nt++;
     else no++;
    }
   }
   if (no > 0) {
    fprintf (stderr,
          "WARNING: Column %ld contains ambiguity codes - no info content.\n",
             index);
    scores [index] = -INFINITY;
   }
   else if ((ngap > 0) && (gaps == EXCLUDE_GAPS)) scores [index] = -INFINITY;
   else if (total < active)  scores [index] = -INFINITY;
   else {
    if (gaps == IGNORE_GAPS) { total -= ngap; ngap = 0; }

    if (total == 0) {
     fprintf (stderr, "Illegal column (%ld) in the alignment.\n", index);
     return -101;
    }
    else {          /* Calculate and record the column's information content */
     fa = ((double) na) / ((double) total);
     fc = ((double) nc) / ((double) total);
     fg = ((double) ng) / ((double) total);
     ft = ((double) nt) / ((double) total);
     fgap = ((double) ngap) / ((double) total);

     da = fa / pa; dg = fg / pg; dt = ft / pt; dc = fc / pc;
     dgap = fgap / pgap;

     if (da > FLOAT_LN_LIMIT) da = log (da); else da = 0.0;
     if (dc > FLOAT_LN_LIMIT) dc = log (dc); else dc = 0.0;
     if (dg > FLOAT_LN_LIMIT) dg = log (dg); else dg = 0.0;
     if (dt > FLOAT_LN_LIMIT) dt = log (dt); else dt = 0.0;

     if (dgap > FLOAT_LN_LIMIT) dgap = log (dgap); else dgap = 0.0;

     scores [index] = (float)
                      (fa * da + fg * dg + ft * dt + fc * dc + fgap * dgap);
     scores [index] = scores [index] - adjustment;
    }
   }
  }
  /* Determine all full runs within the established vector of scores         */

  infocon_full_runs (scores, range_len, &num_intervals, &low, &high);
  free (scores);

  if (num_intervals > 0) {     /* There are some full runs in this alignment */

   /* Determine the majority character consensus for the entire alignment    */

   if (gaps == GENERAL_GAPS) gap_mode = ORDINARY_GAP;   /* As any other char */
   else if (gaps == IGNORE_GAPS) gap_mode = PROHIBIT_GAP; /* No gaps in con. */
   else gap_mode = PROMOTE_GAP;      /* Promote gaps to center, on occurence */

   consensus = NTL1_Consensus_Sequence (alignment, gap_mode);

   /* Proceed to report all the full runs with their consensus sequences     */

   scanner = 0; for (index = 0; index < num_intervals; index++) {
    while (scanner < low [index]) {
     if (counter_seq [scanner] != GAP_SYMBOL) track++; scanner++;
    }
    low_track = track; while (scanner < high [index]) {
     if (counter_seq [scanner] != GAP_SYMBOL) track++; scanner++;
    }
    if (counter_seq [scanner] == GAP_SYMBOL) {
     if (track > low_track) high_track = track - 1; else high_track = track;
    }
    else high_track = track;

    if (high [index] - low [index] + 1 >= length) {
     printf ("%ld %ld ", low_track, high_track);
     for (spell = low [index]; spell <= high [index]; spell++)
      printf ("%c", consensus [spell]);
     printf ("\n"); lines_in_output++;
    }
   }
  }
 }
 printf ("\n\n# Total number of regions detected:  %ld\n", lines_in_output);
 return 0;
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: infocon_full_runs                                              */
/*                                                                           */

void infocon_full_runs (float *scores, long int range_len,
                        long int *intervals, long int **low_ends,
                        long int **high_ends)
{
 long int *low, *high, range_start, range_stop, current, scan, items;
 bool all_positive, all_negative; float *low_score, *high_score, *sigma;

 /* Allocate space required by the implementation of full runs algorithm     */

 low = (long int *) NTL0_ckalloc (range_len * sizeof (long int));
 high = (long int *) NTL0_ckalloc (range_len * sizeof (long int));
 low_score = (float *) NTL0_ckalloc (range_len * sizeof (float));
 high_score = (float *) NTL0_ckalloc (range_len * sizeof (float));

 /* Allocate space needed for the temporary scores and output delivery str.  */

 sigma = (float *) NTL0_ckalloc ((range_len + 1) * sizeof (float));
 *low_ends = (long int *) NTL0_ckalloc (range_len * sizeof (long int));
 *high_ends = (long int *) NTL0_ckalloc (range_len * sizeof (long int));

 /* Initialize the internal control variables for patching the score array   */

 range_start = 0; range_stop = 0; *intervals = 0;
 while (range_stop < range_len) {
  while ((range_start < range_len) &&
         (scores [range_start] <= -INFINITY + FP_SHIELD)) range_start++;
  range_stop = range_start;

  if (range_stop < range_len) {
   all_positive = TRUE; all_negative = TRUE;
   while ((range_stop < range_len) &&
          (scores [range_stop] > -INFINITY + FP_SHIELD)) {
    if (scores [range_stop] < 0.0) all_positive = FALSE;
    else all_negative = FALSE;
    range_stop++;
   }
   if (all_positive) {
    (*low_ends) [*intervals] = range_start;
    (*high_ends) [*intervals] = range_stop - 1;
    (*intervals)++;
   }
   else if (!all_negative) {
    current = range_stop - range_start;
    for (scan = 0; scan < current; scan++)
     sigma [scan] = scores [range_start + scan];

    /* Determine all full runs within the copied vector                      */

    NTL0_Full_Runs (sigma, current, &items, low, high, low_score, high_score);

    for (scan = 0; scan < items; scan++) {
     (*low_ends) [*intervals] = range_start + low [scan];
     (*high_ends) [*intervals] = range_start + high [scan];
     (*intervals)++;
    }
   }
   range_start = range_stop;
  }
 }
 free (sigma); free (low); free (high); free (low_score); free (high_score);
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: infocon_is_long                                                */
/*                                                                           */
/* Procedure determines whether the input string represents a long integer   */
/*   number, and places its value into a reference parameter; returns TRUE   */
/*   if a correct number has been extracted, FALSE otherwise                 */

bool infocon_is_long (char *string, long int *value)
{
 char *scan, *num_start, saver; int count; long int long_val; bool negative;

 scan = string; *value = 0;
 while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++;
 if (*scan == '\0') return FALSE;
 else {
  if (*scan == '-') {
   negative = TRUE; scan++;
   while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++;
  }
  else negative = FALSE;

  if (*scan == '\0') return FALSE;
  else if (*scan == '0') {
   if (negative) return FALSE;
   else {
    scan++;
    while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++;
    *value = 0;
    if (*scan == '\0') return TRUE; else return FALSE;
   }
  }
  else {
   count = 0; num_start = scan; while ((*scan >= '0') && (*scan <= '9')) {
    count++; scan++;
   }
   if ((count < 1) || (count > MAX_LONG_LEN)) return FALSE;
   else {
    if ((*scan != '\0') && (*scan != ' ') &&
        (*scan != '\t') && (*scan != '\n')) return FALSE;
    else {
     saver = *scan; *scan = '\0'; long_val = atol (num_start);
     *scan = saver;

     while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++;
     if (*scan != '\0') return FALSE;
     else {
      if (negative) *value = -long_val; else *value = long_val; return TRUE;
     }
    }
   }
  }
 }
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: infocon_is_int                                                 */
/*                                                                           */
/* Procedure determines whether the input string represents an integer       */
/*   number, and places its value into a reference parameter; returns TRUE   */
/*   if a correct number has been extracted, FALSE otherwise                 */

bool infocon_is_int (char *string, int *value)
{
 long int long_val;

 if (!infocon_is_long (string, &long_val)) { *value = 0; return FALSE; }
 else if ((long_val < MININT_VALUE) || (long_val > MAXINT_VALUE)) {
  *value = 0; return FALSE;
 }
 else { *value = (int) long_val; return TRUE; }
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: infocon_is_float                                               */
/*                                                                           */
/* Procedure determines whether the input string represents a floating point */
/*   (real) number, and places its value into a reference parameter; returns */
/*   TRUE if a correct number has been extracted, FALSE otherwise            */

bool infocon_is_float (char *string, float *value)
{
 char *s; bool negative, fraction; int whole, frac, count; float real_frac;

 s = string; whole = 0; frac = 0; count = 0;                   /* Initialize */

 /* Consult the first symbol in the string and set the appropriate controls  */

 if (*s == '-') { negative = TRUE; s++; if (*s == '\0') return FALSE; }
 else negative = FALSE;
 if (*s == '.') { fraction = TRUE; s++; if (*s == '\0') return FALSE; }
 else fraction = FALSE;

 if ((*s == '0') && (s [1] != '\0') && (s [1] != '.')) return FALSE;

 else {            /* Proceed to collect the number in the part of the value */
  while ((*s >= '0') && (*s <= '9')) {
   if (!fraction) {                /* The collected part is the whole number */
    whole = whole * 10 + (int) (*s - '0'); count++; s++;
    if (count > MAX_WHOLE) return FALSE;
   }
   else {           /* The collected part is the fraction part of the number */
    frac = frac * 10 + (int) (*s - '0'); count++; s++;
    if (count > MAX_FRAC) return FALSE;
   }
  }
  if ((*s != '.') && (*s != '\0')) return FALSE;
  else if ((*s == '.') && (fraction)) return FALSE;
  else if (*s == '.') {
   fraction = TRUE; count = 0; s++; while ((*s >= '0') && (*s <= '9')) {
    frac = frac * 10 + (int) (*s - '0'); count++; s++;
    if (count > MAX_FRAC) return FALSE;
   }
   if (*s != '\0') return FALSE;
  }
  *value = (float) whole; real_frac = (float) frac;
  if (fraction) {
   while (count > 0) { real_frac = real_frac / 10; count--; }
  }
  *value += real_frac; if (negative) *value = -(*value);
  return TRUE;
 }
}


