
static char const rcsid [] = "$Id: kkno.c,v 1.2 1999/08/10 04:07:24 cathy Exp $";


/*****************************************************************************/
/*                                                                           */
/* Program: kkno (implementation of the "known center" algorithm)            */
/*               (formerly "khum")                                           */
/*                                                                           */
/* Author: Nikola Stojanovic                                                 */
/*                                                                           */
/* Revision:    12 SEP 96   Version 1.0                                      */
/*              24 APR 97   Version 2.0                                      */
/*                                                                           */
/*   Given the alignment (file and the sequences) and the center sequence,   */
/* program determines the conserved regions within it, using the "no more    */
/* than k differences with the given center" rule, and the settings provided */
/* by the command-line options. If the center sequence is not privided, the  */
/* first sequence in the alignment, assumed "human", is used as the center   */
/*                                                                           */
/*****************************************************************************/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ntl.h"

/*****************************************************************************/
/*                                                                           */
/* Definitions section                                                       */
/*                                                                           */
/*****************************************************************************/


/*****************************************************************************/
/* Definitions of the constants of the program unit                          */
/*****************************************************************************/



#define DEFAULT_REFERENCE                "human"
#define DEFAULT_RNUM                     1

#define DEFAULT_PATH                     "."

#define DEFAULT_RANGE_ROW                NULL
#define DEFAULT_LOW                      0
#define DEFAULT_HIGH                     0
#define DEFAULT_SEQ_LOW                  0
#define DEFAULT_SEQ_HIGH                 0

#define GENERAL                          1
#define RESTRICTED                       2
#define STRICT                           3

#define DEFAULT_MAX_MISMATCHES           1
#define DEFAULT_MIN_LENGTH               6
#define DEFAULT_MIN_ACTIVE               3
#define DEFAULT_STRICTNESS               RESTRICTED

#define MAX_LONG_LEN                     9
#define MININT_VALUE                     -32000
#define MAXINT_VALUE                     32000



/*****************************************************************************/
/* Types used in the program unit                                            */
/*****************************************************************************/



/*****************************************************************************/
/* Prototypes of locally used functions of the program unit                  */
/*****************************************************************************/


int khum_load_data (strlist_ptr paths, char *alignment_file, char *range_row,
                    long int al_start, long int al_stop, char *sequence_file,
                    bool seq_range_set, long int seq_start, long int seq_stop,
                    char *seq_name, int seq_number, char **center,
                    long int *range_length, restricted_ptr *alignment,
                    long int *track);
void khum_output_header (char *alignment_file, char *range_row,
                         bool al_range_set, long int al_start,
                         long int al_stop, char *sequence_file,
                         bool seq_range_set, long int seq_start,
                         long int seq_stop, char *seq_name,
                         int seq_number, int max_mismatches,
                         int min_length, int min_active, int strictness);
int khum_process_data (long int length, char *center,
                       restricted_ptr alignment, int max_mismatches,
                       int min_length, int min_active, int strictness,
                       long int start_track);
bool khum_process_column (long int position, char *center,
                          restricted_ptr alignment, int max_mismatches,
                          int min_active, int strictness, int *mismatches,
                          int *violates);
bool khum_subsumes (char set, char subset);
void khum_report_region (long int start, long int stop, char *center,
                         long int start_track, long int end_track);
bool khum_is_long (char *string, long int *value);
bool khum_is_int (char *string, int *value);

/*****************************************************************************/
/*                                                                           */
/* Code section                                                              */
/*                                                                           */
/*****************************************************************************/


/*****************************************************************************/
/*                                                                           */
/* Procedure: main                                                           */
/*                                                                           */
/* "main" procedure of the program. Receives and analyses the command line   */
/*   parameters, sets the control variables of the program, checks their     */
/*   consistency and passes control to internal procedures which actually    */
/*   process the input data; returns 0 if everything is OK, non-zero status  */
/*   in case of any errors                                                   */

int main (int argc, char **argv)
{
 char *sequence_file, *seq_name, *alignment_file, *range_row, *center;
 strlist_ptr registered_paths, new_path, path_scan;
 bool seq_range_set, seq_number_set, al_range_set, mismatches_set, length_set;
 bool active_set, strictness_set, range_row_set;
 long int seq_start, seq_stop, al_start, al_stop, range_length;
 int seq_number, max_mismatches, min_length, min_active, strictness, arg_count;
 restricted_ptr alignment; long int track;

 /* Initialize the default settings and prepare for processing of arguments  */
 
 sequence_file = NULL; seq_range_set = FALSE;
 seq_start = DEFAULT_SEQ_LOW; seq_stop = DEFAULT_SEQ_HIGH;
 seq_name = NULL; seq_number_set = FALSE; seq_number = 0;
 alignment_file = NULL; range_row_set = FALSE; range_row = DEFAULT_RANGE_ROW;
 al_range_set = FALSE; al_start = DEFAULT_LOW; al_stop = DEFAULT_HIGH;
 if (DEFAULT_PATH == NULL) registered_paths = NULL;
 else {
  registered_paths = (strlist_ptr) NTL0_ckalloc (sizeof (StrList_Struct));
  registered_paths -> string = DEFAULT_PATH;
  registered_paths -> next = NULL;
 }
 mismatches_set = FALSE; max_mismatches = DEFAULT_MAX_MISMATCHES;
 length_set = FALSE; min_length = DEFAULT_MIN_LENGTH;
 active_set = FALSE; min_active = DEFAULT_MIN_ACTIVE;
 strictness_set = FALSE; strictness = DEFAULT_STRICTNESS;
 
 if (argc < 2) {
  fprintf (stderr, "usage: %s <alignment_file>\n", argv [0]);
  fprintf (stderr, "            [-I <directory_path>]*\n");
  fprintf (stderr, "            [-r <from> <to>]\n");
  fprintf (stderr, "            [-p <pivot_sequence>]\n");
  fprintf (stderr, "            [-c <center_sequence_file_name>]\n");
  fprintf (stderr, "            [-i <from> <to>]\n");
  fprintf (stderr, "            [-n <center_sequence_name>]\n");
  fprintf (stderr, "            [-o <center_sequence_number>]\n");
  fprintf (stderr, "            [-k <max_mismatches_per_sequence>]\n");
  fprintf (stderr, "            [-l <min_length_of_conserved_region]\n");
  fprintf (stderr, "            [-a <min_active_sequences>]\n");
  fprintf (stderr, "            [-g | -h | -s]\n");
  exit (1);
 }
 else {                           /* Some parameters provided - process them */
 
  /* Proceed to extract the command line parameters and create the settings  */

  arg_count = 1; while (arg_count < argc) {
  
   if (argv [arg_count] [0] != '-') {       /* Not an "-" option - file name */
    if (alignment_file != NULL) {
     fprintf (stderr, "Repeated alignment file name ('%s').\n",
                      argv [arg_count]);
     exit (1);
    }
    else { alignment_file = NTL0_strsave (argv [arg_count]); arg_count++; }
   }
   else if (!strcmp (argv [arg_count], "-I")) {        /* New "include" path */
    arg_count++;
    if (arg_count == argc) {
     fprintf (stderr, "Missing directory path for inclusion.\n"); exit (1);
    }
    else {
     new_path = (strlist_ptr) NTL0_ckalloc (sizeof (StrList_Struct));
     new_path -> string = NTL0_strsave (argv [arg_count]);
     new_path -> next = NULL;
     
     if (registered_paths == NULL) registered_paths = new_path;
     else {
      path_scan = registered_paths;
      while (path_scan -> next != NULL) path_scan = path_scan -> next;
      path_scan -> next = new_path;
     }
     arg_count++;
    }
   }
   else if (!strcmp (argv [arg_count], "-r")) {             /* Range request */
    if (al_range_set) {
     fprintf (stderr,
              "Range for alignment loading already set to [%ld-%ld].\n",
              al_start, al_stop);
     exit (1);
    }
    else if (arg_count > argc - 3) {
     fprintf (stderr, "Incomplete range specification.\n"); exit (1);
    }
    else if ((!khum_is_long (argv [arg_count + 1], &al_start)) ||
             (!khum_is_long (argv [arg_count + 2], &al_stop))) {
     fprintf (stderr, "Illegal specification for the range.\n"); exit (1);
    }
    else if ((al_start < 0) || (al_stop < 0)) {
     fprintf (stderr, "Negative values not permitted for the range.\n");
     exit (1);
    }
    else if (al_start > al_stop) {
     fprintf (stderr, "Range start (%ld) greater than range end (%ld).\n",
                      al_start, al_stop);
     exit (1);
    }
    else { al_range_set = TRUE; arg_count += 3; }
   }
   else if (!strcmp (argv [arg_count], "-p")) {      /* Pivot sequence spec. */
    arg_count++;
    if (range_row_set) {
     fprintf (stderr, "Pivot sequence for alignment range already set.\n");
     exit (1);
    }
    else {
     range_row_set = TRUE; range_row = NTL0_strsave (argv [arg_count]);
     arg_count++;
    }
   }
   else if (!strcmp (argv [arg_count], "-c")) {        /* Sequence file name */
    arg_count++;
    if (sequence_file != NULL) {
     fprintf (stderr,
              "Name of file containing the center sequence already set.\n");
     exit (1);
    }
    else { sequence_file = NTL0_strsave (argv [arg_count]); arg_count++; }
   }
   else if (!strcmp (argv [arg_count], "-i")) {   /* Range in center request */
    if (seq_range_set) {
     fprintf (stderr, "Range for center sequence already set to [%ld-%ld].\n",
              seq_start, seq_stop);
     exit (1);
    }
    else if (arg_count > argc - 3) {
     fprintf (stderr, "Incomplete sequence range specification.\n"); exit (1);
    }
    else if ((!khum_is_long (argv [arg_count + 1], &seq_start)) ||
             (!khum_is_long (argv [arg_count + 2], &seq_stop))) {
     fprintf (stderr, "Illegal specification for sequence range.\n"); exit (1);
    }
    else if ((seq_start < 0) || (seq_stop < 0)) {
     fprintf (stderr, "Negative values not permitted for sequence range.\n");
     exit (1);
    }
    else if (seq_start > seq_stop) {
     fprintf (stderr, "Range start (%ld) greater than range end (%ld).\n",
                      seq_start, seq_stop);
     exit (1);
    }
    else { seq_range_set = TRUE; arg_count += 3; }
   }
   else if (!strcmp (argv [arg_count], "-n")) {      /* Center sequence name */
    arg_count++;
    if (seq_name != NULL) {
     fprintf (stderr, "Center sequence in alignment already set.\n");
     exit (1);
    }
    else { seq_name = NTL0_strsave (argv [arg_count]); arg_count++; }
   }
   else if (!strcmp (argv [arg_count], "-o")) {    /* Center sequence number */
    if (seq_number_set) {
     fprintf (stderr,
              "Center sequence number in alignment already set to %d.\n",
              seq_number);
     exit (1);
    }
    else if (arg_count > argc - 2) {
     fprintf (stderr, "Incomplete sequence number specification.\n"); exit (1);
    }
    else if (!khum_is_int (argv [arg_count + 1], &seq_number)) {
     fprintf (stderr, "Illegal sequence number (%s).\n", argv [arg_count + 1]);
     exit (1);
    }
    else if (seq_number < 0) {
     fprintf (stderr, "Negative values not permitted for sequence number.\n");
     exit (1);
    }
    else { seq_number_set = TRUE; arg_count += 2; }
   }
   else if (!strcmp (argv [arg_count], "-k")) {  /* Maximal mismatch request */
    if (mismatches_set) {
     fprintf (stderr,
              "Maximal mismatch number per sequence already set to %d.\n",
              max_mismatches);
     exit (1);
    }
    else if (arg_count > argc - 2) {
     fprintf (stderr, "Incomplete mismatch specification.\n"); exit (1);
    }
    else if (!khum_is_int (argv [arg_count + 1], &max_mismatches)) {
     fprintf (stderr, "Illegal number of permitted mismatches (%s).\n",
              argv [arg_count + 1]);
     exit (1);
    }
    else if (max_mismatches < 0) {
     fprintf (stderr, "Negative values not permitted for mismatches.\n");
     exit (1);
    }
    else { mismatches_set = TRUE; arg_count += 2; }
   }
   else if (!strcmp (argv [arg_count], "-l")) {     /* Region length request */
    if (length_set) {
     fprintf (stderr,
              "Minimal length of conserved region already set to %d.\n",
              min_length);
     exit (1);
    }
    else if (arg_count > argc - 2) {
     fprintf (stderr, "Incomplete length specification.\n"); exit (1);
    }
    else if (!khum_is_int (argv [arg_count + 1], &min_length)) {
     fprintf (stderr, "Illegal minimal region length (%s).\n",
                      argv [arg_count + 1]);
     exit (1);
    }
    else if (min_length <= 0) {
     fprintf (stderr, "Positive length required for conserved regions.");
     exit (1);
    }
    else { length_set = TRUE; arg_count += 2; }
   }
   else if (!strcmp (argv [arg_count], "-a")) {  /* Active sequences request */
    if (active_set) {
     fprintf (stderr,
              "Minimal number of active sequences already set to %d.\n",
              min_active);
     exit (1);
    }
    else if (arg_count > argc - 2) {
     fprintf (stderr, "Incomplete active sequences specification.\n");
     exit (1);
    }
    else if (!khum_is_int (argv [arg_count + 1], &min_active)) {
     fprintf (stderr, "Illegal minimal number of active sequences (%s).\n",
                      argv [arg_count + 1]);
     exit (1);
    }
    else if (min_active <= 0) {
     fprintf (stderr, "Positive number of active sequences required.");
     exit (1);
    }
    else { active_set = TRUE; arg_count += 2; }
   }
   else if (!strcmp (argv [arg_count], "-g")) {    /* Search general regions */
    if (strictness_set) {
     fprintf (stderr, "Kind of requested regions already set.\n"); exit (1);
    }
    else { strictness = GENERAL; strictness_set = TRUE; arg_count++; }
   }
   else if (!strcmp (argv [arg_count], "-h")) { /* Search restricted regions */
    if (strictness_set) {
     fprintf (stderr, "Kind of requested regions already set.\n"); exit (1);
    }
    else {
     strictness = RESTRICTED; strictness_set = TRUE; arg_count++;
    }
   }
   else if (!strcmp (argv [arg_count], "-s")) {     /* Search strict regions */
    if (strictness_set) {
     fprintf (stderr, "Kind of requested regions already set.\n"); exit (1);
    }
    else { strictness = STRICT; strictness_set = TRUE; arg_count++; }
   }
   else {                                              /* Unknown "-" option */
    fprintf (stderr, "Illegal command line option (%s).\n", argv [arg_count]);
    fprintf (stderr, "usage: %s <alignment_file>\n", argv [0]);
    fprintf (stderr, "            [-I <directory_path>]*\n");
    fprintf (stderr, "            [-r <from> <to>]\n");
    fprintf (stderr, "            [-p <pivot_sequence>]\n");
    fprintf (stderr, "            [-c <center_sequence_file_name>]\n");
    fprintf (stderr, "            [-i <from> <to>]\n");
    fprintf (stderr, "            [-n <center_sequence_name>]\n");
    fprintf (stderr, "            [-o <center_sequence_number>]\n");
    fprintf (stderr, "            [-k <max_mismatches_per_sequence>]\n");
    fprintf (stderr, "            [-l <min_length_of_conserved_region]\n");
    fprintf (stderr, "            [-a <min_active_sequences>]\n");
    fprintf (stderr, "            [-g | -h | -s]\n");
    exit (1);
   }
  }
  /* Check whether all necessary data have been collected and consistent     */

  if (alignment_file == NULL) {
   fprintf (stderr, "Must have an alignment to process.\n"); exit (1);
  }
  else if ((al_range_set) && (range_row == NULL))
   range_row = DEFAULT_REFERENCE;
  else if ((range_row_set) && (!al_range_set)) {
   fprintf (stderr, "Must have range in requested '%s' in alignment.\n",
                    range_row);
   exit (1);
  }
  if (sequence_file != NULL) {
   if ((seq_name != NULL) || (seq_number_set)) {
    fprintf (stderr, "Can't have center from both file and alignment.\n");
    exit (1);
   }
  }
  else if (seq_range_set) {
   fprintf (stderr, "Can't have sequence range without file specification.\n");
   exit (1);
  }
  else {
   if (seq_name == NULL) seq_name = DEFAULT_REFERENCE;
   if (!seq_number_set) seq_number = DEFAULT_RNUM;
  }
  /* Proceed to collect (load) the data to be processed by the algorithm     */

  if (khum_load_data (registered_paths, alignment_file, range_row, al_start,
                      al_stop, sequence_file, seq_range_set, seq_start,
                      seq_stop, seq_name, seq_number, &center, &range_length,
                      &alignment, &track) < 0)
   exit (1);

  /* Apply the known center algorithm to the assembled data - output results */

  khum_output_header (alignment_file, range_row, al_range_set, al_start,
                      al_stop, sequence_file, seq_range_set, seq_start,
                      seq_stop, seq_name, seq_number, max_mismatches,
                      min_length, min_active, strictness);
  if (khum_process_data (range_length, center, alignment, max_mismatches,
                         min_length, min_active, strictness, track) < 0)
   exit (1);

  exit (0);
 }
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: khum_load_data                                                 */
/*                                                                           */
/* Procedure collects alignment and center sequence data with respect to     */
/*   provided settings and returns them within its reference parameters;     */
/*   returns 0 if everything is OK, negative status otherwise                */

int khum_load_data (strlist_ptr paths, char *alignment_file, char *range_row,
                    long int al_start, long int al_stop, char *sequence_file,
                    bool seq_range_set, long int seq_start, long int seq_stop,
                    char *seq_name, int seq_number, char **center,
                    long int *range_length, restricted_ptr *alignment,
                    long int *track)
{
 strlist_ptr current_path; char *new_path, *full_name; int ch, index, count;
 header_ptr file_info; align_ptr packed; unpacked_ptr expanded;
 line_ptr line_al, last_line; FILE *input_file; errind err_stat;
 
 /* Load the alignment into an internal structure first                      */

 if ((paths == NULL) ||
     (alignment_file [0] == '/') || (alignment_file [0] == '~')) {
  if ((err_stat = NTL2_Load_AlignFile (alignment_file, &file_info)) != NULL) {
   fprintf (stderr, "Can't open alignment file '%s' (%s).\n",
                    alignment_file, err_stat -> message);
   return -1;
  }
  else if (file_info == NULL) {        /* File found, but nothing retrieved */
   fprintf (stderr, "No alignment to process.\n"); return -2;
  }
 }
 else {                     /* Directory path(s) provided, try them in order */
  current_path = paths; file_info = NULL;
  while ((file_info == NULL) && (current_path != NULL)) {
   new_path = NTL0_ckalloc ((strlen (current_path -> string) +
                             strlen (alignment_file) + 2) * sizeof (char));
   strcpy (new_path, current_path -> string);
   if (alignment_file [0] != '/') strcat (new_path, "/");
   strcat (new_path, alignment_file);
   
   if ((err_stat = NTL2_Load_AlignFile (new_path, &file_info)) != NULL) {
    file_info = NULL; current_path = current_path -> next;
   }
   free (new_path);
  }
  if (err_stat != NULL) {                      /* File not found on any path */
   fprintf (stderr, "Can't open alignment file '%s' (%s).\n",
                    alignment_file, err_stat -> message);
   return -3;
  }
  else if (file_info == NULL) {         /* File found, but nothing retrieved */
   fprintf (stderr, "No alignment to process.\n"); return -4;
  }
 }
 /* Now proceed to partially unpack the information from the loaded file     */
 
 if ((err_stat = NTL1_Load_Alignment (1, file_info, &packed)) != NULL) {
  fprintf (stderr, "Can't extract the alignment from the file (%s).\n",
                   err_stat -> message);
  return -5;
 }
 else {
  if ((err_stat = NTL2_Unpack_Alignment (1, file_info, packed, paths,
                                         &expanded)) != NULL) {
   fprintf (stderr, "Can't expand the alignment (%s).\n", err_stat -> message);
   return -6;
  }
  else {   /* Expansion of the full alignment done OK, check for restriction */
  
   if (range_row != NULL) {
    if ((err_stat = NTL1_Cut_Alignment (file_info, expanded, range_row,
                                        al_start, al_stop)) != NULL) {
     if (err_stat -> kind == WARNING) {
      fprintf (stderr, "WARNING:  %s.\n", err_stat -> message);
     }
     else {
      fprintf (stderr, "Can't get the range from the alignment (%s).\n",
                       err_stat -> message);
      return -7;
     }
    }
   }
  }
 }
 /* If the center sequence is to be loaded from a file, load it & set length */

 if (sequence_file != NULL) {
  *center = NULL;
  if (seq_range_set) {                    /* Only part of the file requested */
   if ((paths == NULL) ||
       (sequence_file [0] == '/') || (sequence_file [0] == '~')) {
    if ((err_stat = NTL2_Load_Sequence (sequence_file, seq_start, seq_stop,
                                        center)) != NULL) {
     fprintf (stderr, "Can't open sequence file '%s' (%s).\n",
                      sequence_file, err_stat -> message);
     return -11;
    }
    else if (*center == NULL) {         /* File found, but nothing retrieved */
     fprintf (stderr, "No center sequence.\n"); return -12;
    }
   }
   else {                   /* Directory path(s) provided, try them in order */
    current_path = paths;
    while ((*center == NULL) && (current_path != NULL)) {
     new_path = NTL0_ckalloc ((strlen (current_path -> string) +
                               strlen (sequence_file) + 2) * sizeof (char));
     strcpy (new_path, current_path -> string);
     if (sequence_file [0] != '/') strcat (new_path, "/");
     strcat (new_path, sequence_file);
     
     if ((err_stat = NTL2_Load_Sequence (new_path, seq_start, seq_stop,
                                         center)) != NULL) {
      *center = NULL; current_path = current_path -> next;
     }
     free (new_path);
    }
    if (err_stat != NULL) {                    /* File not found on any path */
     fprintf (stderr, "Can't open sequence file '%s' (%s).\n",
                      sequence_file, err_stat -> message);
     return -13;
    }
    else if (*center == NULL) {         /* File found, but nothing retrieved */
     fprintf (stderr, "Missing center sequence.\n"); return -14;
    }
   }
   *range_length = seq_stop - seq_start + 1;
   if (*range_length != strlen (*center)) {
    fprintf (stderr,
             "Incorrectly calculated length of the center sequence.\n");
    return -15;
   }
   *track = seq_start;
  }
  else {                                    /* No range set for the sequence */
   if ((paths == NULL) ||
       (sequence_file [0] == '/') || (sequence_file [0] == '~')) {
    if ((input_file = fopen (sequence_file, "r")) == NULL) {
     fprintf (stderr, "Can't open sequence file '%s'.\n", sequence_file);
     return -16;
    }
    else full_name = NTL0_strsave (sequence_file);
   }
   else {                   /* Directory path(s) provided, try them in order */
    current_path = paths; input_file = NULL;
    while ((input_file == NULL) && (current_path != NULL)) {
     new_path = NTL0_ckalloc ((strlen (current_path -> string) +
                               strlen (sequence_file) + 2) * sizeof (char));
     strcpy (new_path, current_path -> string);
     if (sequence_file [0] != '/') strcat (new_path, "/");
     strcat (new_path, sequence_file);

     if ((input_file = fopen (new_path, "r")) == NULL) {
      current_path = current_path -> next;
     }
     else full_name = NTL0_strsave (new_path);
     
     free (new_path);
    }
    if (input_file == NULL) {                  /* File not found on any path */
     fprintf (stderr, "Can't open sequence file '%s'.\n", sequence_file);
     return -17;
    }
   }
   *range_length = 0; while ((ch = fgetc (input_file)) != EOF) {
    if (ch != (int) '\n') (*range_length)++;
   }
   *center = (char *) NTL0_ckalloc (((*range_length) + 1) * sizeof (char));
   rewind (input_file);
   index = 0; while ((ch = fgetc (input_file)) != EOF) {
    if (ch != (int) '\n') (*center) [index++] = (char) ch;
   }
   if (index != *range_length) {
    fprintf (stderr, "Error calculating the length of center sequence.\n");
    return -18;
   }
   (*center) [index] = '\0';
   *track = 1;
  }
  if (*range_length != expanded -> size) {
   fprintf (stderr,
        "Length of center (%ld) does not match length of alignment (%ld).\n",
            *range_length, expanded -> size);
   return -19;
  }
 }
 else {         /* An alignment row is to be selected as the center sequence */

  if (seq_name == NULL) {
   fprintf (stderr, "No sequence name provided for the proposed center.\n");
   return -20;
  }
  if ((seq_number < 1) || (seq_number > expanded -> dimension)) {
   fprintf (stderr, "Illegal center row number (%d) for the alignment.\n",
                    seq_number);
   return -21;
  }
  if (strcmp (((file_info -> sequences) [seq_number - 1]).seq_name,
              seq_name)) {
   fprintf (stderr, "Sequence '%s' is not at row %d in the alignment.\n",
                    seq_name, seq_number);
   return -22;
  }
  *center = NTL0_strsave ((expanded -> texts) [seq_number - 1]);
  *range_length = expanded -> size;
  *track = (expanded -> begin) [seq_number - 1];
 }

 /* Now copy expanded alignment into more compact structure for processing   */
 
 *alignment = (restricted_ptr) NTL0_ckalloc (sizeof (Restricted_Struct));
 
 count = 0;
 for (index = 0; index < expanded -> dimension; index++) {
  if ((expanded -> segment_code) [index] == VALID_SEGMENT) count++;
 }
 (*alignment) -> dimension = count;
 (*alignment) -> size = expanded -> size;
 
 (*alignment) -> texts = (char **) NTL0_ckalloc (count * sizeof (char *));
 (*alignment) -> begin = (long int *)
                         NTL0_ckalloc (count * sizeof (long int *));
 (*alignment) -> end = (long int *)
                       NTL0_ckalloc (count * sizeof (long int *));
 (*alignment) -> starts = (long int *)
                          NTL0_ckalloc (count * sizeof (long int *));
 (*alignment) -> stops = (long int *)
                         NTL0_ckalloc (count * sizeof (long int *));

 count = 0;
 for (index = 0; index < expanded -> dimension; index++) {
  if ((expanded -> segment_code) [index] == VALID_SEGMENT) {
   ((*alignment) -> texts) [count] = (expanded -> texts) [index];
   ((*alignment) -> begin) [count] = (expanded -> begin) [index];
   ((*alignment) -> end) [count] = (expanded -> end) [index];
   ((*alignment) -> starts) [count] = (expanded -> starts) [index];
   ((*alignment) -> stops) [count] = (expanded -> stops) [index];
   count++;
  }
 }
 /* Destroy the "file_info" structure, since its contents are not needed     */

 /* ### To be defined ###################################################### */
    
 /* Destroy "partially unpacked" structure used in the expansion - done      */
    
 if (packed -> begin != NULL) free (packed -> begin);
 if (packed -> end != NULL) free (packed -> end);
    
 if (packed -> lines != NULL) {
  for (index = 0; index < expanded -> dimension; index++) {
   line_al = (packed -> lines) [index]; while (line_al != NULL) {
    last_line = line_al; line_al = line_al -> next; free (last_line);
   }
  }
 }
 free (packed);

 /* Destroy the expanded alignment info now when the restricted copy made    */
    
 /* ### To be defined ###################################################### */

 return 0;
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: khum_output_header                                             */
/*                                                                           */
/* Procedure writes a number of comment lines in the output - output type    */
/*   identification and data about the settings under which it was           */
/*   generated, alignment file name and requested range, center sequence     */
/*   origin, maximal number of mismatches per sequence permitted, minimal    */
/*   number of active sequences required and the kind of regions requested   */
/*   (general, restricted or strict)                                         */

void khum_output_header (char *alignment_file, char *range_row,
                         bool al_range_set, long int al_start,
                         long int al_stop, char *sequence_file,
                         bool seq_range_set, long int seq_start,
                         long int seq_stop, char *seq_name,
                         int seq_number, int max_mismatches,
                         int min_length, int min_active, int strictness)
{
 printf ("#:plain:\n\n");
 printf ("# Output of the \"known center\" region location utility.\n");
 printf ("# Alignment file:  '%s'\n", alignment_file);
 if (al_range_set)
  printf ("#   restricted to the range [%ld,%ld] in \"%s\".\n",
          al_start, al_stop, range_row);
 if (sequence_file != NULL) {
  printf ("# Center sequence loaded from file:  '%s'\n", sequence_file);
  if (seq_range_set)
   printf ("#   restricted to the range:  [%ld,%ld]\n", seq_start, seq_stop);
 }
 else {
  printf (
   "# Alignment row %d, sequence \"%s\", used as template for the center.\n",
          seq_number, seq_name);
 }
 printf (
     "# Maximal number of mismatches (per row) with center permitted:  %d\n",
         max_mismatches);
 printf ("# Minimal length of the region required:  %d\n", min_length);
 printf ("# Minimal number of active sequences required in regions:  %d\n",
         min_active);
 printf ("# Kind of regions requested:  ");
 if (strictness == GENERAL)
  printf ("general (no special treatments of gaps).\n\n\n");
 else if (strictness == RESTRICTED)
  printf ("restricted (no gaps permitted in center sequence).\n\n\n");
 else
  printf ("strict (no gaps permitted within regions).\n\n\n");
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: khum_process_data                                              */
/*                                                                           */
/* Procedure implements the "known center" algorithm for locating the        */
/*   conserved regions in an alignment; returns 0 if the location of regions */
/*   was successful, negative status otherwise                               */

int khum_process_data (long int length, char *center,
                       restricted_ptr alignment, int max_mismatches,
                       int min_length, int min_active, int strictness,
                       long int start_track)
{
 int index, violates, *mismatches; bool qualifies;
 long int start_scan, stop_scan, end_track, counter;
 
 counter = 0;
 if (alignment -> dimension < min_active) {
  fprintf (stderr, "WARNING: Not enough active sequences within the range.\n");
 }
 else {

  /* Initialize the starting and ending positions for the first region,      */
  /*   counter of violating sequences and the mismatch vector                */
  
  start_scan = 0; stop_scan = -1; violates = 0; end_track = start_track;
  mismatches = (int *) NTL0_ckalloc ((alignment -> dimension) * sizeof (int));
  for (index = 0; index < alignment -> dimension; index++)
   mismatches [index] = 0;

  /* Loop for all regions, until the alignment end is reached                */

  while (stop_scan < length) {

   /* Proceed to locate the next region, from the defined start              */

   qualifies = TRUE;
   while ((stop_scan < length) && (violates == 0) && (qualifies)) {
    if ((stop_scan >= 0) && (center [stop_scan] != GAP_SYMBOL)) end_track++;
    stop_scan++;
    if (stop_scan < length) {
     qualifies = khum_process_column (stop_scan, center, alignment,
                                      max_mismatches, min_active, strictness,
                                      mismatches, &violates);
    }
   }
   /* If the located region is sufficiently long, report it                  */

   if (stop_scan - start_scan >= min_length) {
    khum_report_region (start_scan, stop_scan, center, start_track, end_track);
    counter++;
   }

   /* Adjust the starting position to the start of the new region, skipping  */
   /*   these that must be contained in one already found                    */

   if (!qualifies) {   /* Current stop position can not be inside any region */

    /* Reset the controls to the next possible start of a region             */
    
    start_scan = stop_scan + 1;
    if (center [stop_scan] != GAP_SYMBOL) start_track = end_track + 1;
    else start_track = end_track;
    violates = 0;
    for (index = 0; index < alignment -> dimension; index++)
     mismatches [index] = 0;
   }
   else {   /* Do the normal adjusting, position to start of next (overlap?) */

    while (violates > 0) {
     for (index = 0; index < alignment -> dimension; index++) {
      if (((alignment -> starts) [index] <= start_scan) &&
          ((alignment -> stops) [index] >= start_scan)) {
       if (!khum_subsumes (center [start_scan],
                           (alignment -> texts) [index] [start_scan])) {
        (mismatches [index])--;
        if (mismatches [index] == max_mismatches) violates--;
       }
      }
     }
     if (center [start_scan] != GAP_SYMBOL) start_track++;
     start_scan++;
    }
   }
  }
  free (mismatches);
 }
 printf ("\n# Total number of conforming regions determined:  %ld\n", counter);
 return 0;
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: khum_process_column                                            */
/*                                                                           */
/* Procedure processes the data from the current alignment column, updates   */
/*   the mismatch vector, examines whether the maximal number of mismatches  */
/*   per column has been exceeded by the column, and, before that, whether   */
/*   the column satisfies the criterions for inclusion in any conserved      */
/*   region; returns TRUE if it does, FALSE if it does not                   */

bool khum_process_column (long int position, char *center,
                          restricted_ptr alignment, int max_mismatches,
                          int min_active, int strictness, int *mismatches,
                          int *violates)
{
 int total_sequences, index;
 
 if ((center [position] == GAP_SYMBOL) &&
     ((strictness == RESTRICTED) || (strictness == STRICT))) return FALSE;
 else {
  total_sequences = 0;
  for (index = 0; index < alignment -> dimension; index++) {
   if (((alignment -> starts) [index] <= position) &&
       ((alignment -> stops) [index] >= position)) {
    total_sequences++;
    if ((strictness == STRICT) &&
        ((alignment -> texts) [index] [position] == GAP_SYMBOL)) return FALSE;
    if (!khum_subsumes (center [position],
                        (alignment -> texts) [index] [position])) {
     (mismatches [index])++;
     if (mismatches [index] > max_mismatches) (*violates)++;
    }
   }
  }
  if (total_sequences < min_active) return FALSE;
  else return TRUE;
 }
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: khum_subsumes                                                  */
/*                                                                           */

bool khum_subsumes (char set, char subset)
{
 if (set == subset) return TRUE;
 else if ((set == GAP_SYMBOL) ||
          (set == 'A') || (set == 'C') || (set == 'G') || (set == 'T'))
  return FALSE;
 else if (subset == GAP_SYMBOL) return FALSE;
 else if ((set == 'N') || (set == 'X')) return TRUE;
 else if ((subset == 'N') || (subset == 'X')) return FALSE;
 else if ((subset == 'A') && ((set == 'W') ||
                              (set == 'R') ||
                              (set == 'M') ||
                              (set == 'D') ||
                              (set == 'H') ||
                              (set == 'V'))) return TRUE;
 else if ((subset == 'C') && ((set == 'Y') ||
                              (set == 'S') ||
                              (set == 'M') ||
                              (set == 'B') ||
                              (set == 'H') ||
                              (set == 'V'))) return TRUE;
 else if ((subset == 'G') && ((set == 'S') ||
                              (set == 'R') ||
                              (set == 'K') ||
                              (set == 'B') ||
                              (set == 'D') ||
                              (set == 'V'))) return TRUE;
 else if ((subset == 'T') && ((set == 'W') ||
                              (set == 'Y') ||
                              (set == 'K') ||
                              (set == 'B') ||
                              (set == 'D') ||
                              (set == 'H'))) return TRUE;
 else if ((set == 'B') && ((subset == 'K') ||
                           (subset == 'S') ||
                           (subset == 'Y'))) return TRUE;
 else if ((set == 'D') && ((subset == 'K') ||
                           (subset == 'R') ||
                           (subset == 'W'))) return TRUE;
 else if ((set == 'H') && ((subset == 'M') ||
                           (subset == 'W') ||
                           (subset == 'Y'))) return TRUE;
 else if ((set == 'V') && ((subset == 'M') ||
                           (subset == 'R') ||
                           (subset == 'S'))) return TRUE;
 else if ((set != 'W') && (set != 'Y') && (set != 'S') && (set != 'R') &&
          (set != 'M') && (set != 'K') && (set != 'B') && (set != 'D') &&
          (set != 'H') && (set != 'V')) {
  fprintf (stderr,
           "WARNING: Unrecognized character ('%c') in center sequence.\n", set);
  return FALSE;
 }
 else if ((subset != 'A') && (subset != 'C') && (subset != 'G') &&
          (subset != 'T') && (subset != 'W') && (subset != 'Y') &&
          (subset != 'S') && (subset != 'R') && (subset != 'M') &&
          (subset != 'K') && (subset != 'B') && (subset != 'D') &&
          (subset != 'H') && (subset != 'V')) {
  fprintf (stderr, "WARNING: Unrecognized character ('%c') in alignment.\n",
                   subset);
  return FALSE;
 }
 else return FALSE;
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: khum_report_region                                             */
/*                                                                           */

void khum_report_region (long int start, long int stop, char *center,
                         long int start_track, long int end_track)
{
 long int index;
 
 printf ("%ld ", start_track);
 if (stop > 0) {
  if (end_track > start_track) printf ("%ld ", end_track - 1);
  else if (start_track == end_track) printf ("%ld ", end_track);
  else {
   fprintf (stderr, "PANIC: start track (%ld) greater than end track (%ld).\n",
                    start_track, end_track);
   exit (2);
  }
 }
 else { fprintf (stderr, "PANIC: Zero length region reporting.\n"); exit (2); }

 /* Print the part of the center corresponding to this region                */

 for (index = start; index < stop; index++) printf ("%c", center [index]);
 printf ("\n");
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: khum_is_long                                                   */
/*                                                                           */

bool khum_is_long (char *string, long int *value)
{
 char *scan, *num_start, saver; int count; long int long_val; bool negative;
 
 scan = string; *value = 0;
 while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++;
 if (*scan == '\0') return FALSE;
 else {
  if (*scan == '-') {
   negative = TRUE; scan++;
   while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++;
  }
  else negative = FALSE;

  if (*scan == '\0') return FALSE;
  else if (*scan == '0') {
   if (negative) return FALSE;
   else {
    scan++;
    while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++;
    *value = 0;
    if (*scan == '\0') return TRUE; else return FALSE;
   }
  }
  else {
   count = 0; num_start = scan; while ((*scan >= '0') && (*scan <= '9')) {
    count++; scan++;
   }
   if ((count < 1) || (count > MAX_LONG_LEN)) return FALSE;
   else {
    if ((*scan != '\0') && (*scan != ' ') &&
        (*scan != '\t') && (*scan != '\n')) return FALSE;
    else {
     saver = *scan; *scan = '\0'; long_val = atol (num_start);
     *scan = saver;
     
     while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++;
     if (*scan != '\0') return FALSE;
     else {
      if (negative) *value = -long_val; else *value = long_val; return TRUE;
     }
    }
   }
  }
 }
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: khum_is_int                                                    */
/*                                                                           */

bool khum_is_int (char *string, int *value)
{
 long int long_val;

 if (!khum_is_long (string, &long_val)) { *value = 0; return FALSE; }
 else if ((long_val < MININT_VALUE) || (long_val > MAXINT_VALUE)) {
  *value = 0; return FALSE;
 }
 else { *value = (int) long_val; return TRUE; }
}


