static char const rcsid [] = "$Id: fulrun.c,v 1.1 1998/04/21 10:36:23 stojanov Exp $"; /*****************************************************************************/ /* */ /* Program: fulrun (locating maximal full intervals in a given array) */ /* */ /* Author: Nikola Stojanovic */ /* */ /* Revision: 03 MAR 98 Version 1.0 */ /* */ /*****************************************************************************/ #include #include #include #include #include #include "ntl.h" /*****************************************************************************/ /* */ /* Definitions section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* Definitions of the constants of the program unit */ /*****************************************************************************/ #define UNDEFINED 31781 #define MAX_LONG_LEN 9 #define MININT_VALUE -32000 #define MAXINT_VALUE 32000 #define MAX_WHOLE 9 #define MAX_FRAC 20 #define DEFAULT_MINIMAL_LENGTH 6 #define BUFFER_SIZE 300 /*****************************************************************************/ /* Prototypes of locally used functions of the program unit */ /*****************************************************************************/ int fulrun_load_scores (char *scores_file, bool range_set, long int *start, long int stop, float **scores, long int *size); void fulrun_output_header (char *scores_file, bool range_set, long int start, long int stop, int length, bool infinity_set, float infinity); int fulrun_process_data (float *scores, long int size, long int start, int length, bool infinity_set, float infinity); void fulrun_runs (float *scores, long int range_len, bool infinity_set, float infinity, long int *intervals, long int **low_ends, long int **high_ends); bool fulrun_is_long (char *string, long int *value); bool fulrun_is_int (char *string, int *value); bool fulrun_is_float (char *string, float *value); /*****************************************************************************/ /* */ /* Code section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* */ /* Procedure: main */ /* */ /* "main" procedure of the program. Receives and analyses the command line */ /* parameters, sets the control variables of the program, checks their */ /* consistency and passes control to internal procedures which actually */ /* process the input data; returns 0 if everything is OK, non-zero status */ /* in case of any errors */ int main (int argc, char **argv) { bool range_set, length_set, infinity_set; char *scores_file; int length, arg_count; long int start, stop, size; float infinity, *scores; /* Initialize the settings and prepare for the processing of the arguments */ scores_file = NULL; range_set = FALSE; start = stop = UNDEFINED; length_set = FALSE; length = DEFAULT_MINIMAL_LENGTH; infinity_set = FALSE; infinity = (float) UNDEFINED; if (argc < 2) { fprintf (stderr, "usage: %s \n", argv [0]); fprintf (stderr, " [-r ]\n"); fprintf (stderr, " [-l ]\n"); exit (1); } else { /* Some parameters provided - process them */ /* Proceed to extract the command line parameters and create the settings */ arg_count = 1; while (arg_count < argc) { if (argv [arg_count] [0] != '-') { /* Not an "-" option - file name */ if (scores_file != NULL) { fprintf (stderr, "Repeated scores file name ('%s').\n", argv [arg_count]); exit (1); } else { scores_file = NTL0_strsave (argv [arg_count]); arg_count++; } } else if (!strcmp (argv [arg_count], "-r")) { /* Range request */ if (range_set) { fprintf (stderr, "Range for score array loading already set to [%ld-%ld].\n", start, stop); exit (1); } else if (arg_count > argc - 3) { fprintf (stderr, "Incomplete range specification.\n"); exit (1); } else if ((!fulrun_is_long (argv [arg_count + 1], &start)) || (!fulrun_is_long (argv [arg_count + 2], &stop))) { fprintf (stderr, "Illegal specification for the range.\n"); exit (1); } else if ((start <= 0) || (stop <= 0)) { fprintf (stderr, "Positive values required for the range.\n"); exit (1); } else if (start > stop) { fprintf (stderr, "Range start (%ld) greater than range end (%ld).\n", start, stop); exit (1); } else { range_set = TRUE; arg_count += 3; } } else if (!strcmp (argv [arg_count], "-l")) { /* Region length request */ if (length_set) { fprintf (stderr, "Minimal length of full intervals already set to %d.\n", length); exit (1); } else if (arg_count > argc - 2) { fprintf (stderr, "Incomplete length specification.\n"); exit (1); } else if (!fulrun_is_int (argv [arg_count + 1], &length)) { fprintf (stderr, "Illegal minimal interval length (%s).\n", argv [arg_count + 1]); exit (1); } else if (length <= 0) { fprintf (stderr, "Positive length required for full intervals."); exit (1); } else { length_set = TRUE; arg_count += 2; } } else if (!strcmp (argv [arg_count], "-i")) { /* Infinity setting */ if (infinity_set) { fprintf (stderr, "Duplicate infinity value request.\n"); exit (1); } else if (arg_count > argc - 2) { fprintf (stderr, "Incomplete infinity value specification.\n"); exit (1); } else if (!fulrun_is_float (argv [arg_count + 1], &infinity)) { fprintf (stderr, "Illegal floating point value (%s).\n", argv [arg_count + 1]); exit (1); } else if (infinity > 0) { fprintf (stderr, "Positive value set for minus infinity (%9.3f).\n", infinity); exit (1); } else { infinity_set = TRUE; arg_count += 2; } } else { /* Unknown "-" option */ fprintf (stderr, "Illegal command line option (%s).\n", argv [arg_count]); fprintf (stderr, "usage: %s \n", argv [0]); fprintf (stderr, " [-r ]\n"); fprintf (stderr, " [-l ]\n"); exit (1); } } /* Check whether all necessary data have been collected and consistent */ if (scores_file == NULL) { fprintf (stderr, "Must have a scores array to process.\n"); exit (1); } /* Load the scores array, display settings info and determine intervals */ if (fulrun_load_scores (scores_file, range_set, &start, stop, &scores, &size) < 0) exit (1); fulrun_output_header (scores_file, range_set, start, stop, length, infinity_set, infinity); if (fulrun_process_data (scores, size, start, length, infinity_set, infinity) < 0) exit (1); exit (0); } } /*****************************************************************************/ /* */ /* Procedure: fulrun_load_scores */ /* */ /* Procedure collects the array of scores from the data retrieved from the */ /* given file; returns 0 if everything is OK, negative status otherwise */ int fulrun_load_scores (char *scores_file, bool range_set, long int *start, long int stop, float **scores, long int *size) { FILE *in_file; char *in_buff, *stat, *scan, *str, term; long int scores_size, index; float current_score; if ((in_file = fopen (scores_file, "r")) == NULL) { fprintf (stderr, "Can't open scores file '%s'.\n", scores_file); return -1; } /* File found - allocate space for loading the scores, initialize control */ in_buff = (char *) NTL0_ckalloc (BUFFER_SIZE * sizeof (char)); *scores = (float *) NTL0_ckalloc (BUFFER_SIZE * sizeof (float)); scores_size = BUFFER_SIZE; *size = 0; /* Scan the file line-by-line and get the score amount, if any in the line */ while ((stat = fgets (in_buff, (int) (BUFFER_SIZE - 1), in_file)) != NULL) { scan = in_buff; /* Remove all "white space" characters from the beginning of line */ while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++; if ((*scan != '#') && (*scan != '\0')) { /* Not a comment or empty line */ str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; /* Move to end of num. */ term = *scan; *scan = '\0'; if (!fulrun_is_float (str, ¤t_score)) { fprintf (stderr, "Illegal score in file %s (%s).\n", scores_file, str); return -2; } /* If the current acceptance buffer is full, extend it to accept new val. */ if (*size == scores_size) { *scores = (float *) NTL0_ckrealloc ((char *) (*scores), 2 * scores_size * sizeof (float)); scores_size *= 2; } (*scores) [*size] = current_score; (*size)++; *scan = term; /* After the score is recorded, check legality of the rest of the line */ while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++; if ((*scan != '#') && (*scan != '\0')) { fprintf (stderr, "Illegal continuation of a line containing score (%c).\n", *scan); return -3; } } } fclose (in_file); free (in_buff); /* Not needed any more */ /* Reallocate the scores array, to release extra space taken while loading */ *scores = (float *) NTL0_ckrealloc ((char *) (*scores), (*size) * sizeof (float)); if (range_set) { /* A specific range in the array has been requested */ if ((*start > *size) || (stop < 1)) { fprintf (stderr, "Requested range [%ld,%ld] out of loaded scores range [%ld,%ld].\n", *start, stop, (long int) 1, *size); return -4; } else if ((*start < 1) && (stop > *size)) { /* Completely enclosing array */ fprintf (stderr, "WARNING: Requested range [%ld,%ld] truncated to [%ld,%ld] - ", *start, stop, (long int) 1, *size); fprintf (stderr, "entire scores array in file %s.\n", scores_file); *start = 1; } else if (*start < 1) { /* Overlapping start of the array */ fprintf (stderr, "WARNING: Requested range [%ld,%ld] truncated to [%ld,%ld].\n", *start, stop, (long int) 1, *size); *start = 1; *size = stop; *scores = (float *) NTL0_ckrealloc ((char *) (*scores), (*size) * sizeof (float)); } else if (stop > *size) { /* Overlapping the end of the array */ fprintf (stderr, "WARNING: Requested range [%ld,%ld] truncated to [%ld,%ld].\n", *start, stop, *start, *size); for (index = 0; index < (*size) - *start + 1; index++) { (*scores) [index] = (*scores) [(*start) + index - 1]; } *size = (*size) - *start + 1; *scores = (float *) NTL0_ckrealloc ((char *) (*scores), (*size) * sizeof (float)); } else if ((*start > 1) || (stop < *size)) { /* Proper interior of the array */ for (index = 0; index < stop - *start + 1; index++) { (*scores) [index] = (*scores) [(*start) + index - 1]; } *size = stop - *start + 1; *scores = (float *) NTL0_ckrealloc ((char *) (*scores), (*size) * sizeof (float)); } } else *start = 1; /* No specific range requested - count from the beginning */ return 0; } /*****************************************************************************/ /* */ /* Procedure: fulrun_output_header */ /* */ /* Procedure writes a number of comment lines in the output - output type */ /* identification and data about the settings under which it was */ /* generated, scores file name and requested range, minimal length of */ /* regions and the value assumed to be minus infinity, if any */ void fulrun_output_header (char *scores_file, bool range_set, long int start, long int stop, int length, bool infinity_set, float infinity) { printf ("#:plain:\n\n"); printf ("# Output of full intervals located in an array of scores.\n"); printf ("# Scores file: \"%s\"\n", scores_file); if (range_set) { printf ("# restricted to the range [%ld,%ld] within.\n", start, stop); } printf ("# Minimal length of the region required: %d\n", length); if (infinity_set) { printf ("\n# Value %9.3f assumed to be minus infinity.\n", infinity); } printf ("\n# Output format: region_start region_stop\n\n\n"); } /*****************************************************************************/ /* */ /* Procedure: fulrun_process_data */ /* */ /* Procedure determines all full runs within the established array of scores */ /* and reports them; returns 0 if OK, negative status otherwise */ int fulrun_process_data (float *scores, long int size, long int start, int length, bool infinity_set, float infinity) { long int lines_in_output, *low, *high, index, num_int, scanner, from, to; lines_in_output = 0; /* Initialize the counter of the reported regions */ /* Determine all full runs in the scores array first, then report them */ fulrun_runs (scores, size, infinity_set, infinity, &num_int, &low, &high); free (scores); /* Full intervals determined - scores not needed any more */ if (num_int > 0) { /* There are some full runs in this alignment */ /* Report all the full runs with their ancestral sequences */ scanner = 0; for (index = 0; index < num_int; index++) { /* Adjust the track value for reporting to the start position of region */ while (scanner < low [index]) { scanner++; start++; } /* Proceed to find the track for the end position of the region */ from = start; while (scanner < high [index]) { scanner++; start++; } to = start; /* If the region is sufficiently long, report it */ if (high [index] - low [index] + 1 >= length) { printf ("%ld %ld\n", from, to); lines_in_output++; } } } printf ("\n\n# Total number of regions detected: %ld\n", lines_in_output); return 0; } /*****************************************************************************/ /* */ /* Procedure: fulrun_runs */ /* */ /* Procedure determines all full intervals in the given array of scores, */ /* piecewise processing it (in contiguos finitelly scoring intervals) by */ /* invoking the full intervals algorithm (implemented as library routine) */ void fulrun_runs (float *scores, long int range_len, bool infinity_set, float infinity, long int *intervals, long int **low_ends, long int **high_ends) { long int *low, *high, range_start, range_stop, current, scan, items; bool all_positive; float *low_score, *high_score, *sigma; /* Allocate space required by the implementation of full runs algorithm */ low = (long int *) NTL0_ckalloc (range_len * sizeof (long int)); high = (long int *) NTL0_ckalloc (range_len * sizeof (long int)); low_score = (float *) NTL0_ckalloc (range_len * sizeof (float)); high_score = (float *) NTL0_ckalloc (range_len * sizeof (float)); /* Allocate space needed for the temporary scores and output delivery str. */ sigma = (float *) NTL0_ckalloc ((range_len + 1) * sizeof (float)); *low_ends = (long int *) NTL0_ckalloc (range_len * sizeof (long int)); *high_ends = (long int *) NTL0_ckalloc (range_len * sizeof (long int)); /* Initialize the internal control variables for patching the score array */ range_start = 0; range_stop = 0; *intervals = 0; while (range_stop < range_len) { /* Continue until last segment processed */ /* Skip all position scoring negative (either infinite or patch starting) */ while ((range_start < range_len) && (scores [range_start] < 0)) range_start++; range_stop = range_start; /* Initialize range stop to start of segment */ if (range_stop < range_len) { /* If there is a segment before array end */ all_positive = TRUE; while ((range_stop < range_len) && ((!infinity_set) || (scores [range_stop] > infinity + EPSILON))) { if (scores [range_stop] < 0.0) all_positive = FALSE; range_stop++; } if (all_positive) { /* The interval is automatically full */ (*low_ends) [*intervals] = range_start; (*high_ends) [*intervals] = range_stop - 1; (*intervals)++; } else { /* There must be one non-negative score, but not all of them */ current = range_stop - range_start; for (scan = 0; scan < current; scan++) sigma [scan] = scores [range_start + scan]; /* Determine all full intervals within the copied vector - library call */ NTL0_Full_Runs (sigma, current, &items, low, high, low_score, high_score); /* Add all discovered full runs to the list of these already established */ for (scan = 0; scan < items; scan++) { (*low_ends) [*intervals] = range_start + low [scan]; (*high_ends) [*intervals] = range_start + high [scan]; (*intervals)++; } } range_start = range_stop; /* Prepare to search for the next segment */ } } /* Release structures which are not needed after full runs are determined */ free (sigma); free (low); free (high); free (low_score); free (high_score); } /*****************************************************************************/ /* */ /* Procedure: fulrun_is_long */ /* */ /* Procedure determines whether the input string represents a long integer */ /* number, and places its value into a reference parameter; returns TRUE */ /* if a correct number has been extracted, FALSE otherwise */ bool fulrun_is_long (char *string, long int *value) { char *scan, *num_start, saver; int count; long int long_val; bool negative; scan = string; *value = 0; while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++; if (*scan == '\0') return FALSE; else { if (*scan == '-') { negative = TRUE; scan++; while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++; } else negative = FALSE; if (*scan == '\0') return FALSE; else if (*scan == '0') { if (negative) return FALSE; else { scan++; while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++; *value = 0; if (*scan == '\0') return TRUE; else return FALSE; } } else { count = 0; num_start = scan; while ((*scan >= '0') && (*scan <= '9')) { count++; scan++; } if ((count < 1) || (count > MAX_LONG_LEN)) return FALSE; else { if ((*scan != '\0') && (*scan != ' ') && (*scan != '\t') && (*scan != '\n')) return FALSE; else { saver = *scan; *scan = '\0'; long_val = atol (num_start); *scan = saver; while ((*scan == ' ') || (*scan == '\t') || (*scan == '\n')) scan++; if (*scan != '\0') return FALSE; else { if (negative) *value = -long_val; else *value = long_val; return TRUE; } } } } } } /*****************************************************************************/ /* */ /* Procedure: fulrun_is_int */ /* */ /* Procedure determines whether the input string represents an integer */ /* number, and places its value into a reference parameter; returns TRUE */ /* if a correct number has been extracted, FALSE otherwise */ bool fulrun_is_int (char *string, int *value) { long int long_val; if (!fulrun_is_long (string, &long_val)) { *value = 0; return FALSE; } else if ((long_val < MININT_VALUE) || (long_val > MAXINT_VALUE)) { *value = 0; return FALSE; } else { *value = (int) long_val; return TRUE; } } /*****************************************************************************/ /* */ /* Procedure: fulrun_is_float */ /* */ /* Procedure determines whether the input string represents a floating point */ /* (real) number, and places its value into a reference parameter; returns */ /* TRUE if a correct number has been extracted, FALSE otherwise */ bool fulrun_is_float (char *string, float *value) { char *s; bool negative, fraction; int count; long int whole; float frac; s = string; whole = 0; frac = 0.0; count = 0; /* Initialize */ /* Consult the first symbol in the string and set the appropriate controls */ if (*s == '-') { negative = TRUE; s++; if (*s == '\0') return FALSE; } else negative = FALSE; if (*s == '.') { fraction = TRUE; s++; if (*s == '\0') return FALSE; } else fraction = FALSE; if ((*s == '0') && (s [1] != '\0') && (s [1] != '.')) return FALSE; else { /* Proceed to collect the number in the part of the value */ while ((*s >= '0') && (*s <= '9')) { if (!fraction) { /* The collected part is the whole number */ whole = whole * 10 + (int) (*s - '0'); count++; s++; if (count > MAX_WHOLE) return FALSE; } else { /* The collected part is the fraction part of the number */ frac = frac * 10.0 + (float) (*s - '0'); count++; s++; if (count > MAX_FRAC) return FALSE; } } if ((*s != '.') && (*s != '\0')) return FALSE; else if ((*s == '.') && (fraction)) return FALSE; else if (*s == '.') { fraction = TRUE; count = 0; s++; while ((*s >= '0') && (*s <= '9')) { frac = frac * 10.0 + (float) (*s - '0'); count++; s++; if (count > MAX_FRAC) return FALSE; } if (*s != '\0') return FALSE; } *value = (float) whole; if (fraction) { while (count > 0) { frac = frac / 10.0; count--; } } *value += frac; if (negative) *value = -(*value); return TRUE; } }