/*****************************************************************************/
/*                                                                           */
/* UNIT: NTL2_Load_Plain (Level 2 library routine)                           */
/*                                                                           */
/* Author: Nikola Stojanovic                                                 */
/*                                                                           */
/* Revision: 13 MAR 97 Version 1.0                                           */
/*           06 APR 97 Version 1.1                                           */
/*                                                                           */
/* Function:                                                                 */
/*                                                                           */
/* Procedure loads the specified plain form landmarks file, and stores       */
/* its fields into an internal list for further processing.                  */
/* Procedure receives the name of the data file, fills the pointer           */
/* (reference parameter) to the beginning of the assembled list; returns the */
/* error structure, NULL if everything was OK                                */
/*                                                                           */
/*                                                                           */
/* Expected format of the plain form landmarks file:                         */
/*                                                                           */
/* - Any line can contain a comment, starting with '#' symbol - the rest of  */
/*   the line is then considered a comment, and ignored by the procedure.    */
/* - The first text in the file must be "#:plain:" to serve as an identifier */
/*   for this, or any other reading program.                                 */
/* - Any data line contains:                                                 */
/*                                                                           */
/*     s_1 s_2 [ txt m/s "s/c" "fac" std fid fnm ref rng ]                   */
/*                                                                           */
/*   s_1 = starting position of the region represented by the line           */
/*   s_2 = ending position of the region represented by the line             */
/*   txt = text of the sequence between the starting and ending positions    */
/*   m/s = text of the motif matched at the represented positions, or        */
/*         score of the match found between the starting & ending position   */
/*   s/c = site name of the motif matched at the represented positions, or   */
/*         consensus sequence for the motif matched at these positions       */
/*   fac = name of the factor discovered to bind at the represented site     */
/*   std = indication of the strand in which the motif match was discovered  */
/*   fid = indication of the database to which the matched factor belongs    */
/*   fnm = name of the file to which the matched factor belongs              */
/*   ref = reference code for the matched factor - codes journal citation    */
/*   rng = range of the data represented by the line (when compressed)       */
/* */ /* Fields enclosed in the brackets [] need not be present, but if present, */ /* all preceding ones must be present, too - when empty, word "null" serve */ /* to indicate that. */ /* */ /*****************************************************************************/ #include #include #include #include "ntl2.h" /*****************************************************************************/ /* */ /* Definitions section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* Definitions of local constants of the unit */ /*****************************************************************************/ /* Limit for the length of an error message that can be assembled */ #define ERR_MSGLIMIT 128 /* Information is read from database files by supplementary buffer of size: */ #define IN_LIMIT 200 /*****************************************************************************/ /* Prototypes of all locally used functions of this unit */ /*****************************************************************************/ errind NTL2_LP_Read_Plain_File (FILE **plain_file, plain_ptr *records); errind NTL2_LP_Assemble_Error (int severity, int code, char *comment, int description); /*****************************************************************************/ /* Definitions of global (static) variables of the unit */ /*****************************************************************************/ static char In_Buff [IN_LIMIT]; /* Global input line buffer, for convenience */ static char Error_Message [ERR_MSGLIMIT]; /* Temporary buffer, error passing */ /*****************************************************************************/ /* */ /* Code section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* */ /* Procedure: NTL2_Load_Plain */ /* */ /* 
Main (interface) procedure of this unit */ errind NTL2_Load_Plain (char *file_name, plain_ptr *records) { FILE *plain_file; errind erret; /* Initialize the list of loaded records to return to "empty" first */ *records = NULL; /* Check whether the file with given file name exists and open it for read */ if ((plain_file = fopen (file_name, "r")) == NULL) { /* Error condition */ sprintf (Error_Message, "File <%s> does not exist", file_name); return NTL2_LP_Assemble_Error (USER_ERROR, ERR_NO_FILE, Error_Message, 0); } else { /* There is a file with the specified name */ /* Now load the records from the file and close the file when done */ erret = NTL2_LP_Read_Plain_File (&plain_file, records); /* Get contents */ if ((erret != NULL) && (erret -> kind != WARNING)) { /* Error status ret. */ /* Release the assembled part of the records list - it won't be needed */ *records = NTL1_Destroy_Plain_List (*records); } fclose (plain_file); return erret; } } /*****************************************************************************/ /* */ /* Procedure: NTL2_LP_Read_Plain_File */ /* */ /* Top level procedure for reading the plain format landmarks file contents; */ /* loops for every line collecting the data (one record per line); */ /* receives the file pointer and (already initialized) list of records to */ /* be filled, returns the assembled error record, NULL if there were no */ /* errors in loading */ errind NTL2_LP_Read_Plain_File (FILE **plain_file, plain_ptr *records) { plain_ptr new_plain, last_ins, next_rec; long int start_pos, end_pos; char *scan, *start_num, save_scan, *start_str, *land_text; bool plain; char *process_buffer; int process_size, residue_size; char *matched, *site_text, *binding, strand, *database, *reference, *c_range; char *originating; /* Get the file identification first, to verify the data format */ plain = FALSE; while ((!plain) && (fgets (In_Buff, IN_LIMIT, *plain_file) != NULL) && (In_Buff [0] != '\0')) { /* Skip all the "white-space" characters 
at the beginning of the line */ scan = In_Buff; while ((*scan == ' ') || (*scan == '\t')) scan++; if (*scan != '\n') { /* If this is not just a blank line ... */ start_str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; save_scan = *scan; *scan = '\0'; if (strcmp (start_str, "#:plain:")) { return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Not a plain format landmarks file", 0); } else { plain = TRUE; *scan = save_scan; } } In_Buff [0] = '\0'; } last_ins = NULL; /* Last record inserted in the list, to speed up things */ process_buffer = (char *) NTL0_ckalloc (IN_LIMIT * sizeof (char)); process_size = IN_LIMIT; process_buffer [0] = '\0'; residue_size = 0; while ((fgets (In_Buff, IN_LIMIT, *plain_file) != NULL) && (In_Buff [0] != '\0')) { if (residue_size > 0) { if (residue_size + strlen (In_Buff) >= process_size) { process_size = 2 * process_size + IN_LIMIT; process_buffer = (char *) NTL0_ckrealloc (process_buffer, process_size * sizeof (char)); } strcat (process_buffer, In_Buff); residue_size += strlen (In_Buff); } else { strcpy (process_buffer, In_Buff); residue_size = strlen (In_Buff); } In_Buff [0] = '\0'; if (process_buffer [residue_size - 1] == '\n') { /* Skip all the "white-space" characters at the beginning of the line */ scan = process_buffer; while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '#') && (*scan != '\n')) { /* Non-empty non-comment line */ /* Get the landmark start position */ if ((*scan != '-') && ((*scan < '1') || (*scan > '9'))) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal number in input file", 0); start_num = scan; scan++; if ((*start_num == '-') && ((*scan < '1') || (*scan > '9'))) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal number in input file", 0); while ((*scan >= '0') && (*scan <= '9')) scan++; if ((*scan != ' ') && (*scan != '\t')) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal input file 
format", 0); save_scan = *scan; *scan = '\0'; start_pos = atol (start_num); *scan = save_scan; scan++; while ((*scan == ' ') || (*scan == '\t')) scan++; /* Get the landmark ending position */ if ((*scan != '-') && ((*scan < '1') || (*scan > '9'))) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal number in input file", 0); start_num = scan; scan++; if ((*start_num == '-') && ((*scan < '1') || (*scan > '9'))) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal number in input file", 0); while ((*scan >= '0') && (*scan <= '9')) scan++; if ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal number in input file", 0); save_scan = *scan; *scan = '\0'; end_pos = atol (start_num); *scan = save_scan; if (end_pos < start_pos) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Bad region boundaries", 0); /* Progress to the next field in the landmark line, if any */ while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0') && (*scan != '#')) { start_str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; save_scan = *scan; *scan = '\0'; land_text = NTL0_strsave (start_str); *scan = save_scan; if ((!strcmp (land_text, "null")) || (!strcmp (land_text, "NULL")) || (!strcmp (land_text, "Null"))) { free (land_text); land_text = NULL; } } else land_text = NULL; /* If there are more fields, get the motif text / match precision */ while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0') && (*scan != '#')) { start_str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; save_scan = *scan; *scan = '\0'; matched = NTL0_strsave (start_str); *scan = save_scan; if ((!strcmp (matched, "null")) || (!strcmp (matched, "NULL")) || (!strcmp (matched, "Null"))) { free (matched); matched = NULL; } } else matched = 
NULL; /* If there are more fields, get the site name / motif consensus */ while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0') && (*scan != '#')) { if (*scan == '\"') { scan++; start_str = scan; while ((*scan != '\"') && (*scan != '\n') && (*scan != '\0')) scan++; if ((*scan == '\n') || (*scan == '\0')) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Unterminated string in input", 0); save_scan = '\"'; *scan = '\0'; site_text = NTL0_strsave (start_str); if (strlen (site_text) == 0) { free (site_text); site_text = NULL; } *scan = save_scan; scan++; if ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Improperly concatenated strings in input", 0); } else { start_str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; save_scan = *scan; *scan = '\0'; site_text = NTL0_strsave (start_str); if ((!strcmp (site_text, "null")) || (!strcmp (site_text, "NULL")) || (!strcmp (site_text, "Null"))) { free (site_text); site_text = NULL; } else return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal text for site name", 0); *scan = save_scan; } } else site_text = NULL; /* If there are more fields, get the binding factor name */ while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0') && (*scan != '#')) { if (*scan == '\"') { scan++; start_str = scan; while ((*scan != '\"') && (*scan != '\n') && (*scan != '\0')) scan++; if ((*scan == '\n') || (*scan == '\0')) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Unterminated string in input", 0); save_scan = '\"'; *scan = '\0'; binding = NTL0_strsave (start_str); if (strlen (binding) == 0) { free (binding); binding = NULL; } *scan = save_scan; scan++; if ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Improperly 
concatenated strings in input", 0); } else { start_str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; save_scan = *scan; *scan = '\0'; binding = NTL0_strsave (start_str); if ((!strcmp (binding, "null")) || (!strcmp (binding, "NULL")) || (!strcmp (binding, "Null"))) { free (binding); binding = NULL; } else return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal text for site name", 0); *scan = save_scan; } } else binding = NULL; /* If there are more fields, get the strand in whitch the match is done */ while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0') && (*scan != '#')) { if (*scan == LITERAL_CODE) strand = LITERAL_CODE; else if (*scan == TRUNCATED_LITERAL_CODE) strand = TRUNCATED_LITERAL_CODE; else if (*scan == INVERSE_CODE) strand = INVERSE_CODE; else if (*scan == TRUNCATED_INVERSE_CODE) strand = TRUNCATED_INVERSE_CODE; else if (*scan == BOTH_STRAND_CODE) strand = BOTH_STRAND_CODE; else if (*scan == NO_STRAND_CODE) strand = NO_STRAND_CODE; else return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal strand code", 0); scan++; if ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) return NTL2_LP_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Incorrect separation of strand code", 0); } else strand = NO_STRAND_CODE; /* If there are more fields, get the database containing the motif */ while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0') && (*scan != '#')) { start_str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; save_scan = *scan; *scan = '\0'; database = NTL0_strsave (start_str); *scan = save_scan; if ((!strcmp (database, "null")) || (!strcmp (database, "NULL")) || (!strcmp (database, "Null"))) { free (database); database = NULL; } } else database = NULL; /* If there are more fields, get the file the motif is originating from */ while ((*scan == ' ') 
|| (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0') && (*scan != '#')) { start_str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; save_scan = *scan; *scan = '\0'; originating = NTL0_strsave (start_str); *scan = save_scan; if ((!strcmp (originating, "null")) || (!strcmp (originating, "NULL")) || (!strcmp (originating, "Null"))) { free (originating); originating = NULL; } } else originating = NULL; /* If there are more fields, get the reference code for the motif */ while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0') && (*scan != '#')) { start_str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; save_scan = *scan; *scan = '\0'; reference = NTL0_strsave (start_str); *scan = save_scan; if ((!strcmp (reference, "null")) || (!strcmp (reference, "NULL")) || (!strcmp (reference, "Null"))) { free (reference); reference = NULL; } } else reference = NULL; /* If there are more fields, get the recorded compression range */ while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0') && (*scan != '#')) { start_str = scan; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) scan++; save_scan = *scan; *scan = '\0'; c_range = NTL0_strsave (start_str); *scan = save_scan; if ((!strcmp (c_range, "null")) || (!strcmp (c_range, "NULL")) || (!strcmp (c_range, "Null"))) { free (c_range); c_range = NULL; } } else c_range = NULL; /* Get the other fields from the plain format file line, if any */ /* ### To be defined ################################################### */ /* Now save the collected information in a new record of the plain list */ new_plain = (plain_ptr) NTL0_ckalloc (sizeof (Plain_Struct)); new_plain -> start = start_pos; new_plain -> stop = end_pos; new_plain -> text = land_text; if (new_plain -> text != NULL) NTL0_uppercase (new_plain -> text); new_plain -> matched = 
matched; new_plain -> site = site_text; new_plain -> binding = binding; new_plain -> strand = strand; new_plain -> database = database; new_plain -> file = originating; new_plain -> reference = reference; new_plain -> c_range = c_range; new_plain -> full_ref = NULL; new_plain -> next = NULL; /* Now connect the newly defined record into the list of known records */ if (*records == NULL) *records = new_plain; else if ((*records) -> start > new_plain -> start) { new_plain -> next = *records; *records = new_plain; } else if ((last_ins -> start < new_plain -> start) && ((last_ins -> next == NULL) || ((last_ins -> next) -> start > new_plain -> start))) { new_plain -> next = last_ins -> next; last_ins -> next = new_plain; } else { next_rec = *records; while ((next_rec -> next != NULL) && ((next_rec -> next) -> start < new_plain -> start)) next_rec = next_rec -> next; new_plain -> next = next_rec -> next; next_rec -> next = new_plain; } last_ins = new_plain; } process_buffer [0] = '\0'; residue_size = 0; } } if (residue_size != 0) return NTL2_LP_Assemble_Error (WARNING, ERR_FILE_FORMAT, "Unresolved residue in the input buffer", 0); else return NULL; } /*****************************************************************************/ /* */ /* Procedure: NTL2_LP_Assemble_Error */ /* */ /* Service procedure for assembling and returnning an error report, based on */ /* the values of the input parameters; returns the record with the report */ errind NTL2_LP_Assemble_Error (int severity, int code, char *comment, int description) { char *report; errind assembled; report = (char *) NTL0_ckalloc ( (strlen (comment) + strlen ("_Load_Plain_File: ") + 1) * sizeof (char)); sprintf (report, "_Load_Plain_File: %s", comment); assembled = NTL1_Error_Record (severity, code, report, description); free (report); return assembled; }