/*****************************************************************************/ /* */ /* UNIT: NTL2_Load_Enzymes (Level 2 library routine) */ /* */ /* Author: Nikola Stojanovic */ /* */ /* Revision: 26 DEC 94 Version 1.0 */ /* */ /* Function: */ /* */ /* Procedure loads the "enzymes structure" (data structure containing */ /* information about the enzymes and their cutting sites) from the specified */ /* file containing enzymes information in the "database tuples" format. */ /* Procedure receives the name of the file containing enzymes data, fills */ /* the pointer (reference parameter) to the begining of the assembled list */ /* of enzymes; returns the error structure, NULL if everything was OK */ /* */ /* */ /* Expected format of the "enzymes" file: */ /* */ /* - First non-white-space sequence of symbols in the file must be: */ /* */ /* #:TUPLES: */ /* */ /* - */ /* */ /*****************************************************************************/ #include #include #include #include "ntl2.h" /*****************************************************************************/ /* */ /* Definitions section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* Definitions of local constants of the unit */ /*****************************************************************************/ /* Limit for the length of an error message that can be assembled */ #define ERR_MSGLIMIT 128 /* Information is read from database files by supplementary buffer of size: */ #define IN_LIMIT 512 #define LNUM_SIZE 8 /* Values of "tokens" found in enzymes file */ #define FTOK_EOF 0 #define FTOK_ENZYME 1 #define FTOK_ITEM 2 #define FTOK_NAME 4 #define FTOK_SEQ 5 #define FTOK_CUT 6 #define FTOK_INVCUT 7 #define FTOK_NOTE 8 /*****************************************************************************/ /* Prototypes of all locally used functions of this unit */ 
/*****************************************************************************/ errind NTL2_LE_Read_Enzymes (FILE **enz_file, enzyme_ptr *enzymes); errind NTL2_LE_Next_Enzyme (FILE **enz_file, enzyme_ptr enzyme_rec, enzyme_ptr *new_enzyme); errind NTL2_LE_Enzyme_Token (FILE **enz_file, int *next_tok, char **str); errind NTL2_LE_Fill_Enzyme (enzyme_ptr enzyme_rec, int next_tok, char **str); errind NTL2_LE_Assemble_Error (int severity, int code, char *comment, int description); /*****************************************************************************/ /* Definitions of global (static) variables of the unit */ /*****************************************************************************/ static char In_Buff [IN_LIMIT]; /* Global input line buffer, for convenience */ static char Error_Message [ERR_MSGLIMIT]; /* Temporary buffer, error passing */ /*****************************************************************************/ /* */ /* Code section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* */ /* Procedure: NTL2_Load_Enzymes */ /* */ /* Main (interface) procedure of this unit */ errind NTL2_Load_Enzymes (char *file_name, enzyme_ptr *enzymes) { FILE *enz_file; char ch; errind erret; enzyme_ptr sscan, tscan; /* Initialize the list of enzymes to return to "empty", before actions */ *enzymes = NULL; /* Check whether the file with given file name exists and open it for read */ if ((enz_file = fopen (file_name, "r")) == NULL) { /* Error condition */ sprintf (Error_Message, "File <%s> does not exist", file_name); return NTL2_LE_Assemble_Error (USER_ERROR, ERR_NO_FILE, Error_Message, 1); } else { /* There is a file with the specified name */ /* Check the file format by means of the recorded header code */ do { /* Eliminate the initial "white space" symbols */ ch = fgetc (enz_file); } while ((ch == ' ') || (ch == '\n') || (ch == '\t')); if (ch != 
'#') return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 2); ch = fgetc (enz_file); if (ch != ':') return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 3); ch = fgetc (enz_file); if ((ch != 't') && (ch != 'T')) return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 4); ch = fgetc (enz_file); if ((ch != 'u') && (ch != 'U')) return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 5); ch = fgetc (enz_file); if ((ch != 'p') && (ch != 'P')) return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 6); ch = fgetc (enz_file); if ((ch != 'l') && (ch != 'L')) return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 7); ch = fgetc (enz_file); if ((ch != 'e') && (ch != 'E')) return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 8); ch = fgetc (enz_file); if ((ch != 's') && (ch != 'S')) return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 9); ch = fgetc (enz_file); if (ch != ':') return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 10); /* Now load the enzymes from the file and close the file when done */ erret = NTL2_LE_Read_Enzymes (&enz_file, enzymes); /* Read the file */ if ((erret != NULL) && (erret -> kind != WARNING)) { /* Error status ret. */ /* Release the assembled part of the enzymes list - it won't be needed */ sscan = *enzymes; while (sscan != NULL) { tscan = sscan; sscan = sscan -> next; free (tscan); } *enzymes = NULL; } fclose (enz_file); return erret; } } /*****************************************************************************/ /* */ /* Procedure: NTL2_LE_Read_Enzymes */ /* */ /* Top level procedure for reading the enzymes file contents; loops for each */ /* new enzyme record in the file (and accounts for field contents */ /* inherritance, as well) and loads it via lower-level procedures. 
*/ /* Procedure receives the file pointer and (already initialized) list of */ /* enzyme data; returns the assembled error record, NULL if there was no */ /* error in loading */ errind NTL2_LE_Read_Enzymes (FILE **enz_file, enzyme_ptr *enzymes) { enzyme_ptr enzyme_rec, new_enzyme; errind erret; /* Allocate a record to store the new loaded enzyme before passing it to */ /* procedure: since there is no attribute to inherit, initialize contents */ enzyme_rec = (enzyme_ptr) NTL0_ckalloc (sizeof (Enzyme_Struct)); enzyme_rec -> name = NULL; enzyme_rec -> sequence = NULL; enzyme_rec -> cut_site = 0; enzyme_rec -> inverse_cut = 0; enzyme_rec -> note = NULL; enzyme_rec -> next = NULL; /* Loop to get enzymes until there are no more in the file */ while (((erret = NTL2_LE_Next_Enzyme (enz_file, enzyme_rec, &new_enzyme)) == NULL) && (new_enzyme != NULL)) { /* Allocate the next record and prepare to copy previous contents for inhr.*/ enzyme_rec = (enzyme_ptr) NTL0_ckalloc (sizeof (Enzyme_Struct)); if (new_enzyme -> name != NULL) enzyme_rec -> name = NTL0_strsave (new_enzyme -> name); else enzyme_rec -> name = NULL; if (new_enzyme -> sequence != NULL) enzyme_rec -> sequence = NTL0_strsave (new_enzyme -> sequence); else enzyme_rec -> sequence = NULL; enzyme_rec -> cut_site = new_enzyme -> cut_site; enzyme_rec -> inverse_cut = new_enzyme -> inverse_cut; if (new_enzyme -> note != NULL) enzyme_rec -> note = NTL0_strsave (new_enzyme -> note); else enzyme_rec -> note = NULL; enzyme_rec -> next = NULL; /* Now link the new enzyme record to the list of already loaded enzymes */ new_enzyme -> next = *enzymes; *enzymes = new_enzyme; } /* At the end of the loop no enzyme was found, so release the preallocated */ if (enzyme_rec -> name != NULL) free (enzyme_rec -> name); if (enzyme_rec -> sequence != NULL) free (enzyme_rec -> sequence); if (enzyme_rec -> note != NULL) free (enzyme_rec -> note); free (enzyme_rec); return erret; /* Return whatever was the error status from lower level */ } 
/*****************************************************************************/ /* */ /* Procedure: NTL2_LE_Next_Enzyme */ /* */ /* Procedure extracts information about one enzyme at a time from the file */ /* and loads it into a new enzyme structure, which coincides with the */ /* structure passed in the procedure (if there were no further enzyme */ /* block information in the file, that value returns as NULL); returns the */ /* "standard" error structure, NULL if there were no errors in loading */ /* the structure */ errind NTL2_LE_Next_Enzyme (FILE **enz_file, enzyme_ptr enzyme_rec, enzyme_ptr *new_enzyme) { int next_tok; char *str; errind status; *new_enzyme = NULL; /* Start with assumption that there is no new enzyme */ /* Get the next relevant token from the file: if it indicates a start of */ /* entirely new enzymes block, reset inherited fields in reserved record */ if ((status = NTL2_LE_Enzyme_Token (enz_file, &next_tok, &str)) != NULL) { return status; } else if (next_tok == FTOK_ENZYME) { if (enzyme_rec -> name != NULL) free (enzyme_rec -> name); if (enzyme_rec -> sequence != NULL) free (enzyme_rec -> sequence); enzyme_rec -> name = enzyme_rec -> sequence = NULL; if (enzyme_rec -> note != NULL) free (enzyme_rec -> note); enzyme_rec -> note = NULL; enzyme_rec -> cut_site = 0; enzyme_rec -> inverse_cut = 0; } else if (next_tok == FTOK_ITEM) { /* There was supposed to be some contents */ *new_enzyme = enzyme_rec; return NTL2_LE_Assemble_Error (WARNING, ERR_FILE_FORMAT, "No contents in enzyme block", 11); } else if (next_tok == FTOK_EOF) return NULL; /* EOF - no more enzymes */ else if ((status = NTL2_LE_Fill_Enzyme (enzyme_rec, next_tok, &str)) != NULL) return status; /* Enzyme contents or error in assembling it */ /* At this point it is not possible that there won't be a new enzyme - fill */ *new_enzyme = enzyme_rec; /* Whatever is now in old gets into the new one */ /* Now proceed to get all the attributes of the enzyme that has been seen */ do { status = 
NTL2_LE_Enzyme_Token (enz_file, &next_tok, &str); /* Token... */ if (status != NULL) return status; /* Error in the file layout */ else { switch (next_tok) { case FTOK_ENZYME: { /* Already started - no other start before "item" */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format - double enzymes", 12); } case FTOK_ITEM: return NULL; /* So full contents are collected */ case FTOK_EOF: { /* Any enzyme must complete ("item") before EOF */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format - unterminated", 13); } default: if ((status = NTL2_LE_Fill_Enzyme (enzyme_rec, next_tok, &str)) != NULL) return status; /* Attribute or error assembling */ } } } while (TRUE); /* The endless loop will be broken when "item" returns */ } /*****************************************************************************/ /* */ /* Procedure: NTL2_LE_Enzyme_Token */ /* */ /* Procedure gets the next "token" from the enzymes file (or file containing */ /* enzymes) plus the accompanying information in the form of a string, if */ /* any; returns the error structure, NULL if there was no error discovered */ errind NTL2_LE_Enzyme_Token (FILE **enz_file, int *next_tok, char **str) { char ch; int accept_pos; int pat, reached, in_comment; errind status; /* Skip the heading white spaces and all comments possibly interleaved */ while (TRUE) { ch = ' '; while ((ch == ' ') || (ch == '\n') || (ch == '\t')) ch = fgetc (*enz_file); if (ch == EOF) { *str = NULL; *next_tok = FTOK_EOF; return NULL; } else if (ch == '#') { in_comment = TRUE; while (in_comment) { while ((ch != '\n') && (ch != EOF)) ch = fgetc (*enz_file); in_comment = FALSE; while ((ch == ' ') || (ch == '\n') || (ch == '\t')) ch = fgetc (*enz_file); if (ch == EOF) { *str = NULL; *next_tok = FTOK_EOF; return NULL; } else if (ch == '#') in_comment = TRUE; } } /* At this point some significant text has been seen - investigate */ accept_pos = 0; while ((accept_pos < IN_LIMIT) && (ch != 
EOF) && (ch != ' ') && (ch != '\n') && (ch != '\t') && (ch != ';') && (ch != '=')) { In_Buff [accept_pos++] = ch; ch = fgetc (*enz_file); } if (accept_pos >= IN_LIMIT) { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_BUFFER_OVERFLOW, "Text in file too long", 14); } else if (ch == EOF) { /* End-of-file prematurely hit */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Unexpected end-of-file", 15); } else { /* Text up to first "white space" is placed in accepting buffer */ In_Buff [accept_pos] = '\0'; if ((!strcmp (In_Buff, "begin")) || (!strcmp (In_Buff, "BEGIN"))) { /* Beginning of some block */ while ((ch == ' ') || (ch == '\n') || (ch == '\t')) ch = fgetc (*enz_file); accept_pos = 0; while ((accept_pos < IN_LIMIT) && (ch != EOF) && (ch != ' ') && (ch != '\n') && (ch != '\t')) { In_Buff [accept_pos++] = ch; ch = fgetc (*enz_file); } if (accept_pos >= IN_LIMIT) { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_BUFFER_OVERFLOW, "Text in file too long", 16); } else if (ch == EOF) { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Unexpected end-of-file", 17); } else { /* Name of the block collected in acceptance buffer - check */ In_Buff [accept_pos] = '\0'; if (!strcmp (In_Buff, "enzyme")) { *str = NULL; *next_tok = FTOK_ENZYME; /* Beginning of a new enzyme */ return NULL; } else { /* So this was beginning of something else - find next enzyme */ reached = FALSE; while ((ch != EOF) && (!reached)) { while ((ch != EOF) && (ch != 'b') && (ch != 'B')) ch = fgetc (*enz_file); if (ch != EOF) { pat = TRUE; ch = fgetc (*enz_file); if ((ch != 'e') && (ch != 'E')) pat = FALSE; if (pat) { ch = fgetc (*enz_file); if ((ch != 'g') && (ch != 'G')) pat = FALSE; } if (pat) { ch = fgetc (*enz_file); if ((ch != 'i') && (ch != 'I')) pat = FALSE; } if (pat) { ch = fgetc (*enz_file); if ((ch != 'n') && (ch != 'N')) pat = FALSE; } if (pat) { ch = fgetc (*enz_file); if ((ch != ' ') && (ch != '\t') && 
(ch != '\n')) pat = FALSE; if (pat) { /* Eliminate the "white space" symbols between "begin" and ident. */ while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); /* Check if the identifier of what starts is "enzyme" */ if ((ch != 'e') && (ch != 'E')) pat = FALSE; if (pat) { ch = fgetc (*enz_file); if ((ch != 'n') && (ch != 'N')) pat = FALSE; } if (pat) { ch = fgetc (*enz_file); if ((ch != 'z') && (ch != 'Z')) pat = FALSE; } if (pat) { ch = fgetc (*enz_file); if ((ch != 'y') && (ch != 'Y')) pat = FALSE; } if (pat) { ch = fgetc (*enz_file); if ((ch != 'm') && (ch != 'M')) pat = FALSE; } if (pat) { ch = fgetc (*enz_file); if ((ch != 'e') && (ch != 'E')) pat = FALSE; } if (pat) { ch = fgetc (*enz_file); if ((ch == ' ') || (ch == '\t') || (ch == '\n')) reached = TRUE; } } } } } /* At this point it is either a beginning of new enzyme or EOF */ if (ch == EOF) { *str = NULL; *next_tok = FTOK_EOF; return NULL; } else { *str = NULL; *next_tok = FTOK_ENZYME; return NULL; } } } } /* Since the text seen was not beginning of block, must be in some enzyme */ else if (!strcmp (In_Buff, "name")) { while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if (ch != '=') { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 18); } else { /* So name specification is OK, get the name */ ch = fgetc (*enz_file); while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if ((status = NTL2_Assemble_String (enz_file, ch, str)) == NULL) { *next_tok = FTOK_NAME; return NULL; } else return status; } } else if (!strcmp (In_Buff, "sequence")) { while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if (ch != '=') { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 19); } else { ch = fgetc (*enz_file); while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if ((status = NTL2_Assemble_String 
(enz_file, ch, str)) == NULL) { *next_tok = FTOK_SEQ; return NULL; } } } else if (!strcmp (In_Buff, "cut_site")) { while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if (ch != '=') { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 19); } else { /* Proceed to find what follows the "equal" sign */ ch = fgetc (*enz_file); while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if ((ch < '0') || (ch > '9')) { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal number in file", 20); } else { In_Buff [0] = ch; accept_pos = 1; while ((accept_pos < LNUM_SIZE) && (ch >= '0') && (ch <= '9')) { ch = fgetc (*enz_file); In_Buff [accept_pos++] = ch; } if (accept_pos >= LNUM_SIZE) { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal number in file", 21); } else { while ((ch == ' ') || (ch == '\t')) ch = fgetc (*enz_file); if (ch != '\n') { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 22); } else { accept_pos--; In_Buff [accept_pos] = '\0'; *str = NTL0_strsave (In_Buff); *next_tok = FTOK_CUT; return NULL; } } } } } else if (!strcmp (In_Buff, "inverse_cut")) { while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if (ch != '=') { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 23); } else { /* Proceed to find what follows the "equal" sign */ ch = fgetc (*enz_file); while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if ((ch < '0') || (ch > '9')) { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal number in file", 24); } else { In_Buff [0] = ch; accept_pos = 1; while ((accept_pos < LNUM_SIZE) && (ch >= '0') && (ch <= '9')) { ch = fgetc (*enz_file); In_Buff [accept_pos++] = ch; } if (accept_pos >= 
LNUM_SIZE) { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal number in file", 25); } else { while ((ch == ' ') || (ch == '\t')) ch = fgetc (*enz_file); if (ch != '\n') { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 26); } else { accept_pos--; In_Buff [accept_pos] = '\0'; *str = NTL0_strsave (In_Buff); *next_tok = FTOK_INVCUT; return NULL; } } } } } else if (!strcmp (In_Buff, "note")) { while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if (ch != '=') { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 27); } else { ch = fgetc (*enz_file); while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); if ((status = NTL2_Assemble_String (enz_file, ch, str)) == NULL) { *next_tok = FTOK_NOTE; return NULL; } } } else if ((!strcmp (In_Buff, "end")) || (!strcmp (In_Buff, "END"))) { /* The end of current enzyme */ while ((ch == ' ') || (ch == '\t') || (ch == '\n')) ch = fgetc (*enz_file); accept_pos = 0; while ((accept_pos < IN_LIMIT) && (ch != EOF) && (ch != ' ') && (ch != '\n') && (ch != '\t')) { In_Buff [accept_pos++] = ch; ch = fgetc (*enz_file); } if (accept_pos >= IN_LIMIT) { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_BUFFER_OVERFLOW, "Text in file too long", 28); } else if (ch == EOF) { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Unexpected end-of-file", 29); } else { /* Name of the block collected in acceptance buffer - check */ In_Buff [accept_pos] = '\0'; if (!strcmp (In_Buff, "enzyme")) { *str = NULL; *next_tok = FTOK_ITEM; return NULL; } return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal conserved block conclusion", 30); } } else { /* Error condition */ return NTL2_LE_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal token for enzyme in file", 31); } } } } 
/*****************************************************************************/
/*                                                                           */
/*  Procedure: NTL2_LE_Fill_Enzyme                                           */
/*                                                                           */
/*  Procedure fills single received enzyme record with information passed    */
/*  to it, which depends on the token that caused the record change.         */
/*                                                                           */
/*  enzyme_rec - record to update; its string fields must be NULL or         */
/*               heap-allocated, since a replaced value is released          */
/*  next_tok   - FTOK_NAME, FTOK_SEQ, FTOK_CUT, FTOK_INVCUT or FTOK_NOTE     */
/*  str        - (in/out) the value text; always NULL on successful return:  */
/*               string values are taken over by the record, numeric text    */
/*               is released once it has been converted                      */
/*                                                                           */
/*  Returns NULL if everything was OK, or indication of problem in the       */
/*  program (description 32) if an unknown type of token has been passed     */

errind NTL2_LE_Fill_Enzyme (enzyme_ptr enzyme_rec, int next_tok, char **str)
{
  switch (next_tok) {
    case FTOK_NAME:
      /* Release a value inherited from the previous record, if any */
      if (enzyme_rec -> name != NULL)
        free (enzyme_rec -> name);
      enzyme_rec -> name = *str;
      *str = NULL;
      return NULL;

    case FTOK_SEQ:
      if (enzyme_rec -> sequence != NULL)
        free (enzyme_rec -> sequence);
      enzyme_rec -> sequence = *str;
      *str = NULL;
      return NULL;

    case FTOK_CUT:
      if (*str != NULL) {
        enzyme_rec -> cut_site = atol (*str);
        free (*str);          /* the digits are not needed after conversion */
        *str = NULL;
      }
      return NULL;

    case FTOK_INVCUT:
      if (*str != NULL) {
        enzyme_rec -> inverse_cut = atol (*str);
        free (*str);
        *str = NULL;
      }
      return NULL;

    case FTOK_NOTE:
      if (enzyme_rec -> note != NULL)
        free (enzyme_rec -> note);
      enzyme_rec -> note = *str;
      *str = NULL;
      return NULL;

    default:
      return NTL2_LE_Assemble_Error (FATAL_ERROR, ERR_CODE_PROBLEM,
                                     "Illegal token passed to assembling", 32);
  }
}

/*****************************************************************************/
/*                                                                           */
/*  Procedure: NTL2_LE_Assemble_Error                                        */
/*                                                                           */
/*  Service procedure for assembling and returning an error report, based    */
/*  on the values of the input parameters.                                   */
/*                                                                           */
/*  severity, code, description - passed unchanged to NTL1_Error_Record      */
/*  comment                     - message text; it is reported with the      */
/*                                prefix "_Load_Enzymes: " put before it     */
/*                                                                           */
/*  Returns the record produced by NTL1_Error_Record.                        */
/*  NOTE(review): the temporary report text is released right after the      */
/*  call, so NTL1_Error_Record is presumed to keep its own copy - confirm.   */

errind NTL2_LE_Assemble_Error (int severity, int code, char *comment,
                               int description)
{
  char *report;
  errind assembled;
  size_t needed;

  /* Room for the prefix, the comment and the string terminator */
  needed = strlen ("_Load_Enzymes: ") + strlen (comment) + 1;
  report = (char *) NTL0_ckalloc (needed * sizeof (char));
  sprintf (report, "_Load_Enzymes: %s", comment);

  assembled = NTL1_Error_Record (severity, code, report, description);

  free (report);
  return assembled;
}