/*****************************************************************************/ /* */ /* UNIT: NTL2_Load_AlignFile (Level 2 library routine) */ /* */ /* Author: Nikola Stojanovic */ /* */ /* Revision: 11 JUL 94 Version 1.0 */ /* */ /* Function: */ /* */ /* Procedure loads the alignment structures (as defined in the system) */ /* from the specified file in one of the recognized formats ("lav" and "lat" */ /* as of now). */ /* Procedure receives the name of the file containing alignment data and */ /* fills the pointer to the "header structure", containing data retrieved */ /* from the file; returns the error structure, NULL if everything was OK */ /* */ /* */ /* Expected format of the alignment file(s): */ /* */ /* 1. "lav" format: */ /* */ /* - First non-white-space sequence of symbols in the file must be: */ /* */ /* #:lav */ /* */ /* - */ /* */ /* 2. "lat" format: */ /* */ /* - First non-white-space sequence of symbols in the file must be: */ /* */ /* #:lat */ /* */ /* - */ /* */ /*****************************************************************************/ #include #include #include #include "ntl2.h" /*****************************************************************************/ /* */ /* Definitions section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* Definitions of local constants of the unit */ /*****************************************************************************/ /* Limits for predefined buffer sizes */ #define ERR_MSGLIMIT 128 /* Information is read from alignment files by supplementary buffer of size: */ #define IN_LIMIT 512 /*****************************************************************************/ /* Prototypes of all locally used functions of this unit */ /*****************************************************************************/ errind NTL2_LAF_Load_Blocks (header_ptr target, FILE **source); int NTL2_LAF_Get_Format (FILE **source); char NTL2_LAF_To_Block (FILE **file_ptr); errind NTL2_LAF_Get_Block (FILE **file_ptr, block_ptr *list); errind NTL2_LAF_Get_Description (FILE **file_ptr, char **desc); errind NTL2_LAF_Get_Sequences (FILE **source, int *dimension, seq_ptr *seq); int NTL2_LAF_Legal_Desc (char ch); void NTL2_LAF_Enter_Char (char ch, int *in_temp, char **perm, int *in_perm, int *perm_size); int NTL2_LAF_End_Check (char *buff); errind NTL2_LAF_Assemble_Error (int severity, int code, char *comment, int description); /*****************************************************************************/ /* Definitions of global (static) variables of the unit */ /*****************************************************************************/ static char In_Buff [IN_LIMIT]; /* Global input line buffer, for convenience */ static char Error_Message [ERR_MSGLIMIT]; /* Temporary buffer, error passing */ /*****************************************************************************/ /* */ /* Code section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* */ /* PROCEDURE: NTL2_Load_AlignFile */ /* */ /* Central procedure for the alignment file loading unit */ errind NTL2_Load_AlignFile (char *file_name, header_ptr *file_data) { FILE *source_file; errind status; /* Check first whether the file with specified name exists, error if not */ if ((source_file = fopen (file_name, "r")) == NULL) { /* Error - no file */ sprintf (Error_Message, "File <%s> does not exist", file_name); return NTL2_LAF_Assemble_Error (USER_ERROR, ERR_NO_FILE, Error_Message, 0); } else { /* File exists, now load its contents */ /* Allocate space for the header of the file to be loaded */ *file_data = (header_ptr) NTL0_ckalloc (sizeof (Header_Struct)); /* Proceed to fill in the database contents based on the loaded file */ strcpy ((*file_data) -> tag, file_name); /* File name serve as tag */ (*file_data) -> dimension = 0; /* It will be set up after data is loaded */ (*file_data) -> description = NULL; (*file_data) -> sequences = NULL; (*file_data) -> generated = NULL; (*file_data) -> keywords = NULL; (*file_data) -> alignments = NULL; (*file_data) -> numalign = 0; /* Now when the new header is allocated and initialized, proceed to load */ /* the specified file - the number of alignments will be set in process */ if ((status = NTL2_LAF_Load_Blocks (*file_data, &source_file)) != NULL) *file_data = NTL1_Destroy_Header (*file_data); fclose (source_file); return status; } } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_Load_Blocks */ /* */ /* Procedure for loading a new file in appropriate block format into the */ /* given database; returns the status of loading */ errind NTL2_LAF_Load_Blocks (header_ptr target, FILE **source) { int num_aligns; char selector, ch; errind status; /* Get the format of the file and store it into "format" field of header */ if ((target -> format = NTL2_LAF_Get_Format (source)) == NO_FORMAT) return NTL2_LAF_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Illegal file format", 0); else { /* File is of correct format */ num_aligns = 0; /* Prepare to count alignments */ /* Continue reading information blocks from the file until the end is seen */ while ((selector = NTL2_LAF_To_Block (source)) != '\0') { switch (selector) { case 'g': { /* Start of the "generated" type block */ if ((status = NTL2_LAF_Get_Block (source, &(target -> generated))) != NULL) return status; else break; } case 'd': { /* Start of the "description" type block */ if (target -> description != NULL) return NTL2_LAF_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Multiple descriptions in file", 0); else if ((status = NTL2_LAF_Get_Description (source, &(target -> description))) != NULL) return status; else break; } case 's': { /* Start of the "sequences" type block */ if (target -> sequences != NULL) return NTL2_LAF_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Multiple sequences block in file", 0); else if ((status = NTL2_LAF_Get_Sequences (source, &(target -> dimension), &(target -> sequences))) != NULL) return status; else break; } case 'k': { /* Start of the "keywords" type block */ if ((status = NTL2_LAF_Get_Block (source, &(target -> keywords))) != NULL) return status; else break; } case 'a': { /* Start of the "alignment" type block */ if ((status = NTL2_LAF_Get_Block (source, &(target -> alignments))) != NULL) return status; else { num_aligns++; break; } } default: { /* Unknown descriptor of a block */ sprintf (Error_Message, "Illegal block descriptor (%c) in file", selector); return NTL2_LAF_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, Error_Message, 0); } } } } /* If "survived" up to this point, there may had not been any errors */ target -> numalign = num_aligns; /* Record # of alignments */ /* Flip all list entries connected to the header (to restore ordering) */ if (target -> generated != NULL) target -> generated = NTL1_Flip_Blocks (target -> generated); if (target -> keywords != NULL) target -> keywords = NTL1_Flip_Blocks (target -> keywords); if (target -> alignments != NULL) target -> alignments = NTL1_Flip_Blocks (target -> alignments); return NULL; } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_Get_Format */ /* */ /* Procedure to retrieve the format of the given alignment file, based on */ /* its "identification string"; returns format code, if any, or */ /* "NO_FORMAT" if identification string could not be recognized */ int NTL2_LAF_Get_Format (FILE **source) { char ch; do { ch = fgetc (*source); } while ((ch != EOF) && (ch != '#')); if (ch == EOF) return NO_FORMAT; else { if (fgetc (*source) != ':') return NO_FORMAT; if (fgetc (*source) != 'l') return NO_FORMAT; if (fgetc (*source) != 'a') return NO_FORMAT; ch = fgetc (*source); if (ch == 'v') return FORMAT_LAV; else if (ch == 't') return FORMAT_LAT; else return NO_FORMAT; } } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_To_Block */ /* */ /* Procedure reads and discards all file contents up to the start of the */ /* next block of information (or up to the end of file, if there are no */ /* blocks ahead); receives the file pointer (pointer to pointer), returns */ /* '\0' if end-of-file has been hit, code of new block otherwise */ char NTL2_LAF_To_Block (FILE **file_ptr) { char *cstr, sel; int end_hit; do { /* Read next buffer from the file */ cstr = fgets (In_Buff, IN_LIMIT, *file_ptr); } while ((cstr != NULL) && ((sel = NTL2_LAF_Legal_Desc (In_Buff [0])) == '\0') && (!(end_hit = NTL2_LAF_End_Check (In_Buff)))); if ((cstr == NULL) || (end_hit)) return '\0'; /* This is end-of-file */ else return sel; /* Character code of the new block */ } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_Get_Block */ /* */ /* Procedure reads the contents of informational block from the file and */ /* stores it in an internal record; receives the file pointer and pointer */ /* to the start of appropriate list for file blocks storage, appends the */ /* new block to existing ones; returns the operation status (error */ /* structure or NULL if everything was OK) */ errind NTL2_LAF_Get_Block (FILE **file_ptr, block_ptr *list) { int repeat_new, repeat_space, end_hit; char ch, *accept_buff; int in_buff, in_accept, accept_size, index; block_ptr new_block; /* Set the control variables to check for duplicate new lines and spaces */ repeat_new = TRUE; repeat_space = FALSE; end_hit = FALSE; in_buff = in_accept = accept_size = 0; accept_buff = NULL; do { ch = fgetc (*file_ptr); /* Get next character from alignment file */ if ((ch == EOF) || (ch == '\0')) /* Not permitted inside a block */ return NTL2_LAF_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Unexpected end-of-file", 0); else if ((ch == ' ') || (ch == '\t')) { /* Space or seen */ if ((!repeat_new) && (!repeat_space)) { /* ... duplicated? */ NTL2_LAF_Enter_Char (ch, &in_buff, &accept_buff, &in_accept, &accept_size); repeat_space = TRUE; } } else if (ch == '\n') { /* Start of a new line */ if ((!repeat_new) && (!repeat_space)) { /* Check for duplicate */ NTL2_LAF_Enter_Char (ch, &in_buff, &accept_buff, &in_accept, &accept_size); repeat_new = TRUE; } else if (repeat_space) { /* If spaces before this newline, remove them */ if (in_buff > 0) In_Buff [in_buff - 1] = ch; else accept_buff [in_accept - 1] = ch; repeat_space = FALSE; repeat_new = TRUE; } } else if ((repeat_new) && (ch == '}')) end_hit = TRUE; /* '}' after newline */ else { /* Not a separator character, just store it */ NTL2_LAF_Enter_Char (ch, &in_buff, &accept_buff, &in_accept, &accept_size); repeat_new = repeat_space = FALSE; } } while (!end_hit); /* Keep handling character until the end of the block */ /* Now when the end-of-block is seen, proceed to copy all residue data from */ /* temporary input buffer and finalize the string containing the block */ if (in_buff > 0) { /* There are residue data in the input buffer */ if (accept_buff == NULL) { accept_size = in_buff + 1; accept_buff = (char *) NTL0_ckalloc (accept_size); for (index = 0; index < in_buff; index++) accept_buff [index] = In_Buff [index]; accept_buff [accept_size - 1] = '\0'; } else { /* There is already some contents in accepting buffer - expand */ accept_size = in_accept + in_buff + 1; accept_buff = (char *) NTL0_ckrealloc (accept_buff, accept_size); for (index = 0; index < in_buff; index++) accept_buff [in_accept + index] = In_Buff [index]; accept_buff [accept_size - 1] = '\0'; } } else if (accept_buff == NULL) { /* No contents found in the block */ /* Since there are no guarantees how warnings will be handled, allocate */ new_block = (block_ptr) NTL0_ckalloc (sizeof (Block_Struct)); new_block -> contents = (char *) NTL0_ckalloc (2 * sizeof (char)); (new_block -> contents) [0] = '\n'; (new_block -> contents) [1] = '\0'; new_block -> next = *list; *list = new_block; return NTL2_LAF_Assemble_Error (WARNING, ERR_FILE_FORMAT, "Empty block in alignment file", 0); } else { /* All data from temporary input buffer already copied */ if (in_accept == accept_size) { accept_buff = (char *) NTL0_ckrealloc (accept_buff, accept_size + 1); accept_buff [accept_size] = '\0'; accept_size++; } else { accept_buff [in_accept] = '\0'; accept_size = in_accept + 1; } } if (accept_buff [accept_size - 2] != '\n') { /* Check if newline before end */ return NTL2_LAF_Assemble_Error (FATAL_ERROR, ERR_CODE_PROBLEM, "String in accept buffer without new_line", 0); } /* Allocate a cell for the new block */ new_block = (block_ptr) NTL0_ckalloc (sizeof (Block_Struct)); new_block -> contents = accept_buff; new_block -> next = *list; *list = new_block; /* Connect to header */ return NULL; /* If this point is reached, there could be no errors */ } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_Get_Description */ /* */ /* Load the contents of the alignment file description block, as character */ /* buffer; return status of loading */ errind NTL2_LAF_Get_Description (FILE **file_ptr, char **desc) { char ch; int in_buff, in_accept, accept_size, index; int quote, end_hit, last_new; end_hit = quote = FALSE; last_new = TRUE; /* Settings for status of text */ in_buff = in_accept = accept_size = 0; do { /* Proceed to collect "description" character by character */ ch = fgetc (*file_ptr); if ((ch == EOF) || (ch == '\0')) /* Unexpected end-of-file inside block */ return NTL2_LAF_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Unexpected end-of-file", 0); else if (ch == '\"') { if (quote) quote = FALSE; else quote = TRUE; /* No quote nesting */ NTL2_LAF_Enter_Char (ch, &in_buff, desc, &in_accept, &accept_size); last_new = FALSE; } else if (ch == '\n') { NTL2_LAF_Enter_Char (ch, &in_buff, desc, &in_accept, &accept_size); last_new = TRUE; } else if (ch == '}') { /* End of "description" block, if not masked */ if ((last_new) && (!quote)) { end_hit = TRUE; while ((in_buff > 0) && ((In_Buff [in_buff - 1] == ' ') || (In_Buff [in_buff - 1] == '\t'))) in_buff--; /* Eliminate all trailing spaces from the block */ if (in_buff == 0) while ((in_accept > 0) && (((*desc) [in_accept - 1] == ' ') || ((*desc) [in_accept - 1] == '\t'))) in_accept--; } else { /* Just an "ordinary" character to store */ NTL2_LAF_Enter_Char (ch, &in_buff, desc, &in_accept, &accept_size); last_new = FALSE; } } else { /* Just an "ordinary" character to store */ NTL2_LAF_Enter_Char (ch, &in_buff, desc, &in_accept, &accept_size); last_new = FALSE; } } while (!end_hit); /* Keep collecting characters until the end of block */ /* Now when the end-of-block is seen, proceed to copy all residue data from */ /* temporary input buffer and finalize the string containing description */ if (in_buff > 0) { if (*desc == NULL) { accept_size = in_buff + 1; *desc = (char *) NTL0_ckalloc (accept_size); for (index = 0; index < in_buff; index++) (*desc) [index] = In_Buff [index]; (*desc) [accept_size - 1] = '\0'; } else { /* There is some already allocated contents - expand */ accept_size = in_accept + in_buff + 1; *desc = (char *) NTL0_ckrealloc (*desc, accept_size); for (index = 0; index < in_buff; index++) (*desc) [in_accept + index] = In_Buff [index]; (*desc) [accept_size - 1] = '\0'; } } else if (*desc == NULL) { /* Descriptions should not be empty */ accept_size = 2; *desc = (char *) NTL0_ckalloc (accept_size); (*desc) [0] = '\n'; (*desc) [1] = '\0'; return NTL2_LAF_Assemble_Error (WARNING, ERR_FILE_FORMAT, "Empty description in alignment file", 0); } else { /* Check if there is space for string terminator in the buffer */ if (in_accept == accept_size) { *desc = (char *) NTL0_ckrealloc (*desc, accept_size + 1); (*desc) [accept_size] = '\0'; accept_size++; } else { (*desc) [in_accept] = '\0'; accept_size = in_accept + 1; } } if ((*desc) [accept_size - 2] != '\n') { return NTL2_LAF_Assemble_Error (FATAL_ERROR, ERR_CODE_PROBLEM, "String in accept buffer without new_line", 0); } else return NULL; /* Everything in correct format */ } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_Get_Sequences */ /* */ /* Load data about the sequences in the alignments, along with the dimension */ /* of alignments; return the status of loading */ errind NTL2_LAF_Get_Sequences (FILE **source, int *dimension, seq_ptr *seq) { char *accept, *first, *rest; int quotes, index; block_ptr dummy; errind stat; /* Get block buffer with text from the file first */ dummy = NULL; if ((stat = NTL2_LAF_Get_Block (source, &dummy)) != NULL) return stat; else { accept = dummy -> contents; free (dummy); /* Block cell not needed */ } /* Proceed to organize information about sequences extracted from buffer */ rest = accept; quotes = 0; while (*rest != '\0') { if (*rest == '\"') quotes++; rest++; } *dimension = (int) (quotes / 2); if ((*dimension) * 2 != quotes) /* Count of quotes seen not even */ return NTL2_LAF_Assemble_Error (USER_ERROR, ERR_FILE_FORMAT, "Incorrect format of sequences block", 0); else { /* Correct alignment dimension established */ (*seq) = (seq_ptr) NTL0_ckalloc ((*dimension) * sizeof (Seq_Struct)); /* Scan all sequence data from the buffer, dimension by dimension */ rest = accept; for (index = 0; index < *dimension; index++) { NTL0_tsplit (rest, &first, &rest); /* Extract the first word from rest */ first [strlen (first) - 1] = '\0'; ((*seq) [index]).seq_name = NTL0_strsave (&(first [1])); free (first); ((*seq) [index]).alias = NULL; /* No aliases in loading */ /* Now get the range that is involved in the alignments from the file */ NTL0_tsplit (rest, &first, &rest); ((*seq) [index]).begin = atol (first); free (first); NTL0_tsplit (rest, &first, &rest); ((*seq) [index]).end = atol (first); free (first); } if (*rest != '\0') { /* More contents left in the buffer */ return NTL2_LAF_Assemble_Error (FATAL_ERROR, ERR_CODE_PROBLEM, "Incorrect balancing of sequences string", 0); } else { free (accept); /* Raw text not needed any more - information extracted */ return NULL; /* Sequences and ranges correctly recorded */ } } } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_Legal_Desc */ /* */ /* Check whether the received character represents legal block type code */ int NTL2_LAF_Legal_Desc (char ch) { if ((ch == 'g') || (ch == 'd') || (ch == 's') || (ch == 'k') || (ch == 'a')) return ch; else return '\0'; } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_Enter_Char */ /* */ /* Supplementary procedure for placing a character into a given buffer, */ /* "through" temporary "In_Buff": when temporary buffer is filed, space */ /* is allocated and all contents moved into the actual storage, resetting */ /* the temporary buffer settings */ void NTL2_LAF_Enter_Char (char ch, int *in_temp, char **perm, int *in_perm, int *perm_size) { int index; In_Buff [(*in_temp)++] = ch; if (*in_temp == IN_LIMIT) { if (*perm_size == 0) { *perm = (char *) NTL0_ckalloc (IN_LIMIT); for (index = 0; index < IN_LIMIT; index++) (*perm) [index] = In_Buff [index]; *perm_size = IN_LIMIT; *in_perm = IN_LIMIT; } else { *perm = (char *) NTL0_ckrealloc (*perm, *perm_size + IN_LIMIT); for (index = 0; index < IN_LIMIT; index++) (*perm) [*in_perm + index] = In_Buff [index]; *perm_size += IN_LIMIT; *in_perm += IN_LIMIT; } *in_temp = 0; } } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_End_Check */ /* */ /* Procedure to check for the occurence of the "endmarker" in the received */ /* string; returns TRUE if enmarker is recognized, FALSE otherwise */ int NTL2_LAF_End_Check (char *buff) { char *sc; sc = buff; while ((*sc != '#') && (*sc != '\n') && (*sc != '\0')) sc++; if ((*sc == '\n') || (*sc == '\0')) return FALSE; else { sc++; if (*sc != ':') return FALSE; sc++; if (*sc != 'e') return FALSE; sc++; if (*sc != 'n') return FALSE; sc++; if (*sc != 'd') return FALSE; else return TRUE; } } /*****************************************************************************/ /* */ /* Procedure: NTL2_LAF_Assemble_Error */ /* */ /* Service procedure for assembling and returnning an error report, based on */ /* the values of the input parameters; returns the record with the report */ errind NTL2_LAF_Assemble_Error (int severity, int code, char *comment, int description) { char *report; errind assembled; report = (char *) NTL0_ckalloc ( (strlen (comment) + strlen ("_Load_AlignFile: ") + 1) * sizeof (char)); sprintf (report, "_Load_AlignFile: %s", comment); assembled = NTL1_Error_Record (severity, code, report, description); free (report); return assembled; }