/*****************************************************************************/ /* */ /* UNIT: NTL2_Unpack_Alignment (Level 2 library routine) */ /* */ /* Author: Nikola Stojanovic */ /* */ /* Revision: 05 SEP 96 Version 1.0 */ /* */ /* Function: */ /* */ /* Procedure uses the partially unpacked information about an alignment to */ /* completely "unpack" it and create the appropriate internal structure */ /* which also includes the complete text of the sequences, with gap symbols */ /* inserted at the right places (the alignment information must have already */ /* been read from the file, along with other information, and the alignment */ /* must have already been partially unpacked); returns status of expansion */ /* in "standard" error structure, NULL if there were no errors */ /* */ /*****************************************************************************/ #include #include #include #include "ntl2.h" /*****************************************************************************/ /* */ /* Definitions section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* Definitions of local constants of the unit */ /*****************************************************************************/ /* Limits for predefined buffer sizes */ #define ERR_MSGLIMIT 128 /*****************************************************************************/ /* Types used only locally in this unit */ /*****************************************************************************/ typedef struct seqfile_fields { char *str; long int from; long int to; char *text; struct seqfile_fields *next; } SeqFile_Struct; typedef SeqFile_Struct *seqfile_ptr; /*****************************************************************************/ /* Prototypes of all locally used functions of this unit */ /*****************************************************************************/ errind NTL2_UA_Get_Raw (header_ptr file_info, align_ptr packed_info, strlist_ptr seq_paths, char ***strings); errind NTL2_UA_Assemble_Error (int severity, int code, char *comment, int description); /*****************************************************************************/ /* Definitions of global (static) variables of the unit */ /*****************************************************************************/ static char Error_Message [ERR_MSGLIMIT]; /* Temporary buffer, error passing */ /*****************************************************************************/ /* */ /* Code section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* */ /* PROCEDURE: NTL2_Unpack_Alignment */ /* */ /* Central procedure for the unit - full alignment expansion; returns error */ /* status, NULL if expansion was done without problems */ errind NTL2_Unpack_Alignment (int number, header_ptr file_info, align_ptr packed_info, strlist_ptr seq_paths, unpacked_ptr *expanded) { int seqs; long int place_pos, retr_pos, lindex, start_count, end_count; char **temp, *scan; line_ptr line_scan; errind err; /* Allocate the record for the alignment and start filling the contents */ *expanded = (unpacked_ptr) NTL0_ckalloc (sizeof (Unpacked_Alignment)); (*expanded) -> number = number; (*expanded) -> dimension = file_info -> dimension; (*expanded) -> score = packed_info -> score; /* Record begin and end vectors for the current alignment - needed range */ (*expanded) -> begin = (long int *) NTL0_ckalloc ((file_info -> dimension) * sizeof (long int)); (*expanded) -> end = (long int *) NTL0_ckalloc ((file_info -> dimension) * sizeof (long int)); for (seqs = 0; seqs < file_info -> dimension; seqs++) { ((*expanded) -> begin) [seqs] = (packed_info -> begin) [seqs]; ((*expanded) -> end) [seqs] = (packed_info -> end) [seqs]; } (*expanded) -> cut = FULL_ALIGNMENT; /* Assume uncut alignment here */ /* Proceed to extract the actual text of the sequences from data files */ if ((err = NTL2_UA_Get_Raw (file_info, packed_info, seq_paths, &temp)) != NULL) return err; /* Error loading sequences - return the error record */ /* Now use the alignment lines information to expand the sequence strings */ /* and insert gaps (gap symbols) to appropriate places */ (*expanded) -> texts = (char **) NTL0_ckalloc (((*expanded) -> dimension) * sizeof (char *)); for (seqs = 0; seqs < (*expanded) -> dimension; seqs++) { ((*expanded) -> texts) [seqs] = (char *) NTL0_ckalloc (((packed_info -> size) + 1) * sizeof (char)); /* Reformat the actual texts by placing characters & gaps where needed */ place_pos = retr_pos = 0; for (line_scan = (packed_info -> lines) [seqs]; line_scan != NULL; line_scan = line_scan -> next) { if (line_scan -> code == GAP_STRETCH) { for (lindex = 0; lindex < line_scan -> length; lindex++) { (((*expanded) -> texts) [seqs]) [place_pos] = GAP_SYMBOL; place_pos++; } } else if (line_scan -> code == TEXT_STRETCH) { for (lindex = 0; lindex < line_scan -> length; lindex++) { (((*expanded) -> texts) [seqs]) [place_pos] = (temp [seqs]) [retr_pos]; retr_pos++; place_pos++; } } else { sprintf (Error_Message, "Unknown line stretch code (%d)", line_scan -> code); return NTL2_UA_Assemble_Error (SYSTEM_ERROR, ERR_ILLEGAL_CODE, Error_Message, 0); } } (((*expanded) -> texts) [seqs]) [place_pos] = '\0'; } (*expanded) -> size = packed_info -> size; /* Release the temporary string buffers now - not needed any more */ for (seqs = 0; seqs < (*expanded) -> dimension; seqs++) free (temp [seqs]); free (temp); (*expanded) -> segment_code = (int *) NTL0_ckalloc (((*expanded) -> dimension) * sizeof (int)); for (seqs = 0; seqs < (*expanded) -> dimension; seqs++) { ((*expanded) -> segment_code) [seqs] = VALID_SEGMENT; } /* Now record the absolute starting positions for sequences in alignment */ (*expanded) -> starts = (long int *) NTL0_ckalloc (((*expanded) -> dimension) * sizeof (long int)); for (seqs = 0; seqs < (*expanded) -> dimension; seqs++) { scan = ((*expanded) -> texts) [seqs]; start_count = 0; while (*scan == GAP_SYMBOL) { scan++; start_count++; } if (*scan == '\0') return NTL2_UA_Assemble_Error (SYSTEM_ERROR, ERR_BAD_STRUCTURE, "Sequence has all gaps", 0); else ((*expanded) -> starts) [seqs] = start_count; } /* Record the absolute ending positions for sequences in the alignment */ (*expanded) -> stops = (long int *) NTL0_ckalloc (((*expanded) -> dimension) * sizeof (long int)); for (seqs = 0; seqs < (*expanded) -> dimension; seqs++) { scan = &(((*expanded) -> texts) [seqs] [((*expanded) -> size) - 1]); end_count = ((*expanded) -> size) - 1; while (*scan == GAP_SYMBOL) { scan--; end_count--; } if (end_count < 0) return NTL2_UA_Assemble_Error (SYSTEM_ERROR, ERR_BAD_STRUCTURE, "Sequence has all gaps", 0); else ((*expanded) -> stops) [seqs] = end_count; } return NULL; } /*****************************************************************************/ /* */ /* Internal procedures/utilities for serving the external request */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* */ /* Procedure: NTL2_UA_Get_Raw */ /* */ /* Procedure to fill-in the contents of buffer of strings containing raw */ /* text of the sequences in the alignment, as read from the file; returns */ /* the buffer, as parameter, and the error structure, NULL if OK */ errind NTL2_UA_Get_Raw (header_ptr file_info, align_ptr packed_info, strlist_ptr seq_paths, char ***strings) { seqfile_ptr differs, sd, temp, trash; int seqs; bool found; char *full_name; long int amount, start, index, copier; char **rbuf; errind errstat; strlist_ptr current_path; /* Create the list of files with the range that encloses all used ranges */ differs = NULL; errstat = NULL; for (seqs = 0; seqs < file_info -> dimension; seqs++) { found = FALSE; sd = differs; while ((!found) && (sd != NULL)) { if (!strcmp (((file_info -> sequences) [seqs]).seq_name, sd -> str)) { if ((packed_info -> begin) [seqs] < sd -> from) sd -> from = (packed_info -> begin) [seqs]; if ((packed_info -> end) [seqs] > sd -> to) sd -> to = (packed_info -> end) [seqs]; found = TRUE; } else sd = sd -> next; /* Not the same file */ } if (!found) { /* Another file, not already "seen" */ temp = (seqfile_ptr) NTL0_ckalloc (sizeof (SeqFile_Struct)); temp -> str = ((file_info -> sequences) [seqs]).seq_name; temp -> from = (packed_info -> begin) [seqs]; temp -> to = (packed_info -> end) [seqs]; temp -> text = NULL; temp -> next = differs; differs = temp; } } for (sd = differs; sd != NULL; sd = sd -> next) { sd -> text = NULL; /* No text in any record before it has been read */ /* Try to access the current sequence in the current working directory 1st */ if ((errstat = NTL2_Load_Sequence (sd -> str, sd -> from, sd -> to, &(sd -> text))) != NULL) { /* Since the "open" attempt in the current directory failed, try all */ /* provided paths, if any, in order of their appearance */ sd -> text = NULL; current_path = seq_paths; while ((sd -> text == NULL) && (current_path != NULL)) { full_name = NTL0_ckalloc ((strlen (current_path -> string) + strlen (sd -> str) + 2) * sizeof (char)); strcpy (full_name, current_path -> string); strcat (full_name, "/"); strcat (full_name, sd -> str); if ((errstat = NTL2_Load_Sequence (full_name, sd -> from, sd -> to, &(sd -> text))) != NULL) { sd -> text = NULL; current_path = current_path -> next; } free (full_name); } } /* Now check if the sequence text has been loaded, and report error if not */ if (sd -> text == NULL) { if (errstat != NULL) return errstat; else { sprintf (Error_Message, "Empty text for alignment sequence(s) '%s'", sd -> str); return NTL2_UA_Assemble_Error (FATAL_ERROR, ERR_NO_VALUE, Error_Message, 0); } } } /* Proceed to extract appropriate ranges for all involved sequences */ rbuf = (char **) NTL0_ckalloc ((file_info -> dimension) * sizeof (char *)); for (seqs = 0; seqs < file_info -> dimension; seqs++) { /* Find the appropriate sequence record first */ found = FALSE; sd = differs; while ((!found) && (sd != NULL)) { if (!strcmp (((file_info -> sequences) [seqs]).seq_name, sd -> str)) found = TRUE; else sd = sd -> next; } if (!found) { sprintf (Error_Message, "Sequence name '%s' not found in raw", ((file_info -> sequences) [seqs]).seq_name); return NTL2_UA_Assemble_Error (SYSTEM_ERROR, ERR_NO_VALUE, Error_Message, 0); } /* Now allocate the buffer and copy the contents */ amount = (packed_info -> end) [seqs] - (packed_info -> begin) [seqs] + 1; rbuf [seqs] = (char *) NTL0_ckalloc ((amount + 1) * sizeof (char)); start = (packed_info -> begin) [seqs] - (sd -> from); index = 0; for (copier = start; copier < start + amount; copier++) rbuf [seqs] [index++] = (sd -> text) [copier]; rbuf [seqs] [index] = '\0'; } /* Now deallocate all used cells from the list of involved files */ sd = differs; while (sd != NULL) { trash = sd; sd = sd -> next; free (trash -> text); free (trash); } differs = NULL; *strings = rbuf; return NULL; } /*****************************************************************************/ /* */ /* Procedure: NTL2_UA_Assemble_Error */ /* */ /* Service procedure for assembling and returnning an error report, based on */ /* the values of the input parameters; returns the record with the report */ errind NTL2_UA_Assemble_Error (int severity, int code, char *comment, int description) { char *report; errind assembled; report = (char *) NTL0_ckalloc ( (strlen (comment) + strlen ("_Unpack_Alignment: ") + 1) * sizeof (char)); sprintf (report, "_Unpack_Alignment: %s", comment); assembled = NTL1_Error_Record (severity, code, report, description); free (report); return assembled; }