/*****************************************************************************/
/*                                                                           */
/*  UNIT: NTL2_Load_IMDfile (Level 2 library routine)                        */
/*                                                                           */
/*  Author: Nikola Stojanovic                                                */
/*                                                                           */
/*  Revision: 06 MAY 96 Version 1.0                                          */
/*            27 MAR 97 Version 1.1                                          */
/*                                                                           */
/*  Function:                                                                */
/*                                                                           */
/*  Procedure loads the specified Information Matrix Database file, in       */
/*  the "plain text" format, into internal structures more convenient for    */
/*  processing the information and converts the "summary" matrix format into */
/*  "specificity" format.                                                    */
/*  Procedure receives the name of the file containing IMD data and the      */
/*  alphabet information (arrays - letter indices of matrix rows and letter  */
/*  relative frequencies), fills the pointer (reference parameter) to the    */
/*  beginning of the assembled list of motifs; returns the error structure,  */
/*  NULL if everything was OK                                                */
/*                                                                           */
/*                                                                           */
/*  Expected format of the Information Matrix Database file:                 */
/*                                                                           */
/*  Each motif is a header line followed by one row per alphabet letter:     */
/*                                                                           */
/*    site_name  cutoff  max_score  SEQUENCE  Mdigits                        */
/*    L | n n ... n                                                          */
/*                                                                           */
/*  Fields are separated by blanks or tabs; SEQUENCE is upper case and its   */
/*  length is the number of counts n in each row; L is an upper-case letter. */
/*                                                                           */
/*****************************************************************************/

#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "ntl2.h"

/*****************************************************************************/
/*                                                                           */
/*  Definitions section                                                      */
/*                                                                           */
/*****************************************************************************/

/*****************************************************************************/
/*  Definitions of local constants of the unit                               */
/*****************************************************************************/

/* Limit for the length of an error message that can be assembled */
#define ERR_MSGLIMIT 128

/* Information is read from database files by supplementary buffer of size: */
#define IN_LIMIT 100

/* Definition of the delimiting symbol between matrix row index and contents */
#define SEPARATOR_SYMBOL '|'

#define LOG_BASE 2.0

/*****************************************************************************/
/*  Prototypes of all locally used types of this unit                        */
/*****************************************************************************/ typedef struct summary_fields { char *site_name; double cutoff; double max_score; char *sequence; int seq_length; char *reference; int **matrix; struct summary_fields *next; } Summary_Struct; typedef Summary_Struct *summary_ptr; /*****************************************************************************/ /* Prototypes of all locally used functions of this unit */ /*****************************************************************************/ errind NTL2_LI_Read_IMDfile (FILE **imd_file, int dimension, int *Indices, summary_ptr *matrices); errind NTL2_LI_Convert_IMDfile (char *file_name, summary_ptr summaries, int dimension, int *Indices, double *Frequencies, imd_ptr *motifs); int NTL2_LI_Alphabet_Size (int *Indices); summary_ptr NTL2_LI_Destroy_TempList (summary_ptr list); errind NTL2_LI_Assemble_Error (int severity, int code, char *comment, int description); /*****************************************************************************/ /* Definitions of global (static) variables of the unit */ /*****************************************************************************/ static char In_Buff [IN_LIMIT]; /* Global input line buffer, for convenience */ static char Error_Message [ERR_MSGLIMIT]; /* Temporary buffer, error passing */ /*****************************************************************************/ /* */ /* Code section */ /* */ /*****************************************************************************/ /*****************************************************************************/ /* */ /* Procedure: NTL2_Load_IMDfile */ /* */ /* Main (interface) procedure of this unit */ errind NTL2_Load_IMDfile (char *file_name, int *Indices, double *Frequencies, imd_ptr *motifs) { FILE *imd_file; summary_ptr temps; int dim; errind erret; /* Initialize the list of summary matrices loaded to "empty" before actions */ temps = NULL; /* Check whether the file with given file name exists 
and open it for read */ if ((imd_file = fopen (file_name, "r")) == NULL) { /* Error condition */ sprintf (Error_Message, "File <%s> does not exist", file_name); return NTL2_LI_Assemble_Error (USER_ERROR, ERR_NO_FILE, Error_Message, 0); } else { /* There is a file with the specified name */ /* Find out the size of the alphabet (number of rows in each matrix) */ dim = NTL2_LI_Alphabet_Size (Indices); /* Now load the motifs from the file and close the file when done */ erret = NTL2_LI_Read_IMDfile (&imd_file, dim, Indices, &temps); if ((erret != NULL) && (erret -> kind != WARNING)) { /* Error status ret. */ /* Release the assembled part of the motifs list - it won't be needed */ temps = NTL2_LI_Destroy_TempList (temps); } fclose (imd_file); /* Initialize the list of motifs to return to "empty" before processing */ *motifs = NULL; if ((erret == NULL) || (erret -> kind == WARNING)) { /* If loading was OK */ /* Convert the collected matrices from summary to specificity format */ erret = NTL2_LI_Convert_IMDfile (file_name, temps, dim, Indices, Frequencies, motifs); } return erret; } } /*****************************************************************************/ /* */ /* Procedure: NTL2_LI_Read_IMDfile */ /* */ /* Top level procedure for reading the Information Matrix Database file */ /* contents; loops for each motif (matrix) defined in the file; receives */ /* the file pointer, alphabet size (number of rows in any matrix), already */ /* set vector of indices into matrix rows for permitted letters and */ /* (already initialized) list of temporary summary matrix structures to be */ /* filled, returns the assembled error record, NULL if there were no */ /* errors in loading */ errind NTL2_LI_Read_IMDfile (FILE **imd_file, int dimension, int *Indices, summary_ptr *matrices) { char *scan, *start_str, save_scan; int columns, rows, row_index; summary_ptr new_matrix; char *collect_buffer; int collect_size, items; collect_buffer = (char *) NTL0_ckalloc (IN_LIMIT * sizeof 
(char)); collect_size = IN_LIMIT; collect_buffer [0] = '\0'; items = 0; In_Buff [0] = '\0'; while ((fgets (In_Buff, IN_LIMIT, *imd_file) != NULL) && (In_Buff [0] != '\0')) { if (items + strlen (In_Buff) >= collect_size) { collect_size = collect_size * 2 + IN_LIMIT; collect_buffer = (char *) NTL0_ckrealloc (collect_buffer, collect_size * sizeof (char)); } strcat (collect_buffer, In_Buff); items += strlen (In_Buff); In_Buff [0] = '\0'; if (collect_buffer [items - 1] == '\n') { scan = collect_buffer; while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0')) { /* Allocate a record for the new matrix to be loaded from the file */ new_matrix = (summary_ptr) NTL0_ckalloc (sizeof (Summary_Struct)); /* Extract the site name from the string loaded from the file */ start_str = scan; while ((*scan != ' ') && (*scan != '\t')) scan++; if ((*scan == '\n') || (*scan == '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD site information line", 0); save_scan = *scan; *scan = '\0'; new_matrix -> site_name = NTL0_strsave (start_str); *scan = save_scan; while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan == '\n') || (*scan == '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD site information line", 0); /* Extract the threshold score from the string loaded from the file */ start_str = scan; while ((*scan != ' ') && (*scan != '\t')) { if ((*scan != '.') && ((*scan < '0') || (*scan > '9'))) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad threshold for IMD matrix", 0); else scan++; } if ((*scan == '\n') || (*scan == '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD site information line", 0); save_scan = *scan; *scan = '\0'; new_matrix -> cutoff = atof (start_str); *scan = save_scan; while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan == '\n') || (*scan == '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD site 
information line", 0); /* Extract the maximal score for the matrix from the loaded string */ start_str = scan; while ((*scan != ' ') && (*scan != '\t')) { if ((*scan != '.') && ((*scan < '0') || (*scan > '9'))) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad maximal score for IMD matrix", 0); else scan++; } if ((*scan == '\n') || (*scan == '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD site information line", 0); save_scan = *scan; *scan = '\0'; new_matrix -> max_score = atof (start_str); *scan = save_scan; while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan == '\n') || (*scan == '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD site information line", 0); /* Extract the sequence string for the matrix from the loaded string */ start_str = scan; while ((*scan != ' ') && (*scan != '\t')) { if ((*scan < 'A') || (*scan > 'Z')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad sequence for IMD motif", 0); else scan++; } if ((*scan == '\n') || (*scan == '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD site information line", 0); save_scan = *scan; *scan = '\0'; new_matrix -> sequence = NTL0_strsave (start_str); if ((new_matrix -> sequence) [0] == '\0') return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad sequence for IMD motif", 0); new_matrix -> seq_length = strlen (new_matrix -> sequence); *scan = save_scan; while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan == '\n') || (*scan == '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD site information line", 0); /* Extract matrix information number from the loaded description line */ if (*scan != 'M') return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD matrix ID", 0); start_str = scan; scan++; while ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) { if ((*scan < '0') || (*scan > '9')) return 
NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD matrix ID", 0); else scan++; } save_scan = *scan; *scan = '\0'; new_matrix -> reference = NTL0_strsave (start_str); *scan = save_scan; while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD site information line", 0); /* Initialize the matrix structure before loading its contents */ new_matrix -> matrix = (int **) NTL0_ckalloc ((new_matrix -> seq_length) * sizeof (int *)); for (columns = 0; columns < new_matrix -> seq_length; columns++) { (new_matrix -> matrix) [columns] = (int *) NTL0_ckalloc (dimension * sizeof (int)); } collect_buffer [0] = '\0'; items = 0; /* Proceed to load the lines describing the rows of the matrix */ rows = 0; while (rows < dimension) { if ((fgets (In_Buff, IN_LIMIT, *imd_file) == NULL) || (In_Buff [0] == '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD matrix specification", 0); if (items + strlen (In_Buff) >= collect_size) { collect_size = collect_size * 2 + IN_LIMIT; collect_buffer = (char *) NTL0_ckrealloc (collect_buffer, collect_size * sizeof (char)); } strcat (collect_buffer, In_Buff); items += strlen (In_Buff); In_Buff [0] = '\0'; if (collect_buffer [items - 1] == '\n') { rows++; /* Proceed to find the letter whose row is now being filled */ scan = collect_buffer; while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan < 'A') || (*scan > 'Z')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD matrix specification", 0); else row_index = Indices [(int) (*scan) - (int) 'A']; if (row_index < 0) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Illegal character row in IMD matrix", 0); /* Proceed to find the separator symbol for the start of the row */ scan++; while ((*scan == ' ') || (*scan == '\t')) scan++; if (*scan != SEPARATOR_SYMBOL) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, 
"Bad IMD matrix specification", 0); else scan++; /* Loop to collect all columns of the located matrix row */ for (columns = 0; columns < new_matrix -> seq_length; columns++) { while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan < '0') || (*scan > '9')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD matrix element", 0); start_str = scan; while ((*scan >= '0') && (*scan <= '9')) scan++; save_scan = *scan; *scan = '\0'; (new_matrix -> matrix) [columns] [row_index] = atoi (start_str); *scan = save_scan; if ((*scan != ' ') && (*scan != '\t') && (*scan != '\n') && (*scan != '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD matrix specification", 0); } while ((*scan == ' ') || (*scan == '\t')) scan++; if ((*scan != '\n') && (*scan != '\0')) return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD matrix specification", 0); collect_buffer [0] = '\0'; items = 0; } } /* Now connect the newly defined record into the list of known matrices */ new_matrix -> next = *matrices; *matrices = new_matrix; } } } if (In_Buff [0] != '\0') return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_STRUCTURE, "Bad IMD file layout", 0); free (collect_buffer); return NULL; } /*****************************************************************************/ /* */ /* Procedure: NTL2_LI_Convert_IMDfile */ /* */ /* Procedure for conversion of the list of summary matrices into a list of */ /* specificity matrices; returns the error structure, or NULL if the */ /* conversion has been properly done */ errind NTL2_LI_Convert_IMDfile (char *file_name, summary_ptr summaries, int dimension, int *Indices, double *Frequencies, imd_ptr *motifs) { summary_ptr original; imd_ptr new_matrix; bool found; int columns, rows, column_sum, temp_sum, index; double total_sum, ln_base, adjusted, normalized; *motifs = NULL; /* Loop for all records of the received summary matrix list to convert them */ /* into specificity matrix records, one at a time */ 
for (original = summaries; original != NULL; original = original -> next) { new_matrix = (imd_ptr) NTL0_ckalloc (sizeof (IMD_Struct)); new_matrix -> file_name = NTL0_strsave (file_name); new_matrix -> site_name = original -> site_name; original -> site_name = NULL; new_matrix -> bind_factor = NULL; new_matrix -> seq_length = original -> seq_length; new_matrix -> sequence = original -> sequence; original -> sequence = NULL; new_matrix -> cutoff = original -> cutoff; new_matrix -> max_score = original -> max_score; /* Do the conversion - process the summary matrix into specificity matrix */ new_matrix -> matrix = (double **) NTL0_ckalloc ((new_matrix -> seq_length) * sizeof (double *)); for (columns = 0; columns < new_matrix -> seq_length; columns++) (new_matrix -> matrix) [columns] = (double *) NTL0_ckalloc (dimension * sizeof (double)); /* Now make sure that all columns of the summary matrix have the same summ */ column_sum = 0; for (rows = 0; rows < dimension; rows++) column_sum += (original -> matrix) [0] [rows]; for (columns = 1; columns < original -> seq_length; columns++) { temp_sum = 0; for (rows = 0; rows < dimension; rows++) temp_sum += (original -> matrix) [columns] [rows]; if (temp_sum != column_sum) { sprintf (Error_Message, "Column sums of matrix '%s' of file '%s' do not agree", original -> site_name, new_matrix -> file_name); return NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_VALUE, Error_Message, 0); } } total_sum = (double) column_sum; ln_base = log (LOG_BASE); /* Used for binary logarithm in calculations */ /* Proceed to determine the elements of the specificity matrix row by row */ for (rows = 0; rows < dimension; rows++) { /* Determine index for this row first (letter to which row corresponds) */ found = FALSE; index = 0; while ((!found) && (index < 26)) { if (Indices [index] == rows) found = TRUE; else index++; } if (!found) { sprintf (Error_Message, "Row %d does not have an alphabet letter to which it corresponds", rows); return 
NTL2_LI_Assemble_Error (USER_ERROR, ERR_BAD_VALUE, Error_Message, 0); } for (columns = 0; columns < original -> seq_length; columns++) { adjusted = (double) ((original -> matrix) [columns] [rows]) + 0.01; normalized = adjusted / total_sum / Frequencies [index]; (new_matrix -> matrix) [columns] [rows] = log (normalized) / ln_base; } } new_matrix -> reference = original -> reference; original -> reference = NULL; new_matrix -> next = *motifs; *motifs = new_matrix; } summaries = NTL2_LI_Destroy_TempList (summaries); return NULL; } /*****************************************************************************/ /* */ /* Procedure: NTL2_LI_Alphabet_Size */ /* */ /* Service procedure for the retrieval of the size of the alphabet in use */ /* based on the contents of the received array of indices; returns the */ /* size of the used alphabet, if everything is OK, negative number otherws */ int NTL2_LI_Alphabet_Size (int *Indices) { int size, index; size = 0; for (index = 0; index < 26; index++) { if (Indices [index] >= 0) size++; } return size; } /*****************************************************************************/ /* */ /* Procedure: NTL2_LI_Destroy_TempList */ /* */ /* Service procedure for the destruction of the temporary list of the loaded */ /* summary matrices; returns NULL as the proper new value for the head of */ /* the list */ summary_ptr NTL2_LI_Destroy_TempList (summary_ptr list) { summary_ptr trash; int index; while (list != NULL) { trash = list; list = list -> next; if (trash -> site_name != NULL) free (trash -> site_name); if (trash -> sequence != NULL) free (trash -> sequence); if (trash -> matrix != NULL) { for (index = 0; index < trash -> seq_length; index++) { if ((trash -> matrix) [index] != NULL) free ((trash -> matrix) [index]); } free (trash -> matrix); } if (trash -> reference != NULL) free (trash -> reference); free (trash); } return NULL; } /*****************************************************************************/ /* */ /* Procedure: 
NTL2_LI_Assemble_Error */ /* */ /* Service procedure for assembling and returnning an error report, based on */ /* the values of the input parameters; returns the record with the report */ errind NTL2_LI_Assemble_Error (int severity, int code, char *comment, int description) { char *report; errind assembled; report = (char *) NTL0_ckalloc ( (strlen (comment) + strlen ("_Load_IMDfile: ") + 1) * sizeof (char)); sprintf (report, "_Load_IMDfile: %s", comment); assembled = NTL1_Error_Record (severity, code, report, description); free (report); return assembled; }