/* * findblocks.c - find multiple aligned blocks generated by a connected family * of pairwise alignments * * The command syntax is * findblocks [-k1] align1 [-k2] align2 ... * where align1, align2, etc., are pairwise alignment files in the format used * by the LAV family of alignment visualization tools. An optional integer * k1, k2, etc., gives the multiplicity of the alignment that it precedes, but * can only differ from 1 (the default) for self alignments. The aligned blocks * generated by the pairwise alignments are written to standard out. */ #include #include #define MAX_SEQ 100 #define MAX_ALIGN 100 /* -------------------------- Input Data ----------------------------------- */ struct { char *name; int self_align, lo_addr, hi_addr; } sequence[MAX_SEQ]; struct { int seq1, seq2, mtp; char *name; } alignment[MAX_ALIGN]; int n_seq = 0, n_align = 0; /* -------------------------- Constraint Graph ---------------------------- */ typedef struct edge *Edge_ptr; typedef struct edge { int con_pos, con_align; Edge_ptr next; } Edge; int n_node = 0, /* number of nodes in graph */ *seq_at_pos, /* index of sequence at a given position */ *pos_of_seq; /* position of sequence i in the list */ Edge_ptr *edges; /* edges[i] pts to list of edges constrainiing node i */ /* -------------------------------- Main Program ---------------------------- */ main(argc, argv) int argc; char *argv[]; { int i, k; if (argc < 2) fatalf("%s [-k1] alignment1 [-k2] alignment2 ...", argv[0]); /* read input */ for (i = 1; i < argc; ++i) { k = 1; if (argv[i][0] == '-' && (k = atoi(argv[i++]+1)) <= 0) fatal("Multiplicity must be positive."); read_align(k, argv[i]); } #ifdef DEBUG_INPUT for (i = 0; i < n_seq; ++i) printf("sequence %d: name = \"%s\", length = %d, self_align = %d\n", i, sequence[i].name, sequence[i].hi_addr, sequence[i].self_align); for (i = 0; i < n_align; ++i) printf("alignment %d: sequence %d <--> sequence %d; k = %d\n", i, alignment[i].seq1, alignment[i].seq2, alignment[i].mtp); #endif /* build the constraint graph */ build_cons(); #ifdef DEBUG_GRAPH { Edge_ptr ep; for (i = 0; i < n_node; ++i) { printf("seq %d at pos %d;", seq_at_pos[i], i); printf(" constraining (pos,alignment) ="); for (ep = edges[i]; ep; ep = ep->next) printf(" (%d,%d)", ep->con_pos, ep->con_align); putchar('\n'); } exit(0); } #endif /* build the lists of matches */ build_matches(); /* find and store the aligned blocks */ build_blocks(); /* print a lav-style prolog for the aligned blocks */ #ifndef NO_ALIGN print_top(argc, argv); #endif print_blocks(); exit(0); } /* ------------------------------ Input Routines ---------------------------- */ /* * Information listed above under "Input Data" is determined and n_node is set * to the number of nodes in the Constraint Graph (every distinct sequence has * a node and a self-alignment of multiplicity k adds k "secondary" nodes). */ read_align(k, file) char *file; { FILE *fp, *ckopen(); char *fgets(), line[100]; int i, j; if (n_align >= MAX_ALIGN) fatalf("Number of alignments cannot exceed %d\n", MAX_ALIGN); alignment[n_align].mtp = k; alignment[n_align].name = file; /* get the sequence numbers and perform some error checks */ fp = ckopen(file, "r"); while (fgets(line, 100, fp) != NULL && strncmp(line, "s {", 3)) ; i = alignment[n_align].seq1 = get_seq_info(fp, 1); if ((j = alignment[n_align].seq2 = get_seq_info(fp, 2)) < 0) fatalf("`%s' seems to be a multiple alignment.", file); if (i == j) if (sequence[i].self_align > -1) fatal("Cannot handle multiple self_alignments."); else { sequence[i].self_align = n_align; n_node += k; } else if (k > 1) fatal("Multiplicity > 1 only for self-alignments."); ++n_align; fclose(fp); } int get_seq_info(fp, which) FILE *fp; int which; { char *fgets(), line[100], *s, *t, *strsave(); int seq_nbr; fgets(line, 100, fp); for (s = line; *s == ' '; ++s) ; if (*s != '"') fatal("Expecting \"\n"); for (t = ++s; *t && *t != '"'; ++t) ; if (*t != '"') fatal("Could not find end of sequence name."); *t = '\0'; for (seq_nbr = 0; seq_nbr < n_seq; ++seq_nbr) if (strsame(sequence[seq_nbr].name, s)) break; if (seq_nbr == n_seq) { /* first appearance of this sequence name */ if (n_seq >= MAX_SEQ) fatalf("Number of sequences cannot exceed %d\n", MAX_SEQ); sequence[n_seq].name = strsave(s); sequence[n_seq].self_align = -1; ++n_seq; ++n_node; } for (s = t+1; *s == ' '; ++s) ; if (!isdigit(*s)) fatal("Could not find initial subscript."); sequence[seq_nbr].lo_addr = atoi(s); while (isdigit(*s)) ++s; if (*s != ' ') fatal("Could not find blank at end of initial subscript."); while (*++s == ' ') ; if (!isdigit(*s)) fatal("Could not find final subscript."); sequence[seq_nbr].hi_addr = atoi(s); if (which == 2) { /* check if this is a pairwise alignment */ fgets(line, 100, fp); for (s = line; *s == ' ' || *s == '\t'; ++s) ; if (*s != '}') return -1; } return seq_nbr; } /* ------------------------- Build the Constraint Graph --------------------- */ /* * The sequences (nodes) are ordered, beginning with the first sequence of the * first alignment. Each node after the first node has at least one edge * (alignment) from an earlier sequence (thus some constraints are placed on * the values to be tried at that node). For each node n we save the list of * pairs (pos, alignment) of these constraining conditions, where pos < n is * the position in the list of the constraining neighbor. If the sequence at * position p has a self-alignment of multiplicity k, then "secondary" nodes * for the sequence are placed at positions p + j for 1 <= j <= k. Inedges * and outedges between a secondary node and nodes for other sequences are the * same as for the sequence's "primary" node; in addition, a node representing * a sequence has an inedge from every earlier node for that sequence. */ typedef struct vertex *Align_ptr; typedef struct vertex { int algn, nbr; Align_ptr link; } Align; build_cons() { Align_ptr ap, *aligns_of_seq; Edge_ptr ep; int i, j, k, n, neighbor, sa, nodes, p, n_stack, pn, s1, s2, stack[MAX_SEQ], unreached[MAX_SEQ]; char *ckalloc(); /* for each sequence, build the list of alignments involving it */ aligns_of_seq = (Align_ptr *) ckalloc(n_seq*sizeof(Align_ptr)); for (i = 0; i < n_seq; ++i) aligns_of_seq[i] = NULL; for (i = 0; i < n_align; ++i) /* ignore self-alignments */ if ((s1 = alignment[i].seq1) != (s2 = alignment[i].seq2)) { ap = (Align_ptr) ckalloc(sizeof(Align)); ap->algn = i; ap->nbr = s2; ap->link = aligns_of_seq[s1]; aligns_of_seq[s1] = ap; ap = (Align_ptr) ckalloc(sizeof(Align)); ap->algn = i; ap->nbr = s1; ap->link = aligns_of_seq[s2]; aligns_of_seq[s2] = ap; } /* order nodes of constraint graph; start with sequence 0 at node 0 */ pos_of_seq = (int *) ckalloc(n_seq*sizeof(int)); seq_at_pos = (int *) ckalloc(n_node*sizeof(int)); edges = (Edge_ptr *) ckalloc(n_node*sizeof(Edge_ptr)); for (i = 0; i < n_seq; ++i) { unreached[i] = 1; pos_of_seq[i] = -1; } nodes = 0; /* number of nodes currently in graph */ stack[0] = 0; /* stack node 0 */ unreached[0] = 0; n_stack = 1; /* number of nodes currently in stack */ while (n_stack) { n = stack[--n_stack]; /* POP */ pos_of_seq[n] = pn = nodes++; seq_at_pos[pn] = n; edges[pn] = NULL; /* loop over neighbors of n */ for (ap = aligns_of_seq[n]; ap; ap = ap->link) /* if the neighbor is an earlier node */ if ((p = pos_of_seq[neighbor = ap->nbr]) > -1) { /* constraining edge from neighbor to n */ add_con(p, pn, ap->algn); /* + edges from neighbor's secondary nodes */ if ((sa = sequence[neighbor].self_align) >= 0) for (k = 1; k <= alignment[sa].mtp; ++k) add_con(p+k, pn, ap->algn); } else if (unreached[neighbor]) { /* add neighbor to the stack of marked nodes */ unreached[neighbor] = 0; stack[n_stack++] = neighbor; } /* if n is self aligned */ if ((sa = sequence[n].self_align) >= 0) /* next nodes are n's secondary nodes */ for (k = 1; k <= alignment[sa].mtp; ++k, ++nodes) { edges[nodes] = NULL; seq_at_pos[nodes] = n; /* constraining edges from earlier copies */ for (j = 0; j < k; ++j) add_con(pn+j, nodes, sa); /* and duplicates of constraints on primary */ for (ep = edges[pn]; ep; ep = ep->next) add_con(ep->con_pos, nodes, ep->con_align); } } if (nodes != n_node) fatal("Graph is disconnected."); } add_con(from, to, align) int from, to, align; { Edge_ptr ep; char *ckalloc(); ep = (Edge_ptr) ckalloc(sizeof(Edge)); ep->con_pos = from; ep->con_align = align; ep->next = edges[to]; edges[to] = ep; } /* -------------------------- Recorded Matches ----------------------------- */ typedef struct match_loc *Match_ptr; typedef struct match_loc { int loc; Match_ptr next; } Match; Match_ptr **Mat; /* holds matching position */ /* ------------------------ Build the Lists of Matches ---------------------- */ /* * The node ordering constructed for the Constraint Graph determines, for * each edge (alignment), which sequence comes first (i.e., whose values * will constrain values that we try for the other sequence). For each * alignment and each position in the constraining sequences, we make a * linked list of permissible positions in the constrained sequence. * Positions within the list are ordered, which simplifies the process of * intersecting a collection of these lists. */ build_matches() { int i; char *ckalloc(); Mat = (Match_ptr **) ckalloc(n_align*sizeof(Match_ptr*)); for (i = 0; i < n_align; ++i) build_match(i); } build_match(algn) /* build match structure for alignment algn */ int algn; { int i, j, k, reverse, many, seqnum, s1, s2, x1, x2, y1, y2; char *ckalloc(); FILE *ap, *ckopen(); Match_ptr mp, p, *Mpp; s1 = alignment[algn].seq1; s2 = alignment[algn].seq2; reverse = (pos_of_seq[s1] > pos_of_seq[s2]); seqnum = (reverse ? s2 : s1); many = sequence[seqnum].hi_addr; Mpp = (Match_ptr *)ckalloc(many*sizeof(Match_ptr)); Mat[algn] = --Mpp; /* so subscripts start at 1 */ for (i = 1; i <= many; ++i) Mpp[i] = NULL; ap = ckopen(alignment[algn].name, "r"); while (get_diag(ap, &x1, &y1, &x2, &y2)) { if (s1 == s2 && x1 == 1 && y1 == x1 && x2 == many && y2 == x2) continue; /* ignore main diagonal in self-comp */ if (reverse) { i = y1; j = x1; k = y2 - y1; } else { i = x1; j = y1; k = x2 - x1; } if (i < 1 || i > many) fatal("Alignment involves illegal position."); for ( ; k >= 0; ++i, ++j, --k) { mp = (Match_ptr) ckalloc(sizeof(Match)); mp->loc = j; if (Mpp[i] == NULL || Mpp[i]->loc >= j) { mp->next = Mpp[i]; Mpp[i] = mp; } else { /* insert in sorted order */ for (p=Mpp[i]; p->next && p->next->loc < j; p = p->next) ; mp->next = p->next; p->next = mp; } } } fclose(ap); } /* read a line from a pairwise alignment file */ int get_diag(ap, x1p, y1p, x2p, y2p) FILE *ap; int *x1p, *y1p, *x2p, *y2p; { char *s, line[100]; while (fgets(line, 100, ap) != NULL) { for (s = line; *s == ' '; ++s) ; if (*s != 'l') continue; while (*s && *s != ' ') ++s; sscanf(s, "%d %d %d %d", x1p, y1p, x2p, y2p); return 1; } return 0; } /* ---------- Find Aligned Blocks Generated by Pairwise Alignments ---------- */ int *col; /* array holding the consistent column */ Match_ptr **MP; /* MP[i] pts to array used at i-th level of recursion */ build_blocks() { int i, n_cons; char *ckalloc(); Edge_ptr ep; col = (int *)ckalloc(n_node*sizeof(int)); MP = (Match_ptr **) ckalloc(n_node*sizeof(Match_ptr*)); for (i = 1; i < n_node; ++i) { /* count number of constraints (in-edges) for node i */ for (n_cons = 0, ep = edges[i]; ep; ++n_cons, ep = ep->next) ; MP[i] = (Match_ptr *) ckalloc(n_cons*sizeof(Match_ptr)); } for (i = sequence[0].lo_addr; i <= sequence[0].hi_addr; ++i) { col[0] = i; enumerate(1); cull_blocks(i); } cull_blocks(sequence[0].hi_addr + 1); } enumerate(pos) /* try all feasible values at node pos of the Constraint Graph */ { int i, n_cons, diff_first; Match_ptr *Mp; Edge_ptr ep; if (pos >= n_node) { save_col(); return; } Mp = MP[pos]; /* Mp[i] pts to next value consistent with value on node i-th constraining position */ for (n_cons = 0, ep = edges[pos]; ep; ++n_cons, ep = ep->next) if ((Mp[n_cons] = Mat[ep->con_align][col[ep->con_pos]]) == NULL) return; for ( ; ; ) { /* loop over feasible values for this position */ for (diff_first = 1; diff_first; ) { /* 'til same 1st element */ diff_first = 0; for (i = 0; i < n_cons-1; ++i) if (Mp[i]->loc != Mp[i+1]->loc) { diff_first = 1; if (Mp[i]->loc < Mp[i+1]->loc) { if ((Mp[i] = Mp[i]->next) == NULL) return; } else if ((Mp[i+1] = Mp[i+1]->next) == NULL) return; } } col[pos] = Mp[0]->loc; enumerate(pos+1); /* delete first element of every list */ for (i = 0; i < n_cons; ++i) if ((Mp[i] = Mp[i]->next) == NULL) return; } } /* -------------------- Print Top of Generated Alignment File --------------- */ print_top (argc, argv) int argc; char *argv[]; { int i, j; FILE *ap, *ckopen(); char *s, line[100], *ckalloc(); ap = ckopen(alignment[0].name, "r"); while (fgets(line, 100, ap) != NULL) { for (s = line; *s == ' '; ++s) ; if (strncmp(s, "a {", 3) == 0) break; if (strncmp(s, "s {", 3) == 0) { printf("s {\n"); for (i = 0; i < n_node; ++i) { j = seq_at_pos[i]; printf(" \"%s\" %d %d\n", sequence[j].name, sequence[j].lo_addr, sequence[j].hi_addr); } printf("}\n"); while (fgets(line, 100, ap) != NULL) if (line[0] == '}') break; } else if (strncmp(s, "d {", 3)) fputs(line, stdout); else { printf("d {\n \"%s", "blocks"); for (i = 1; i < argc; ++i) { if (i%3 == 0) printf("\n "); printf(" %s", argv[i]); } printf("\"\n}\n"); while (fgets(line, 100, ap) != NULL) if (line[0] == '}') break; } } fclose(ap); } /* ------------------- Handle Blocks of Consistent Columns ------------------ */ typedef struct cons_col *Col_ptr; typedef struct cons_col { int *start; int span; Col_ptr next; } Cons_col; Col_ptr first_col, finished_col; save_col() /* record a consistent column */ { int i, sp, *st; Col_ptr cp; char *ckalloc(); /* is it extending a diagonal run of columns? */ for (cp = first_col; cp; cp = cp->next) { st = cp->start; sp = cp->span+1; for (i = 0; i < n_node; ++i) if (col[i] != st[i]+sp) break; if (i == n_node) break; } if (cp) { /* found the diagonal run */ cp->span++; return; } cp = (Col_ptr) ckalloc(sizeof(Cons_col)); cp->start = (int *)ckalloc(n_node*sizeof(int)); for (i = 0; i < n_node; ++i) cp->start[i] = col[i]; cp->span = 0; cp->next = first_col; first_col = cp; } cull_blocks(i) /* set aside blocks not ending at loc i of sequence 0 */ { Col_ptr cp, prev; for (prev = NULL, cp = first_col; cp; ) if (cp->start[0] + cp->span != i) if (prev) { prev->next = cp->next; cp->next = finished_col; finished_col = cp; cp = prev->next; } else { first_col = first_col->next; cp->next = finished_col; finished_col = cp; cp = first_col; } else { prev = cp; cp = cp->next; } } print_blocks() { Col_ptr cp; int i, *st, sp; for (cp = finished_col; cp; cp = cp->next) { st = cp->start; sp = cp->span; #ifndef NO_ALIGN printf("a {\n s 0.0\n b"); for (i = 0; i < n_node; ++i) printf(" %d", st[i]); printf("\n e"); for (i = 0; i < n_node; ++i) printf(" %d", st[i] + sp); putchar('\n'); #endif printf(" l"); for (i = 0; i < n_node; ++i) printf(" %d", st[i]); for (i = 0; i < n_node; ++i) printf(" %d", st[i] + sp); printf(" 0.0\n"); #ifndef NO_ALIGN printf("}\n"); #endif } } /* -------------------------- Utility routines ------------------------------ */ /* fatal - print message and die */ fatal(msg) char *msg; { fprintf(stderr, "%s\n", msg); exit(1); } /* fatalf - format message, print it, and die */ fatalf(msg, val) char *msg, *val; { fprintf(stderr, msg, val); putc('\n', stderr); exit(1); } /* ckalloc - allocate space; check for success */ char *ckalloc(amount) int amount; { char *malloc(), *p; if ((p = malloc( (unsigned) amount)) == NULL) fatal("Ran out of memory."); return(p); } /* ckopen - open file; check for success */ FILE *ckopen(name, mode) char *name, *mode; { FILE *fopen(), *fp; if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); return(fp); } /* strsame - tell whether two strings are identical */ int strsame(s, t) char *s, *t; { return(strcmp(s, t) == 0); } /* strsave - save string s somewhere; return address */ char *strsave(s) char *s; { char *ckalloc(), *p, *strcpy(); p = ckalloc(strlen(s)+1); /* +1 to hold '\0' */ return(strcpy(p, s)); }