/* * blocks2.c - find multiple aligned blocks generated by a biconnected subset of * a given family of pairwise alignments * * The command syntax is * blocks2 [-s] [-b] [+]align1 [+]align2 ... * where align1, align2, etc., are pairwise alignment files in the format used * by the LAV family of alignment visualization tools. The aligned blocks * generated by the pairwise alignments are written to standard out. * * Flags: * + before an alignment forces the alignment to be satisfied * -s add a + to the first alignment and produce only the projection * of the biconnected position assignment onto the first two * sequences (i.e. the output data consist of 2-tuples). * -b use bridges instead of articulation points when deciding * biconnectness. (i.e. use bridge-free connectness.) */ #include #include #define BR 0 #define ART 1 #define MAX_SEQ 100 #define MAX_ALIGN 100 #define MAX_APP 1000 /* max alignment points for a single point*/ #define min(x,y) ((x) < (y) ? (x) : (y)) /* -------------------------- Input Data ----------------------------------- */ struct { char *name; int lo_addr, hi_addr; } sequence[MAX_SEQ]; struct { int seq1, seq2, force; char *name; } alignment[MAX_ALIGN]; int n_seq = 0, n_align = 0; /* -------------------------- Constraint Graph ---------------------------- */ typedef struct edge *Edge_ptr; typedef struct edge { int con_pos, con_align; Edge_ptr next; } Edge; int s_flag = 0, /* 0 default; 1 = flag s.*/ brdg_artpt = ART, /*bridge or articulation point*/ n_node = 0, /* number of nodes in graph */ *seq_at_pos, /* index of sequence at a given position */ *pos_of_seq; /* position of sequence i in the list */ Edge_ptr *edges; /* edges[i] pts to list of edges constraining node i */ /* -------------------------------- Main Program ---------------------------- */ main(argc, argv) int argc; char *argv[]; { int i, k; if (argc < 2) fatalf("%s [-s] [-b] [+]alignment1 [+]alignment2 ...", argv[0]); /* read input */ for (i = 1; i < argc; ++i) { if (argv[i][0] == '-') if (argv[i][1] == 's') { s_flag = 1; alignment[0].force = 1; continue; } else if (argv[i][1] == 'b') { brdg_artpt = BR; continue; } read_align(argv[i]); } #ifdef DEBUG_INPUT for (i = 0; i < n_seq; ++i) printf("sequence %d: name = \"%s\", length = %d, \n", i, sequence[i].name, sequence[i].hi_addr); for (i = 0; i < n_align; ++i) printf("alignment %d: sequence %d <--> sequence %d.\n", i, alignment[i].seq1, alignment[i].seq2); #endif /* build the constraint graph */ build_cons(); #ifdef DEBUG_GRAPH { Edge_ptr ep; for (i = 0; i < n_node; ++i) { printf("seq %d at pos %d;", seq_at_pos[i], i); printf(" constraining (pos,alignment) ="); for (ep = edges[i]; ep; ep = ep->next) printf(" (%d,%d)", ep->con_pos, ep->con_align); putchar('\n'); } exit(0); } #endif /* build the lists of matches */ build_matches(); /* printf(" Finish build_matches"); */ /* printf("%d", n_node);*/ /* find and store the aligned blocks */ gen_assignment(); /* printf(" Finish gen_assignment"); */ /* print a lav-style prolog for the aligned blocks */ #ifndef NO_ALIGN print_top(argc, argv); #endif print_blocks(); exit(0); } /* ------------------------------ Input Routines ---------------------------- */ /* * Information listed above under "Input Data" is determined and n_node is set * to the number of nodes in the Constraint Graph (every distinct sequence has * a node and a self-alignment of multiplicity k adds k "secondary" nodes). */ read_align(file1) char *file1; { FILE *fp, *ckopen(); char *fgets(), line[100], *file; int i, j; if (n_align >= MAX_ALIGN) fatalf("Number of alignments cannot exceed %d\n", MAX_ALIGN); if (file1[0] == '+') {file = &file1[1]; alignment[n_align].force = 1;} else file = file1; alignment[n_align].name = file; /* get the sequence numbers and perform some error checks */ fp = ckopen(file, "r"); while (fgets(line, 100, fp) != NULL && strncmp(line, "s {", 3)) ; i = alignment[n_align].seq1 = get_seq_info(fp, 1); if ((j = alignment[n_align].seq2 = get_seq_info(fp, 2)) < 0) fatalf("`%s' seems to be a multiple alignment.", file); if (i == j) fatal("Cannot handle self_alignments."); ++n_align; fclose(fp); } int get_seq_info(fp, which) FILE *fp; int which; { char *fgets(), line[100], *s, *t, *name, *strsave(); int seq_nbr, low, high; fgets(line, 100, fp); for (s = line; *s == ' '; ++s) ; if (*s != '"') fatal("Expecting \"\n"); for (name = t = ++s; *t && *t != '"'; ++t) ; if (*t != '"') fatal("Could not find end of sequence name."); *t = '\0'; for (s = t+1; *s == ' '; ++s) ; if (!isdigit(*s)) fatal("Could not find initial subscript."); low = atoi(s); while (isdigit(*s)) ++s; if (*s != ' ') fatal("Could not find blank at end of initial subscript."); while (*++s == ' ') ; if (!isdigit(*s)) fatal("Could not find final subscript."); high = atoi(s); for (seq_nbr = 0; seq_nbr < n_seq; ++seq_nbr) if (strsame(sequence[seq_nbr].name, name) && sequence[seq_nbr].lo_addr == low && sequence[seq_nbr].hi_addr == high) break; if (seq_nbr == n_seq) { /* first appearance of this sequence name */ if (n_seq >= MAX_SEQ) fatalf("Number of sequences cannot exceed %d\n", MAX_SEQ); sequence[n_seq].name = strsave(name); sequence[n_seq].lo_addr = low; sequence[n_seq].hi_addr = high; ++n_seq; ++n_node; } if (which == 2) { /* check if this is a pairwise alignment */ fgets(line, 100, fp); for (s = line; *s == ' ' || *s == '\t'; ++s) ; if (*s != '}') return -1; } return seq_nbr; } /* ------------------------- Build the Constraint Graph --------------------- */ /* * The sequences (nodes) are ordered, beginning with the first sequence of the * first alignment. Each node after the first node has at least one edge * (alignment) from an earlier sequence (thus some constraints are placed on * the values to be tried at that node). For each node n we save the list of * pairs (pos, alignment) of these constraining conditions, where pos < n is * the position in the list of the constraining neighbor. If the sequence at * position p has a self-alignment of multiplicity k, then "secondary" nodes * for the sequence are placed at positions p + j for 1 <= j <= k. Inedges * and outedges between a secondary node and nodes for other sequences are the * same as for the sequence's "primary" node; in addition, a node representing * a sequence has an inedge from every earlier node for that sequence. */ typedef struct vertex *Align_ptr; typedef struct vertex { int algn, nbr; Align_ptr link; } Align; Align_ptr *aligns_of_seq; build_cons() { Align_ptr ap; Edge_ptr ep; int i, j, k, n, neighbor, sa, nodes, p, n_stack, pn, s1, s2, stack[MAX_SEQ], unreached[MAX_SEQ]; char *ckalloc(); /* for each sequence, build the list of alignments involving it */ aligns_of_seq = (Align_ptr *) ckalloc(n_seq*sizeof(Align_ptr)); for (i = 0; i < n_seq; ++i) aligns_of_seq[i] = NULL; for (i = 0; i < n_align; ++i) { s1 = alignment[i].seq1; s2 = alignment[i].seq2; ap = (Align_ptr) ckalloc(sizeof(Align)); ap->algn = i; ap->nbr = s2; ap->link = aligns_of_seq[s1]; aligns_of_seq[s1] = ap; ap = (Align_ptr) ckalloc(sizeof(Align)); ap->algn = i; ap->nbr = s1; ap->link = aligns_of_seq[s2]; aligns_of_seq[s2] = ap; } /* order nodes of constraint graph; start with sequence 0 at node 0 */ pos_of_seq = (int *) ckalloc(n_seq*sizeof(int)); seq_at_pos = (int *) ckalloc(n_node*sizeof(int)); edges = (Edge_ptr *) ckalloc(n_node*sizeof(Edge_ptr)); for (i = 0; i < n_seq; ++i) { unreached[i] = 1; pos_of_seq[i] = -1; } nodes = 0; /* number of nodes currently in graph */ stack[0] = 0; /* stack node 0 */ unreached[0] = 0; n_stack = 1; /* number of nodes currently in stack */ while (n_stack) { n = stack[--n_stack]; /* POP */ pos_of_seq[n] = pn = nodes++; seq_at_pos[pn] = n; edges[pn] = NULL; /* loop over neighbors of n */ for (ap = aligns_of_seq[n]; ap; ap = ap->link) /* if the neighbor is an earlier node */ if ((p = pos_of_seq[neighbor = ap->nbr]) > -1) { /* constraining edge from neighbor to n */ add_con(p, pn, ap->algn); } else if (unreached[neighbor]) { /* add neighbor to the stack of marked nodes */ unreached[neighbor] = 0; stack[n_stack++] = neighbor; } } if (nodes != n_node) fatal("Graph is disconnected."); } add_con(from, to, align) int from, to, align; { Edge_ptr ep; char *ckalloc(); ep = (Edge_ptr) ckalloc(sizeof(Edge)); ep->con_pos = from; ep->con_align = align; ep->next = edges[to]; edges[to] = ep; } /* -------------------------- Recorded Matches ----------------------------- */ typedef struct match_loc *Match_ptr; typedef struct match_loc { int loc; Match_ptr next; } Match; Match_ptr **Mat1, **Mat2; /* holds matching position */ /* ------------------------ Build the Lists of Matches ---------------------- */ /* * The node ordering constructed for the Constraint Graph determines, for * each edge (alignment), which sequence comes first (i.e., whose values * will constrain values that we try for the other sequence). For each * alignment and each position in the constraining sequences, we make a * linked list of permissible positions in the constrained sequence. * Positions within the list are ordered, which simplifies the process of * intersecting a collection of these lists. */ build_matches() { int i; char *ckalloc(); Mat1 = (Match_ptr **) ckalloc(n_align*sizeof(Match_ptr*)); Mat2 = (Match_ptr **) ckalloc(n_align*sizeof(Match_ptr*)); for (i = 0; i < n_align; ++i) build_match(i); } build_match(algn) /* build match structure for alignment algn */ int algn; { int i, j, k, many1, many2, many, l, s1, s2, x1, x2, y1, y2; char *ckalloc(); FILE *ap, *ckopen(); Match_ptr mp, p, *Mpp1, *Mpp2, *Mpp; s1 = alignment[algn].seq1; s2 = alignment[algn].seq2; many1 = sequence[s1].hi_addr - sequence[s1].lo_addr + 1; many2 = sequence[s2].hi_addr - sequence[s2].lo_addr + 1; Mpp1 = (Match_ptr *)ckalloc(many1*sizeof(Match_ptr)); Mpp2 = (Match_ptr *)ckalloc(many2*sizeof(Match_ptr)); Mat1[algn] = --Mpp1; /* so subscripts start at 1 */ Mat2[algn] = --Mpp2; for (i = 1; i <= many1; ++i) Mpp1[i] = NULL; for (i = 1; i <= many2; ++i) Mpp2[i] = NULL; ap = ckopen(alignment[algn].name, "r"); many = many1; while (get_diag(ap, &x1, &y1, &x2, &y2)) { if (s1 == s2 && x1 == 1 && y1 == x1 && x2 == many && y2 == x2) continue; /* ignore main diagonal in self-comp */ i = y1; j = x1; k = y2 - y1; Mpp = Mpp2; many = many2; for (l = 0; l < 2; ++l){ if (i < 1 || i > many) fatal("Alignment involves illegal position."); for ( ; k >= 0; ++i, ++j, --k) { mp = (Match_ptr) ckalloc(sizeof(Match)); mp->loc = j; if (Mpp[i] == NULL || Mpp[i]->loc >= j) { mp->next = Mpp[i]; Mpp[i] = mp; } else { /* insert in sorted order */ for (p=Mpp[i]; p->next && p->next->loc < j; p = p->next) ; mp->next = p->next; p->next = mp; } } i = x1; j = y1; k = x2 - x1; Mpp = Mpp1; many = many1; } } fclose(ap); } /* read a line from a pairwise alignment file */ int get_diag(ap, x1p, y1p, x2p, y2p) FILE *ap; int *x1p, *y1p, *x2p, *y2p; { char *s, line[100]; while (fgets(line, 100, ap) != NULL) { for (s = line; *s == ' '; ++s) ; if (*s != 'l') continue; while (*s && *s != ' ') ++s; sscanf(s, "%d %d %d %d", x1p, y1p, x2p, y2p); return 1; } return 0; } /* ---------- Find All The Consistent Columns Respect To A DFS Tree ---------- */ typedef struct Tree *Tree_ptr; typedef struct Tree { int s, p, visit, i; /*s: sequence, p: position, i: the level */ Tree_ptr next, first, parent; /*These three pointer is standard representation of a tree, with first to its first child and next to its brother. */ Align_ptr ap; /* This is used to store the next alignment to be searched */ } Tree_node; Tree_ptr *Tr; /* Here we use the traditional notion of Low[v], i.e., the minimum of v's depth and the depth of all vertices reached by a non-tree edge from v or a successor of v. (The paper uses a slightly different notion.) The program does this so that it can also locate bridges. */ int **low, *depth, fd; /* fd = 1 after finding a position assignment (for -s flag) */ /* -------------- Data structure for blocked pairs (s, p) ------------------- */ /* Block(s,p) = lowest recursion level where (s,p) is blocked. (s,p) is added to the set of blocked nodes while recursion is at a certain level, but the node is not to be considered as blocked later in the computation at that level. (If space becomes a problem, we might try declaring "char **Blocked".) */ int **Block; typedef struct { int n,p;} Pts; /* bl and nb support fast clean-up when recursion backtracks */ Pts **bl; /* bl[level][i] = i-th blocked point at given recursion level */ int *nb; /* nb[level] = number of blocked pts at given recursion level */ /* -------------------------------------------------------------------------- */ gen_assignment() { int i, j, s, many; char *ckalloc(); Align_ptr ap, app; bl = (Pts **) ckalloc(n_node*sizeof(Pts*)); nb = (int *) ckalloc(n_node*sizeof(int)); for (i=0; i < n_node; ++i){ bl[i] = (Pts *) ckalloc(MAX_APP*sizeof(Pts)); nb[i] = 0; } nb--; low = (int **) ckalloc((n_node+1)*sizeof(int *)); depth = (int *) ckalloc(n_node*sizeof(int)); for (i=1; i <= n_node; i++) low[i] = (int *) ckalloc(n_node*sizeof(int)); Block = (int **) ckalloc(n_node*sizeof(int*)); for (i = 1; i < n_node; ++i){ s = seq_at_pos[i]; many = sequence[s].hi_addr; Block[i] = (int *) ckalloc(many*sizeof(int)); --Block[i]; for (j=1; j <= many; ++j) Block[i][j] = n_node; } Tr = (Tree_ptr *) ckalloc(n_node*sizeof( Tree_ptr )); for (i = 0; i < n_node; ++i) { Tr[i] = (Tree_ptr) ckalloc(sizeof(Tree_node)); Tr[i]->s = seq_at_pos[i]; Tr[i]->visit = 0; Tr[i]->i = i; Tr[i]->ap = NULL; Tr[i]->next = Tr[i]->parent = Tr[i]->first = NULL; } s = seq_at_pos[0]; j = sequence[s].hi_addr - sequence[s].lo_addr+1; Tr[0]->visit = 1; depth[0] = 1; if (s_flag) { /* When s flag is raised, move the first alignment * which is originally at the end of the linked list * to the head of the list */ ap = aligns_of_seq[Tr[0]->s]; if (ap) { app = NULL; for (; ap->link; ap = ap->link) app = ap; if (app) { app->link = NULL; ap->link = aligns_of_seq[Tr[0]->s]; aligns_of_seq[Tr[0]->s] = ap; } } } for (i = 0; i < n_node; ++i) low[1][i] = 0; for (i = 1; i <= j; ++i) { fd = 0; Tr[0]->p = i; DFT_enum2(0,1); cull_blocks(i); } cull_blocks(j+1); } DFT_enum2(t, n) /* tree node t = right-most leaf; n = level of recursion */ int t, n; { Match_ptr aq; Align_ptr ap; int pos, s1, s2, ss, i, j, s, ps, loc, old_t, u, bridge, k, find, notblocked(); if (n == 2) fd = 0; bridge = 0; if (n > 1) /*copy low*/ for (i=0; i < n_node; ++i) low[n][i] = low[n-1][i]; low[n][t] = depth[t]; ps = Tr[t]->p; s = Tr[t]->s; ap = aligns_of_seq[s]; for (; ap; ap = ap->link) { i = ap->algn; s1 = alignment[i].seq1; s2 = alignment[i].seq2; ss = ((s == s1) ? s2 : s1); pos = pos_of_seq[ss]; find = 0; if ((Tr[pos]->visit ==1) && (Tr[pos] != Tr[t]->parent)) { ps = Tr[t]->p; aq = ((s == s1) ? Mat1[i][ps] : Mat2[i][ps]); for (; aq; aq = aq->next) if (aq->loc == Tr[pos]->p) { find = 1; low[n][t] = min(low[n][t], depth[pos]); break; } if (!find && alignment[i].force) goto quit; } } if (n == n_node) { /* Found a tree with all the sequences. * Now need to propagate the low value up and at the same * time look for an articulation point or bridge */ u = t; while (u != 0) { if (brdg_artpt == BR || depth[u] == depth[0]+1) { if (low[n][u] == depth[u]) bridge = 1; } else if (low[n][u] >= depth[u]-1) bridge = 1; if (bridge) break; old_t = pos_of_seq[Tr[u]->parent->s]; low[n][old_t] = min(low[n][old_t], low[n][u]); u = old_t; } if (!bridge) save_tree(); } else { ap = aligns_of_seq[s]; for (; ap; ap = ap->link) { i = ap->algn; s1 = alignment[i].seq1; s2 = alignment[i].seq2; ss = ((s == s1) ? s2 : s1); pos = pos_of_seq[ss]; if (Tr[pos]->visit == 0) { ps = Tr[t]->p; aq = ((s == s1) ? Mat1[i][ps] : Mat2[i][ps]); for (; aq; aq = aq->next) { loc = aq->loc; if (notblocked(pos, loc, n)) { insert(t, pos, loc, ap); DFT_enum2(pos, n+1); block(pos, loc, n); } } if (alignment[i].force) goto quit; } if (n == 1 && s_flag) break; } if (!s_flag || !fd) for (old_t = t; old_t != 0;) { if (brdg_artpt == BR) { if (low[n][old_t] == depth[old_t]) break; } else if (low[n][old_t] >= depth[old_t]-1) break; ap = Tr[old_t]->ap; s = Tr[old_t]->parent->s; u = pos_of_seq[s]; low[n][u] = min(low[n][u], low[n][old_t]); old_t = u; for (ap = ap->link; ap; ap = ap->link) { i = ap->algn; s1 = alignment[i].seq1; s2 = alignment[i].seq2; ss = ((s == s1) ? s2 : s1); pos = pos_of_seq[ss]; if (Tr[pos]->visit == 0) { ps = Tr[old_t]->p; aq = ((s == s1) ? Mat1[i][ps] : Mat2[i][ps]); for (; aq; aq = aq->next) { loc = aq->loc; if (notblocked(pos, loc, n)) { insert(old_t, pos, loc, ap); DFT_enum2(pos, n+1); block(pos, loc, n); } } /* for aq*/ if (alignment[i].force) goto quit; } } /* for ap */ }/*for old_t*/ } /*else*/ quit: if (t != 0) { old_t = pos_of_seq[Tr[t]->parent->s]; delete(old_t,t); } /* renew the block, unblock all the entries blocked * through this level. */ renewblock(n); } insert(pa, n, p, ap) /* insert a node into the tree. The node is Tr[n], the parent * is Tr[pa] and the position is p, and the next alignment is ap. */ int pa, n, p; Align_ptr ap; { Tr[n]->parent = Tr[pa]; Tr[n]->next = Tr[pa]->first; Tr[pa]->first = Tr[n]; Tr[n]->p = p; Tr[n]->visit = 1; Tr[n]->ap = ap; depth[n] = depth[pa]+1; } delete(p,s) /* delete a node Tr[s] from the tree. The parent of the node * is Tr[p]. */ int p, s; { Tree_ptr ap; if (p == -1) return; Tr[s]->visit = 0; if (Tr[s] == Tr[p]->first) Tr[p]->first = Tr[s]->next; else { for (ap = Tr[p]->first; ap->next != Tr[s]; ap = ap->next) ; ap->next = Tr[s]->next; } Tr[s]->parent = Tr[s]->first = Tr[s]->next = NULL; Tr[s]->p = 0; Tr[s]->ap = NULL; } block(node, pos, level) int node, pos, level; { int n; if (nb[level] > MAX_APP) fatal("Too many alignments for a single point."); n = nb[level]++; bl[level][n].n = node; bl[level][n].p = pos; Block[node][pos] = level; } int notblocked(node, pos, level) int node, pos, level; { if (s_flag && fd && level >= 2) return(0); /* force backtrack */ return Block[node][pos] > level; } renewblock(level) int level; { int i; for (i=0; i < nb[level]; ++i) Block[bl[level][i].n][bl[level][i].p] = n_node; nb[level] = 0; } /* -------------------- Print Top of Generated Alignment File --------------- */ print_top (argc, argv) int argc; char *argv[]; { int i, j, nn; FILE *ap, *ckopen(); char *s, line[100], *ckalloc(); nn = ((s_flag) ? 2 : n_node); ap = ckopen(alignment[0].name, "r"); while (fgets(line, 100, ap) != NULL) { for (s = line; *s == ' '; ++s) ; if (strncmp(s, "a {", 3) == 0) break; if (strncmp(s, "s {", 3) == 0) { printf("s {\n"); for (i = 0; i < nn; ++i) { j = seq_at_pos[i]; printf(" \"%s\" %d %d\n", sequence[j].name, sequence[j].lo_addr, sequence[j].hi_addr); } printf("}\n"); while (fgets(line, 100, ap) != NULL) if (line[0] == '}') break; } else if (strncmp(s, "d {", 3)) fputs(line, stdout); else { printf("d {\n \"%s", "blocks"); for (i = 1; i < argc; ++i) { if (i%3 == 0) printf("\n "); printf(" %s", argv[i]); } printf("\"\n}\n"); while (fgets(line, 100, ap) != NULL) if (line[0] == '}') break; } } fclose(ap); } /* ------------------- Handle Blocks of Consistent Columns -------------------*/ typedef struct cons_col *Col_ptr; typedef struct cons_col { int *start; int span; Col_ptr next; } Cons_col; Col_ptr first_col, finished_col; save_tree() /* record a consistent column */ { int i, sp, nn, *st; Col_ptr cp; char *ckalloc(); fd = 1; if (!s_flag) nn = n_node; else nn = 2; /* is it extending a diagonal run of columns? */ for (cp = first_col; cp; cp = cp->next) { st = cp->start; sp = cp->span+1; for (i = 0; i < nn; ++i) if (Tr[i]->p != st[i]+sp) break; if (i == nn) break; } if (cp) { /* found the diagonal run */ cp->span++; return; } cp = (Col_ptr) ckalloc(sizeof(Cons_col)); cp->start = (int *)ckalloc(nn*sizeof(int)); for (i = 0; i < nn; ++i) cp->start[i] = Tr[i]->p; cp->span = 0; cp->next = first_col; first_col = cp; } cull_blocks(i) /* set aside blocks not ending at loc i of sequence 0 */ { Col_ptr cp, prev; for (prev = NULL, cp = first_col; cp; ) if (cp->start[0] + cp->span != i) if (prev) { prev->next = cp->next; cp->next = finished_col; finished_col = cp; cp = prev->next; } else { first_col = first_col->next; cp->next = finished_col; finished_col = cp; cp = first_col; } else { prev = cp; cp = cp->next; } } print_blocks() { Col_ptr cp; int i, *st, sp, nnode; nnode = ((s_flag) ? 2 : n_node); for (cp = finished_col; cp; cp = cp->next) { st = cp->start; sp = cp->span; #ifndef NO_ALIGN printf("a {\n s 0.0\n b"); for (i = 0; i < nnode; ++i) printf(" %d", st[i]); printf("\n e"); for (i = 0; i < nnode; ++i) printf(" %d", st[i] + sp); putchar('\n'); #endif printf(" l"); for (i = 0; i < nnode; ++i) printf(" %d", st[i]); for (i = 0; i < nnode; ++i) printf(" %d", st[i] + sp); printf(" 0.0\n"); #ifndef NO_ALIGN printf("}\n"); #endif } }