/* * intervals.c - find multiple aligned blocks generated by a connected family * of pairwise alignments * * The command syntax is * intervals [-k1] align1 [-k2] align2 ... * where align1, align2, etc., are pairwise alignment files in the format used * by the LAV family of alignment visualization tools. An optional integer * k1, k2, etc., gives the multiplicity of the alignment that it precedes, but * can differ from 1 (the default) only for self-alignments. The aligned blocks * generated by the pairwise alignments are written to standard out. */ #include #include #define MAX_SEQ 100 #define MAX_ALIGN 100 #define max(x,y) ((x) > (y) ? (x) : (y)) #define min(x,y) ((x) < (y) ? (x) : (y)) /* -------------------------- Input Data ----------------------------------- */ struct { char *name; int self_align, lo_addr, hi_addr; } sequence[MAX_SEQ]; struct { int seq1, seq2, mtp; char *name; } alignment[MAX_ALIGN]; int n_seq = 0, n_align = 0; /* -------------------------- Constraint Graph ---------------------------- */ typedef struct edge *Edge_ptr; typedef struct edge { int con_pos, con_align; Edge_ptr next; } Edge; int n_node = 0, /* number of nodes in graph */ *seq_at_pos, /* index of sequence at a given position */ *pos_of_seq; /* position of sequence i in the list */ Edge_ptr *edges; /* edges[i] pts to list of edges constrainiing node i */ /* -------------------------- Recorded 2-Blocks ---------------------------- */ typedef struct block2_loc *Block2_ptr; typedef struct block2_loc { int i, j, k, diag, i_plus_k; Block2_ptr next; } Block2; Block2_ptr *Blocks2; /* -------------------------------- Main Program ---------------------------- */ main(argc, argv) int argc; char *argv[]; { int i, k; if (argc < 2) fatalf("%s [-k1] alignment1 [-k2] alignment2 ...", argv[0]); /* read input */ for (i = 1; i < argc; ++i) { k = 1; if (argv[i][0] == '-' && (k = atoi(argv[i++]+1)) <= 0) fatal("Multiplicity must be positive."); read_align(k, argv[i]); } #ifdef DEBUG_INPUT for (i = 0; i < n_seq; ++i) printf("sequence %d: name = \"%s\", length = %d, self_align = %d\n", i, sequence[i].name, sequence[i].hi_addr, sequence[i].self_align); for (i = 0; i < n_align; ++i) printf("alignment %d: sequence %d <--> sequence %d; k = %d\n", i, alignment[i].seq1, alignment[i].seq2, alignment[i].mtp); #endif /* build the constraint graph */ build_cons(); #ifdef DEBUG_GRAPH { Edge_ptr ep; for (i = 0; i < n_node; ++i) { printf("seq %d at pos %d;", seq_at_pos[i], i); printf(" constraining (pos,alignment) ="); for (ep = edges[i]; ep; ep = ep->next) printf(" (%d,%d)", ep->con_pos, ep->con_align); putchar('\n'); } exit(0); } #endif /* build the databases of 2-blocks */ build_2db(); /* find and store the aligned blocks */ build_blocks(); /* print a lav-style prolog for the aligned blocks */ #ifndef NO_ALIGN print_top(argc, argv); #endif print_blocks(); exit(0); } /* ------------------------------ Input Routines ---------------------------- */ /* * Information listed above under "Input Data" is determined and n_node is set * to the number of nodes in the Constraint Graph (every distinct sequence has * a node and a self-alignment of multiplicity k adds k "secondary" nodes). */ read_align(k, file) char *file; { FILE *fp, *ckopen(); char *fgets(), line[100]; int i, j; if (n_align >= MAX_ALIGN) fatalf("Number of alignments cannot exceed %d\n", MAX_ALIGN); alignment[n_align].mtp = k; alignment[n_align].name = file; /* get the sequence numbers and perform some error checks */ fp = ckopen(file, "r"); while (fgets(line, 100, fp) != NULL && strncmp(line, "s {", 3)) ; i = alignment[n_align].seq1 = get_seq_info(fp, 1); if ((j = alignment[n_align].seq2 = get_seq_info(fp, 2)) < 0) fatalf("`%s' seems to be a multiple alignment.", file); if (i == j) if (sequence[i].self_align > -1) fatal("Cannot handle multiple self_alignments."); else { sequence[i].self_align = n_align; n_node += k; } else if (k > 1) fatal("Multiplicity > 1 only for self-alignments."); ++n_align; fclose(fp); } int get_seq_info(fp, which) FILE *fp; int which; { char *fgets(), line[100], *s, *t, *strsave(); int seq_nbr; fgets(line, 100, fp); for (s = line; *s == ' '; ++s) ; if (*s != '"') fatal("Expecting \"\n"); for (t = ++s; *t && *t != '"'; ++t) ; if (*t != '"') fatal("Could not find end of sequence name."); *t = '\0'; for (seq_nbr = 0; seq_nbr < n_seq; ++seq_nbr) if (strsame(sequence[seq_nbr].name, s)) break; if (seq_nbr == n_seq) { /* first appearance of this sequence name */ if (n_seq >= MAX_SEQ) fatalf("Number of sequences cannot exceed %d\n", MAX_SEQ); sequence[n_seq].name = strsave(s); sequence[n_seq].self_align = -1; ++n_seq; ++n_node; } for (s = t+1; *s == ' '; ++s) ; if (!isdigit(*s)) fatal("Could not find initial subscript."); sequence[seq_nbr].lo_addr = atoi(s); while (isdigit(*s)) ++s; if (*s != ' ') fatal("Could not find blank at end of initial subscript."); while (*++s == ' ') ; if (!isdigit(*s)) fatal("Could not find final subscript."); sequence[seq_nbr].hi_addr = atoi(s); if (which == 2) { /* check if this is a pairwise alignment */ fgets(line, 100, fp); for (s = line; *s == ' ' || *s == '\t'; ++s) ; if (*s != '}') return -1; } return seq_nbr; } /* ------------------------- Build the Constraint Graph --------------------- */ /* * The sequences (nodes) are ordered, beginning with the first sequence of the * first alignment. Each node after the first node has at least one edge * (alignment) from an earlier sequence (thus some constraints are placed on * the values to be tried at that node). For each node n we save the list of * pairs (pos, alignment) of these constraining conditions, where pos < n is * the position in the list of the constraining neighbor. If the sequence at * position p has a self-alignment of multiplicity k, then "secondary" nodes * for the sequence are placed at positions p + j for 1 <= j <= k. Inedges * and outedges between a secondary node and nodes for other sequences are the * same as for the sequence's "primary" node; in addition, a node representing * a sequence has an inedge from every earlier node for that sequence. */ typedef struct vertex *Align_ptr; typedef struct vertex { int algn, nbr; Align_ptr link; } Align; build_cons() { Align_ptr ap, *aligns_of_seq; Edge_ptr ep; int i, j, k, n, neighbor, sa, nodes, p, n_stack, pn, s1, s2, stack[MAX_SEQ], unreached[MAX_SEQ]; char *ckalloc(); /* for each sequence, build the list of alignments involving it */ aligns_of_seq = (Align_ptr *) ckalloc(n_seq*sizeof(Align_ptr)); for (i = 0; i < n_seq; ++i) aligns_of_seq[i] = NULL; for (i = 0; i < n_align; ++i) /* ignore self-alignments */ if ((s1 = alignment[i].seq1) != (s2 = alignment[i].seq2)) { ap = (Align_ptr) ckalloc(sizeof(Align)); ap->algn = i; ap->nbr = s2; ap->link = aligns_of_seq[s1]; aligns_of_seq[s1] = ap; ap = (Align_ptr) ckalloc(sizeof(Align)); ap->algn = i; ap->nbr = s1; ap->link = aligns_of_seq[s2]; aligns_of_seq[s2] = ap; } /* order nodes of constraint graph; start with sequence 0 at node 0 */ pos_of_seq = (int *) ckalloc(n_seq*sizeof(int)); seq_at_pos = (int *) ckalloc(n_node*sizeof(int)); edges = (Edge_ptr *) ckalloc(n_node*sizeof(Edge_ptr)); for (i = 0; i < n_seq; ++i) { unreached[i] = 1; pos_of_seq[i] = -1; } nodes = 0; /* number of nodes currently in graph */ stack[0] = 0; /* stack node 0 */ unreached[0] = 0; n_stack = 1; /* number of nodes currently in stack */ while (n_stack) { n = stack[--n_stack]; /* POP */ pos_of_seq[n] = pn = nodes++; seq_at_pos[pn] = n; edges[pn] = NULL; /* loop over neighbors of n */ for (ap = aligns_of_seq[n]; ap; ap = ap->link) /* if the neighbor is an earlier node */ if ((p = pos_of_seq[neighbor = ap->nbr]) > -1) { /* constraining edge from neighbor to n */ add_con(p, pn, ap->algn); /* + edges from neighbor's secondary nodes */ if ((sa = sequence[neighbor].self_align) >= 0) for (k = 1; k <= alignment[sa].mtp; ++k) add_con(p+k, pn, ap->algn); } else if (unreached[neighbor]) { /* add neighbor to the stack of marked nodes */ unreached[neighbor] = 0; stack[n_stack++] = neighbor; } /* if n is self aligned */ if ((sa = sequence[n].self_align) >= 0) /* next nodes are n's secondary nodes */ for (k = 1; k <= alignment[sa].mtp; ++k, ++nodes) { edges[nodes] = NULL; seq_at_pos[nodes] = n; /* constraining edges from earlier copies */ for (j = 0; j < k; ++j) add_con(pn+j, nodes, sa); /* and duplicates of constraints on primary */ for (ep = edges[pn]; ep; ep = ep->next) add_con(ep->con_pos, nodes, ep->con_align); } } if (nodes != n_node) fatal("Graph is disconnected."); } add_con(from, to, align) int from, to, align; { Edge_ptr ep; char *ckalloc(); ep = (Edge_ptr) ckalloc(sizeof(Edge)); ep->con_pos = from; ep->con_align = align; ep->next = edges[to]; edges[to] = ep; } /* ------------------------ Build the Lists of 2-blocks --------------------- */ /* * The node ordering constructed for the Constraint Graph determines, for * each edge (alignment), which sequence comes first (i.e., whose values * will constrain values that we try for the other sequence). For each * interval alignment we make a linked list of 2-blocks, sorted by offset to * simplify the process of intersecting a collection of these lists. */ build_2db() { int i; char *ckalloc(); Init_DB(); for (i = 0; i < n_align; ++i) build_2block(i); } build_2block(algn) /* build 2-block list for alignment algn */ int algn; { int reverse, len, s1, s2, x1, x2, y1, y2; char *ckalloc(); FILE *ap, *ckopen(); Block2_ptr start, b; s1 = alignment[algn].seq1; s2 = alignment[algn].seq2; reverse = (pos_of_seq[s1] > pos_of_seq[s2]); len = sequence[s1].hi_addr; ap = ckopen(alignment[algn].name, "r"); start = NULL; while (get_diag(ap, &x1, &y1, &x2, &y2)) { if (s1 == s2 && x1 == 1 && y1 == x1 && x2 == len && y2 == x2) continue; /* ignore main diagonal in self-comp */ b = (Block2_ptr) ckalloc(sizeof(Block2)); if (reverse) { b->i = y1; b->j = x1; b->k = y2 - y1; } else { b->i = x1; b->j = y1; b->k = x2 - x1; } b->diag = b->i - b->j; b->next = start; start = b; } fclose(ap); In_DB(algn, start); } /* read a line from a pairwise alignment file */ int get_diag(ap, x1p, y1p, x2p, y2p) FILE *ap; int *x1p, *y1p, *x2p, *y2p; { char *s, line[100]; while (fgets(line, 100, ap) != NULL) { for (s = line; *s == ' '; ++s) ; if (*s != 'l') continue; while (*s && *s != ' ') ++s; sscanf(s, "%d %d %d %d", x1p, y1p, x2p, y2p); return 1; } return 0; } /* ---------- Find Aligned Blocks Generated by Pairwise Alignments ---------- */ int *offset; Block2_ptr **BP; /* BP[i] pts to array used at i-th level of recursion */ int **OP; /* ditto */ build_blocks() { int i, n_cons; char *ckalloc(); Edge_ptr ep; Block2_ptr B; offset = (int *)ckalloc(n_node*sizeof(int)); BP = (Block2_ptr **) ckalloc(n_node*sizeof(Block2_ptr*)); OP = (int **) ckalloc(n_node*sizeof(int*)); for (i = 1; i < n_node; ++i) { /* count number of constraints (in-edges) for node i */ for (n_cons = 0, ep = edges[i]; ep; ++n_cons, ep = ep->next) ; BP[i] = (Block2_ptr *) ckalloc(n_cons*sizeof(Block2_ptr)); OP[i] = (int *) ckalloc(n_cons*sizeof(int)); } offset[0] = 0; for (B = Blocks2[edges[1]->con_align]; B; B = B->next) { offset[1] = B->diag; enumerate(2, B->i, B->i + B->k); } } /* try all feasible values at node pos of the Constraint Graph */ enumerate(pos, L, R) int pos, L, R; { int i, n_cons, diff_offset, bd, *offs, o; Block2_ptr *Bp, Out_DB(), List; Edge_ptr ep; if (pos >= n_node) { save_block(L,R); return; } Bp = BP[pos]; offs = OP[pos]; /* Bp[i] pts to next 2-block in edge from i-th constraining position */ for (n_cons = 0, ep = edges[pos]; ep; ++n_cons, ep = ep->next) { o = offs[n_cons] = offset[ep->con_pos]; if ((List = Bp[n_cons] = Out_DB(ep->con_align, L-o, R-o)) == NULL) return; } for ( ; ; ) { /* loop over feasible values for this position */ for (diff_offset = 1; diff_offset; ) { /* until same offset */ diff_offset = 0; for (i = 0; i < n_cons-1; ++i) if (offs[i]+Bp[i]->diag != offs[i+1]+Bp[i+1]->diag) { diff_offset = 1; if (offs[i]+Bp[i]->diag < offs[i+1]+Bp[i+1]->diag) { if ((Bp[i] = Bp[i]->next) == NULL) {free_list(List); return;} } else if ((Bp[i+1] = Bp[i+1]->next) == NULL) {free_list(List); return;} } } offset[pos] = offs[0] + Bp[0]->diag; exhaust(Bp, offs, n_cons, 0, pos, L, R); /* move to next diagonal in every list */ for (i = 0; i < n_cons; ++i) for (bd = Bp[i]->diag; Bp[i]->diag == bd; ) if ((Bp[i] = Bp[i]->next) == NULL) {free_list(List); return;} } } free_list(List) Block2_ptr List; { Block2_ptr b; while (List) { b = List->next; free(List); List = b; } } exhaust(Bp, offs, n_cons, level, pos, L, R) Block2_ptr *Bp; int *offs, n_cons, level, pos, L, R; { Block2_ptr b2; int newL, newR, bd; if (level == n_cons) enumerate(pos+1, L, R); else { bd = Bp[level]->diag; for (b2 = Bp[level]; b2 && b2->diag == bd; b2 = b2->next) { newL = max(L, offs[level] + b2->i); newR = min(R, offs[level] + b2->i + b2->k); if (newL <= newR) exhaust(Bp, offs, n_cons, level+1, pos, newL, newR); } } } /* ------------------------- Interval Databases ----------------------------- */ /* * For each alignment (except the 0-th) we arrange the 2-blocks so we can * answer queries of the form: give me all 2-blocks where [i, i+k] intersects * the query interval [L, R]. The current implementation sorts blocks by i+k * and retains the largest k (interval length). Given L and R, we find the * first interval such that L <= i+k. The sorted list is scanned from this * point on, stopping when we reach an interval where i+k > R + max(k). */ Block2_ptr **Sort2; /* array of pointers to 2-blocks sorted by i+k */ int *many; /* number of 2-blocks in this alignment */ int *longest; /* maximum k (interval length) */ Init_DB() { Blocks2 = (Block2_ptr *) ckalloc(n_align*sizeof(Block2_ptr)); Sort2 = (Block2_ptr **) ckalloc(n_align*sizeof(Block2_ptr *)); many = (int *) ckalloc(n_align*sizeof(int)); longest = (int *) ckalloc(n_align*sizeof(int)); } /* put linked list of 2-blocks into database for algn-th alignment */ In_DB(algn, bp) Block2_ptr bp; { int n, max_k; Block2_ptr b, *s; max_k = 0; for (n = 0, b = bp; b; ++n, b = b->next) max_k = max(max_k, b->k); many[algn] = n; longest[algn] = max_k; s = Sort2[algn] = (Block2_ptr *) ckalloc(n*sizeof(Block2_ptr)); for (n = 0, b = bp; b; ++n, b = b->next) { s[n] = b; b->i_plus_k = b->i + b->k; } sort(s, n); Blocks2[algn] = bp; } /* heapsort, by "i_plus_k" field, n (pointers to) 2-blocks */ sort(s, n) Block2_ptr *s; int n; { Block2_ptr t; int i, heapsize; --s; /* so subscripts start with 1 */ for (i = n/2; i > 0; --i) FixHeap(s, i, s[i], n); for (heapsize = n; heapsize >= 2; --heapsize) { t = s[1]; FixHeap(s, 1, s[heapsize], heapsize-1); s[heapsize] = t; } } FixHeap(s, root, k, bound) Block2_ptr *s, k; int root, bound; { int vacant, largerChild; vacant = root; while (2*vacant <= bound) { largerChild = 2*vacant; if (2*vacant < bound && s[largerChild+1]->i_plus_k > s[largerChild]->i_plus_k) ++largerChild; if (k->i_plus_k < s[largerChild]->i_plus_k) { s[vacant] = s[largerChild]; vacant = largerChild; } else break; } s[vacant] = k; } /* return pointer to list of 2-blocks from algn-th alignment that overlap [L,R], sorted by diagonal */ Block2_ptr Out_DB(algn, L, R) int algn, L, R; { char *ckalloc(); Block2_ptr b, p, start, new, *s = Sort2[algn]; int d, hi, lo, mid; start = NULL; lo = 0; hi = many[algn]-1; if (s[hi]->i_plus_k < L) return start; while (lo < hi) { /* do binary search on L */ mid = (lo+hi)/2; if (L <= s[mid]->i_plus_k) hi = mid; else lo = mid+1; } /* Now s[hi] is the first (by i+k) 2-block where L <= i+k */ for ( ; hi < many[algn] && s[hi]->i_plus_k <= R+longest[algn]; ++hi) if (s[hi]->i <= R) { /* overlapping interval; insert in sorted order */ b = s[hi]; new = (Block2_ptr) ckalloc(sizeof(Block2)); new->i = b->i; new->j = b->j; new->k = b->k; d = new->diag = b->diag; if (start == NULL || start->diag >= d) { new->next = start; start = new; } else { for (p = start; p->next && p->next->diag < d; ) p = p->next; new->next = p->next; p->next = new; } } return start; } /* -------------------- Print Top of Generated Alignment File --------------- */ print_top (argc, argv) int argc; char *argv[]; { int i, j; FILE *ap, *ckopen(); char *s, line[100], *ckalloc(); ap = ckopen(alignment[0].name, "r"); while (fgets(line, 100, ap) != NULL) { for (s = line; *s == ' '; ++s) ; if (strncmp(s, "a {", 3) == 0) break; if (strncmp(s, "s {", 3) == 0) { printf("s {\n"); for (i = 0; i < n_node; ++i) { j = seq_at_pos[i]; printf(" \"%s\" %d %d\n", sequence[j].name, sequence[j].lo_addr, sequence[j].hi_addr); } printf("}\n"); while (fgets(line, 100, ap) != NULL) if (line[0] == '}') break; } else if (strncmp(s, "d {", 3)) fputs(line, stdout); else { printf("d {\n \"%s", "intervals"); for (i = 1; i < argc; ++i) { if (i%3 == 0) printf("\n "); printf(" %s", argv[i]); } printf("\"\n}\n"); while (fgets(line, 100, ap) != NULL) if (line[0] == '}') break; } } fclose(ap); } /* --------------------------- Handle Blocks -------------------------------- */ typedef struct save_block *Save_ptr; typedef struct save_block { int *start; int span; Save_ptr next; } Saved_block; Save_ptr first_block; save_block(L,R) /* record a block */ { int i; Save_ptr sp; char *ckalloc(); sp = (Save_ptr) ckalloc(sizeof(Saved_block)); sp->start = (int *)ckalloc(n_node*sizeof(int)); for (i = 0; i < n_node; ++i) sp->start[i] = L - offset[i]; sp->span = R-L; sp->next = first_block; first_block = sp; } print_blocks() { Save_ptr cp; int i, *st, sp; for (cp = first_block; cp; cp = cp->next) { st = cp->start; sp = cp->span; #ifndef NO_ALIGN printf("a {\n s 0.0\n b"); for (i = 0; i < n_node; ++i) printf(" %d", st[i]); printf("\n e"); for (i = 0; i < n_node; ++i) printf(" %d", st[i] + sp); putchar('\n'); #endif printf(" l"); for (i = 0; i < n_node; ++i) printf(" %d", st[i]); for (i = 0; i < n_node; ++i) printf(" %d", st[i] + sp); printf(" 0.0\n"); #ifndef NO_ALIGN printf("}\n"); #endif } } /* -------------------------- Utility routines ------------------------------ */ /* fatal - print message and die */ fatal(msg) char *msg; { fprintf(stderr, "%s\n", msg); exit(1); } /* fatalf - format message, print it, and die */ fatalf(msg, val) char *msg, *val; { fprintf(stderr, msg, val); putc('\n', stderr); exit(1); } /* ckalloc - allocate space; check for success */ char *ckalloc(amount) int amount; { char *malloc(), *p; if ((p = malloc( (unsigned) amount)) == NULL) fatal("Ran out of memory."); return(p); } /* ckopen - open file; check for success */ FILE *ckopen(name, mode) char *name, *mode; { FILE *fopen(), *fp; if ((fp = fopen(name, mode)) == NULL) fatalf("Cannot open %s.", name); return(fp); } /* strsame - tell whether two strings are identical */ int strsame(s, t) char *s, *t; { return(strcmp(s, t) == 0); } /* strsave - save string s somewhere; return address */ char *strsave(s) char *s; { char *ckalloc(), *p, *strcpy(); p = ckalloc(strlen(s)+1); /* +1 to hold '\0' */ return(strcpy(p, s)); }