#include "psublast/libc.h" #include "psublast/psublast.h" static char rcsid[] = "$Id: decomT.c,v 1.7 2000/03/23 19:49:33 schwartz Exp $"; typedef struct sg { int b1, b2, e1, e2; int score; int *script, script_len; } seg_node, *seg_ptr; /* Tree node */ typedef struct tl { int score, lscore; struct tl *lchild, *mchild, *rchild; seg_ptr left, right; } tree_node, *tree_ptr; #define DEFAULT_X 100 static uchar *seq1, *seq2; static int len1, len2, from1, from2, to1, to2; static int gop, gep, X, Y; static ss_t ss; static SEQ *sf1, *sf2; static argv_scores_t scores; static int outflag; void read_seqs(FILE *); void decompose(int *S, int n, int b1, int b2, int K); int read_align(FILE *file_ptr, int *script, int *start1, int *start2); int main(int argc, char **argv) { int i, x, y, K; char buf[500]; FILE *ap; int *bp; if (argc < 2) fatalf("%s alignment [O=?60] [E=?2] [M=?10] [I=?-10] [V=?-10] [S=?] [X=?100] [T=?0]", argv[0]); ckargs("OEMIVSTX", argc, argv, 2); DNA_scores(&scores, ss); gop = scores.O; gep = scores.E; get_argval_pos('X', &X, DEFAULT_X); get_argval_pos('T', &outflag, 0); ap = same_string(argv[1], "-") ? stdin : ckopen(argv[1], "r"); if (fgets(buf, 500, ap) == NULL || strcmp(buf, "#:lav\n")) fatal("Not an alignment file."); read_seqs(ap); bp = ckalloc(sizeof(int)*(len1+len2+1)); /* print_align_header(sf1, sf2, &scores); */ K = DNA_thresh(sf1, sf2, &scores); while ((i = read_align(ap, bp, &x, &y)) != 0) decompose(bp, i, x, y, K); free(bp); if (ap != stdin) fclose(ap); seq_close(sf1); seq_close(sf2); return 0; } static void print_all_X_full(tree_ptr tp, int lscore, int threshold, int ind) { int i; if (tp == NULL) return; if (tp->score > 0) { if (tp->score < threshold || -tp->lscore < X) return; if (tp->lscore > lscore) { /*for (i = 0; i < ind; i++) printf(" ");*/ /*printf(" l %d %d %d %d %d %d\n", tp->left->b1, tp->left->b2, tp->right->e1, tp->right->e2, (int) (100.0* log((double) (-tp->lscore-X))), -tp->lscore); */ printf(" l %d %d %d %d %d\n", tp->left->b1, tp->left->b2, tp->right->e1, tp->right->e2, ind); ind--; } } print_all_X_full(tp->lchild, tp->lscore, threshold, ind); print_all_X_full(tp->mchild, tp->lscore, threshold, ind); print_all_X_full(tp->rchild, tp->lscore, threshold, ind); } static int print_all_X_full_tree(tree_ptr tp, int lscore, int threshold, tree_ptr *stack, int top) { int i, j; if (tp == NULL) return top; if (tp->score > 0) { if (tp->score < threshold || -tp->lscore < X) return top; if (tp->lscore > lscore) { stack[top] = tp; top++; } } i = print_all_X_full_tree(tp->lchild, tp->lscore, threshold, stack, top); i = print_all_X_full_tree(tp->mchild, tp->lscore, threshold, stack, i); i = print_all_X_full_tree(tp->rchild, tp->lscore, threshold, stack, i); if (top > 0 && stack[top-1] == tp) { if (top < i) { printf("%d %d-%d --->", -tp->lscore, tp->left->b1, tp->right->e1); j = top; while (top < i) { printf(" %d-%d", stack[top]->left->b1, stack[top]->right->e1); top++; } printf("\n"); i = j; } } return i; } static void new_seg(seg_ptr sp, int top, int b1, int b2, int e1, int e2, int S, int *script, int script_len) { sp[top].b1 = b1; sp[top].b2 = b2; sp[top].e1 = e1; sp[top].e2 = e2; sp[top].score = S; sp[top].script = script; sp[top].script_len = script_len; } static void new_tree_node(tree_ptr tp, int i, int score, seg_ptr left, seg_ptr right, tree_ptr lc, tree_ptr mc, tree_ptr rc) { tp[i].score = tp[i].lscore = score; tp[i].left = left; tp[i].right = right; tp[i].lchild = lc; tp[i].rchild = rc; tp[i].mchild = mc; } void merge(tree_ptr tp, int i, tree_ptr *stack, int top_s) { int score = stack[top_s-2]->score + stack[top_s-1]->score + stack[top_s]->score; new_tree_node(tp, i, score, stack[top_s-2]->left, stack[top_s]->right, stack[top_s-2], stack[top_s-1], stack[top_s]); if (score > 0) tp[i].lscore = stack[top_s-1]->score; } static tree_ptr build_X_forest(tree_ptr tp, tree_ptr pp) { tree_ptr lc, mc, rc; if (tp == NULL) return NULL; lc=tp->lchild; mc = tp->mchild; rc = tp->rchild; if (tp == pp) { tp->lchild = NULL;} if (tp->score > 0) { if (tp->lscore > pp->lscore) { /*printf("%d %d %d->%d %d %d %d\n", -tp->lscore, tp->left->b1, tp->right->e1, -pp->lscore, pp->left->b1, pp->right->e1, pp->lchild);*/ tp->mchild = pp->lchild; pp->lchild = tp; pp = tp; pp->lchild = NULL; } } rc = build_X_forest(rc, pp); mc = build_X_forest(mc, pp); lc = build_X_forest(lc, pp); return tp; } static void print_X_tree(tree_ptr tp, int threshold, int level) { tree_ptr cp; if (tp==NULL || tp->score < threshold || -tp->lscore < X ) return; for (cp = tp->lchild; cp; cp = cp->mchild) if (cp->score >= threshold) break; if (cp) { if (outflag == 1) { printf("%d %d-%d --->", -tp->lscore, tp->left->b1, tp->right->e1); for (cp = tp->lchild; cp; cp = cp->mchild) if (cp->score >= threshold) printf(" %d-%d", cp->left->b1, cp->right->e1); printf("\n"); } else { for (cp = tp->lchild; cp; cp = cp->mchild) if (cp->score >= threshold) printf(" l %d %d %d %d %d\n", cp->left->b1, cp->left->b2, cp->right->e1,cp->right->e2, level); } for (cp = tp->lchild; cp; cp = cp->mchild) print_X_tree(cp, threshold, level-1); } } static void print_X_forest(tree_ptr tp, int threshold) { tp = tp->lchild; while (tp) { if (outflag == 0 && tp->score >= threshold && -tp->lscore >= X) printf(" l %d %d %d %d %d\n", tp->left->b1, tp->left->b2, tp->right->e1,tp->right->e2, 10); print_X_tree(tp, threshold, 9); tp = tp->mchild; } } void decompose(int *A, int n, int b1, int b2, int K) { int i, s, si, score, score1, score2, S, top, index, min_score = 0; seg_ptr sp; tree_ptr tp, *stack; uchar *s1, *s2; int *script, top_s; int x1, y1, x, y, script_len; b1 += from1; b2+=from2; s1 = seq1+b1-1; s2 = seq2 + b2-1; sp = ckalloc(sizeof(seg_node)*(n+2)); top = 0; x = y = 0; S = 0; script_len = 0; script = A; x1 = y1 = 0; for (i = 0; i < n; i++) { if ((s = A[i]) < 0) { si = -gop+gep*s; } else if (s > 0) { si = -gop-gep*s; } else si = ss[s1[x]][s2[y]]; min_score -= abs(si); if (S >=0 && si >= 0) { S+=si; script_len++;} else if (S < 0 && si < 0) {S+=si; script_len++;} else { top++; new_seg(sp, top, b1+x1, b2+y1, b1+x-1, b2+y-1, S, script, script_len); x1 = x; y1 = y; S = si; script = A+i; script_len = 1; } if (s < 0) x-=s; else if (s > 0) y+=s; else {x++; y++;} } top++; if (S > 0) { new_seg(sp, top, b1+x1, b2+y1, b1+x-1, b2+y-1, S, script, script_len); top++; new_seg(sp, top, 0, 0, 0, 0, min_score-1, NULL, 0); } else { new_seg(sp, top, 0, 0, 0, 0, min_score-1, NULL, 0); } new_seg(sp, 0, 0, 0, 0, 0, min_score-1, NULL, 0); top++; tp = ckalloc(sizeof(tree_node)*(top*3/2+2)); for (i = 0; i < top; i++) new_tree_node(tp, i, sp[i].score, sp+i, sp+i, NULL, NULL, NULL); index = top; stack = ckalloc(sizeof(tree_ptr)*(top+1)); top_s = 2; stack[0] = tp; stack[1] = tp+1; stack[2] = tp+2; i = 3; while (top_s > 1 || i < top) { if (top_s > 1) { score1 = stack[top_s]->score; score2 = stack[top_s-2]->score; if (stack[top_s-1]->score < -MAX(score1, score2)) { merge(tp, index, stack, top_s); top_s-= 2; stack[top_s] = &tp[index++]; continue; } else if (score2 >= score1) { merge(tp, index, stack, top_s-1); top_s -= 2; stack[top_s-1] = &tp[index++]; stack[top_s] = stack[top_s+2]; continue; } } stack[++top_s] = &tp[i++]; stack[++top_s] = &tp[i++]; } /* for (i = 1; i <= top; i++) if ((score = b[i].SR-b[i].SL) >= K) print_align(score, seq1, seq2, b[i].b1-from1, b[i].e1-from1, b[i].b2-from2, b[i].e2-from2, A + b[i].L); */ stack[0] = build_X_forest(stack[0], stack[0]); print_X_forest(stack[0], K); free(sp); free(stack); free(tp); } void read_seqs(FILE *alignment) { char buf[500], buf2[500], *p, *q; while (fgets(buf, 500, alignment)) { if (strncmp(buf, "s {", 3)) continue; if (fgets(buf, 500, alignment) == NULL) fatal("Did not find sequence names in alignment file."); if ((q=strchr(buf,'"')) == NULL || (p=strchr(q+1,'"')) == NULL) fatal("Cannot find first sequence name."); *p = '\0'; if (sscanf(p+1, "%d %d", &from1, &to1) != 2) fatal("Could not find positions in sequence 1"); snprintf(buf2, 500, "%s[%d,%d]", q+1, from1, to1); sf1 = seq_open(buf2, 0, 0); if (seq_read(sf1) < 0) fatalf("error reading %s", buf2); len1 = SEQ_LEN(sf1); seq1 = SEQ_CHARS(sf1); --from1; if (fgets(buf, 500, alignment) == NULL) fatal("Did not find sequence names in alignment file."); if ((q=strchr(buf,'"')) == NULL || (p=strchr(q+1,'"')) == NULL) fatal("Cannot find second sequence name."); *p = '\0'; if (sscanf(p+1, "%d %d", &from2, &to2) != 2) fatal("Could not find positions in sequence 2"); snprintf(buf2, 500,"%s[%d,%d]", q+1, from2, to2); sf2 = seq_open(buf2, 0, 0); if (seq_read(sf2) < 0) fatalf("error reading %s", buf2); len2 = SEQ_LEN(sf2); seq2 = SEQ_CHARS(sf2); --from2; return; } fatal("Failed to extract sequence names from alignment file."); } int read_align(FILE *file_ptr, int *script, int *start1, int *start2) { char line[100]; int i, j, b1, b2, e1, e2, pe1, pe2; i = 0; while (fgets(line, 100, file_ptr)) if (strncmp(line, "a {", 3) == 0) { fgets(line, 100, file_ptr); fgets(line,100, file_ptr); sscanf(line, " b %d %d", &b1, &b2); fgets(line,100, file_ptr); sscanf(line, " e %d %d", &e1, &e2); pe1 = b1; pe2 = b2; *start1 = b1; *start2 = b2; while (fgets(line, 100, file_ptr)) { if (line[0] == '}') return i; sscanf(line, "%*s %d %d %d %d", &b1, &b2, &e1, &e2); if (b1 != pe1) script[i++] = -(b1-pe1); if (b2 != pe2) script[i++] = b2-pe2; for (j = b1; j <= e1; j++) script[i++] = 0; pe1 = e1+1; pe2 = e2+1; } break; } return 0; }