#!/bin/nawk -f #!/usr/local/bin/gawk -f # $Id: msa2lav,v 1.11 1991/11/02 21:41:44 schwartz Exp $ func usage() { print "usage: msa2lav seqdesc alignfile"; error=1; exit(1); } func emit_header() { print "#:lav"; print "gen { \"msa2lav\" }"; print "desc { \"msa alignment\" }"; } func emit_seq(nseq, n) { print "s {"; for (n=1; n<=nseq; ++n) if (seqinfo[n] == "") print " \"unknown" n "\" " 1 " " length(catseq[n]); else print seqinfo[n]; print "}"; } func extract_seqinfo(fname, f, n) { if (fname != "") { n = 0; while (getline 0) { split ($0, f); if (f[2] == "residues") { sub(">", "", f[1]); sub("-", " ", f[3]); # XXX - assumes positive numbers sub("-", " ", f[5]); seqinfo[++n] = " \"" f[1] "\" " f[3]; } } close(fname); } } func is_edge(i, n,l,r) { for (n=1; n<=nseq; ++n) { l = substr(catseq[n],i,1); r = substr(catseq[n],i+1,1); if (((l=="-") != (r=="-")) || (l=="") || (r=="")) { return 1; } } return 0; } func write_block (from, len, n) { for (n=1; n<=nseq; ++n) { print (seqidx[n]+0)"\t"substr(catseq[n], from, len)"\t"(seqidx[n]+len-1); seqidx[n] += (substr(catseq[n], from, 1) == "-") ? 0 : len; } print ""; } func write_align (from, len, out, newidx) { for (n=1; n<=nseq; ++n) { if (seqidx[n]==0) seqidx[n] = 1; } for (n=1; n<=nseq; ++n) { out[n] = substr(catseq[n], from, len); gap = (match(out[n],"-+") ? 0 : len); if (DEBUG==1) print seqidx[n] "\t" out[n] "\t" seqidx[n]+len-1 "\t" seqidx[n]+gap-1; newidx[n] = seqidx[n] + gap; } print "a {"; print " s 0.0"; printf " b "; for (n=1; n<=nseq; ++n) printf seqidx[n] " "; print ""; printf " e "; for (n=1; n<=nseq; ++n) printf newidx[n]-1 " "; print ""; printf " l "; for (n=1; n<=nseq; ++n) printf seqidx[n] " "; for (n=1; n<=nseq; ++n) printf newidx[n]-1 " "; print "0.0"; print "}"; for (n=1; n<=nseq; ++n) seqidx[n] = newidx[n]; } BEGIN { if (ARGC==1) { usage(); } if (ARGC>2) { seq_file = ARGV[1]; delete ARGV[1]; } emit_header(); # globals error=0; # usage inoma=0; # looking at optimal mult align section nseq=0; # number of sequences accumulated n=0; # scratch edge_start=1; # at block boundary catseq[0]=""; # catenated lines of sequence fragments seqidx[0]=0; # running index into each sequence seqinfo[0]=""; # info from sequence file seq[0]=""; # partial lines } (inoma==0) && ($0 ~ /[ \t]*\*\*\*[ \t]*Optimal Multiple Alignment/) {inoma=1;} (inoma==1) && ($0 ~ /^[-A-Z]+$/) { if (DEBUG==1) print "@ " $0; ++n; seq[n] = $0; } (inoma==1) && ($0 ~ /^$/) { if (DEBUG==1) print "catenate"; if (nseq==0) { nseq = n; } for(i=1; i<=n; ++i) { catseq[i] = catseq[i] seq[i]; } n=0; } END { if (error==0) { for(i=1; i<=n; ++i) { catseq[i] = catseq[i] seq[i]; } if (DEBUG==1) print "cat: " if (DEBUG==1) for(i=1; i<=nseq; ++i) { print catseq[i]; } extract_seqinfo(seq_file); emit_seq(nseq); if (DEBUG==1) print "blocks: " for(i=1; i<=length(catseq[1]); ++i) { if (is_edge(i)) { write_align(edge_start, i-edge_start+1); edge_start=i+1; } } } }