
/* RCS revision identification string of this source file                    */
static char const rcsid [] = "$Id: filt.c,v 1.1 1998/04/21 10:43:44 stojanov Exp $";


/*****************************************************************************/
/*                                                                           */
/* Program: filt (data filter within a plain format landmarks file)          */
/*                                                                           */
/* Author: Nikola Stojanovic                                                 */
/*                                                                           */
/* Revision:    16 APR 97   Version 1.0                                      */
/*                                                                           */
/*                                                                           */
/*   Given a file in "plain" format and a list of site names and/or binding  */
/* factors, the program searches the file for occurrences of these and       */
/* outputs a new file containing only the specified lines                    */
/*                                                                           */
/*****************************************************************************/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ntl.h"

/*****************************************************************************/
/*                                                                           */
/* Definitions section                                                       */
/*                                                                           */
/*****************************************************************************/


/*****************************************************************************/
/* Definitions of the constants of the program unit                          */
/*****************************************************************************/


#define BY_FACTOR                 1
#define IN_SEQUENCE               2

#define DEFAULT_ORDER             IN_SEQUENCE


/*****************************************************************************/
/* Prototypes of locally used functions of the program unit                  */
/*****************************************************************************/


/* Prints the header, the matching records and the match count to stdout     */
void filt_process_data (char *input_file, strlist_ptr sites,
                        strlist_ptr bindings, plain_ptr records, int order);
/* Prints a single record as one line of text to stdout                      */
void filt_print_record (plain_ptr item);

/*****************************************************************************/
/*                                                                           */
/* Code section                                                              */
/*                                                                           */
/*****************************************************************************/


/*****************************************************************************/
/*                                                                           */
/* Procedure: filt_print_usage                                               */
/*                                                                           */
/* Writes the command line synopsis to stderr; "program" is the name shown   */
/*   in it, and "filt" is substituted when a NULL pointer is passed          */

static void filt_print_usage (const char *program)
{
 if (program == NULL) program = "filt";

 fprintf (stderr, "usage: %s <input_file>\n", program);
 fprintf (stderr, "            [-s <site_name>]+\n");
 fprintf (stderr, "            [-b <binding_factor>]+\n");
 fprintf (stderr, "            [-f | -o]\n");
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: filt_append_unique                                             */
/*                                                                           */
/* Appends "name" at the tail of the string list "*list" (which may be       */
/*   empty). If the list already holds an equal string, reports the          */
/*   repetition on stderr, naming the list with "kind", and terminates the   */
/*   program with status 1. The list stores "name" itself, not a copy        */

static void filt_append_unique (strlist_ptr *list, char *name,
                                const char *kind)
{
 strlist_ptr *link; strlist_ptr node;

 /* Walk the chain of "next" links; "link" ends on the NULL link that the    */
 /* new node has to replace (the list head itself when the list is empty)    */

 link = list; while (*link != NULL) {
  if (!strcmp ((*link) -> string, name)) {
   fprintf (stderr, "Repeated %s request ('%s').\n", kind, name);
   exit (1);
  }
  link = &((*link) -> next);
 }

 node = (strlist_ptr) NTL0_ckalloc (sizeof (StrList_Struct));
 node -> string = name;
 node -> next = NULL;
 *link = node;
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: main                                                           */
/*                                                                           */
/* "main" procedure of the program. Parses the command line (one input file, */
/*   any number of "-s <site_name>" and "-b <binding_factor>" requests and   */
/*   at most one of "-f" / "-o"), checks that a file and at least one name   */
/*   were given, loads the file with NTL2_Load_Plain and hands the records   */
/*   to filt_process_data. Returns 0 on success; every error is reported on  */
/*   stderr and ends the program with status 1                               */

int main (int argc, char **argv)
{
 char *input_file, *option; int arg_count; plain_ptr records; errind report;
 strlist_ptr site_names, binding_factors;
 bool order_set; int order;

 input_file = NULL; site_names = NULL; binding_factors = NULL;
 order_set = FALSE; order = DEFAULT_ORDER;

 /* No parameters at all - display instructions and terminate. argv [0] is   */
 /* only read when argc says it exists                                       */

 if (argc < 2) {
  filt_print_usage ((argc > 0) ? argv [0] : NULL);
  exit (1);
 }

 /* Extract the command line parameters and create the settings; "option"    */
 /* is the current word and arg_count already points at the word after it    */

 arg_count = 1; while (arg_count < argc) {
  option = argv [arg_count]; arg_count++;

  if (option [0] != '-') {                  /* Not an "-" option - file name */
   if (input_file != NULL) {
    fprintf (stderr, "Input file already set.\n"); exit (1);
   }
   input_file = NTL0_strsave (option);
  }
  else if (!strcmp (option, "-s")) {                   /* Site name provided */
   if (arg_count == argc) {
    fprintf (stderr, "Missing site name.\n"); exit (1);
   }
   filt_append_unique (&site_names, NTL0_strsave (argv [arg_count]),
                       "site name");
   arg_count++;
  }
  else if (!strcmp (option, "-b")) {              /* Binding factor provided */
   if (arg_count == argc) {
    fprintf (stderr, "Missing binding factor.\n"); exit (1);
   }
   filt_append_unique (&binding_factors, NTL0_strsave (argv [arg_count]),
                       "binding factor");
   arg_count++;
  }
  else if (!strcmp (option, "-f") || !strcmp (option, "-o")) {
   if (order_set) {                     /* Only one order option may be used */
    fprintf (stderr, "Output order already set.\n"); exit (1);
   }
   order = (option [1] == 'f') ? BY_FACTOR : IN_SEQUENCE;
   order_set = TRUE;
  }
  else {                                      /* Unknown command-line option */
   fprintf (stderr, "Illegal option (%s).\n", option);
   filt_print_usage (argv [0]);
   exit (1);
  }
 }

 /* Now check whether all necessary parameters have been provided            */

 if (input_file == NULL) {
  fprintf (stderr, "Must have a file to process.\n"); exit (1);
 }
 if ((site_names == NULL) && (binding_factors == NULL)) {
  fprintf (stderr, "No sites to search for.\n"); exit (1);
 }

 /* Load the data from the specified plain format file; a non-NULL report    */
 /* carries the reason of the failure in its "message" member                */

 if ((report = NTL2_Load_Plain (input_file, &records)) != NULL) {
  fprintf (stderr, "Can't open sequences file '%s' (%s).\n",
                   input_file, report -> message);
  exit (1);
 }

 /* File loading OK, process the contents                                    */

 filt_process_data (input_file, site_names, binding_factors, records, order);
 return 0;
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: filt_process_data                                              */
/*                                                                           */
/* Writes the filtered listing to stdout: a "#:plain:" header naming the     */
/*   input file and the requested site names / binding factors, then every   */
/*   record whose site equals a string in "sites" or whose binding equals a  */
/*   string in "bindings", and finally the number of records printed.        */
/*   With order BY_FACTOR the output is grouped by requested name (sites     */
/*   first, then bindings) and a record is printed and counted once for      */
/*   every requested name it matches; with IN_SEQUENCE records keep their    */
/*   list order and each is printed at most once. Any other order value      */
/*   prints no records                                                       */

void filt_process_data (char *input_file, strlist_ptr sites,
                        strlist_ptr bindings, plain_ptr records, int order)
{
 strlist_ptr name; plain_ptr rec; int printed; bool hit;

 /* Header: format tag, source file and the names that were asked for        */

 printf ("#:plain:\n\n");
 printf ("# Data filtered from file:  %s\n", input_file);
 if (sites != NULL) {
  printf ("# Site names:  ");
  name = sites; while (name != NULL) {
   /* Names are comma separated; the last one closes the line                */
   printf ("%s%s", name -> string, (name -> next != NULL) ? ", " : "\n");
   name = name -> next;
  }
 }
 if (bindings != NULL) {
  printf ("# Binding factors:  ");
  name = bindings; while (name != NULL) {
   printf ("%s%s", name -> string, (name -> next != NULL) ? ", " : "\n");
   name = name -> next;
  }
 }
 printf ("\n\n");

 printed = 0;
 if (order == BY_FACTOR) {

  /* One pass over all records for each requested name, sites first          */

  for (name = sites; name != NULL; name = name -> next) {
   for (rec = records; rec != NULL; rec = rec -> next) {
    if ((rec -> site != NULL) && !strcmp (name -> string, rec -> site)) {
     filt_print_record (rec); printed++;
    }
   }
  }
  for (name = bindings; name != NULL; name = name -> next) {
   for (rec = records; rec != NULL; rec = rec -> next) {
    if ((rec -> binding != NULL) && !strcmp (name -> string, rec -> binding)) {
     filt_print_record (rec); printed++;
    }
   }
  }
 }
 else if (order == IN_SEQUENCE) {

  /* One pass over the records; a record is printed as soon as any requested */
  /* name matches it, and the binding is consulted only if no site matched   */

  for (rec = records; rec != NULL; rec = rec -> next) {
   hit = FALSE;
   if (rec -> site != NULL) {
    for (name = sites; name != NULL; name = name -> next) {
     if (!strcmp (name -> string, rec -> site)) { hit = TRUE; break; }
    }
   }
   if ((!hit) && (rec -> binding != NULL)) {
    for (name = bindings; name != NULL; name = name -> next) {
     if (!strcmp (name -> string, rec -> binding)) { hit = TRUE; break; }
    }
   }
   if (hit) { filt_print_record (rec); printed++; }
  }
 }
 printf ("\n# Matched in %d lines.\n", printed);
}


/*****************************************************************************/
/*                                                                           */
/* Procedure: filt_print_record                                              */
/*                                                                           */
/* Writes one record to stdout as a single line of space separated fields:   */
/*   start, stop, text, matched, site, binding, strand, database, file,      */
/*   reference and c_range. Site and binding are enclosed in double quotes;  */
/*   a string field holding NULL is written as the bare word "null"          */

void filt_print_record (plain_ptr item)
{
 printf ("%ld %ld ", item -> start, item -> stop);
 printf ("%s ", (item -> text != NULL) ? item -> text : "null");
 printf ("%s ", (item -> matched != NULL) ? item -> matched : "null");

 /* Quotes surround the value only, never the "null" placeholder             */

 if (item -> site != NULL) printf ("\"%s\" ", item -> site);
 else printf ("null ");
 if (item -> binding != NULL) printf ("\"%s\" ", item -> binding);
 else printf ("null ");

 printf ("%c ", item -> strand);
 printf ("%s ", (item -> database != NULL) ? item -> database : "null");
 printf ("%s ", (item -> file != NULL) ? item -> file : "null");
 printf ("%s ", (item -> reference != NULL) ? item -> reference : "null");

 /* Last field ends the line                                                 */

 printf ("%s\n", (item -> c_range != NULL) ? item -> c_range : "null");
}

