/***************************************************************************/
/*                                                                         */ 
/* Copyright (C) 1991-1995 Daniel Sleator and Davy Temperley               */ 
/* See file "README" for information about commercial use of this system   */
/*                                                                         */
/***************************************************************************/

/*
  This file does the post-processing.  The input is a global array
  called "pp_link_array" whose length is the global "N_links".
  The main routine is "post_process()".  It uses the link names only,
  and not the connectors.  (Since this is now taking a significant
  fraction of total parsing time, it should be rewritten to use hashing
  instead of linear search.  6/96 - see below (ALB).)
  
  A domain is a set of links.  Each domain has a defining link.
  Only certain types of links serve to define a domain.  These
  parameters are set by the lists of link names in a separate,
  human-readable file referred to herein as the 'knowledge file.'
  
  The domains are nested: given two domains, either they're disjoint,
  or one contains the other, i.e. they're tree structured.  The set of links
  in a domain (but in no smaller domain) are called the "group" of the
  domain.  Data structures are built to store all this stuff.
  The tree structured property is not mathematically guaranteed by
  the domain construction algorithm.  Davy simply claims that because
  of how he built the dictionary, the domains will always be so
  structured.  The program checks this and gives an error message
  if it's violated.
  
  Define the "root word" of a link (or domain) to be the word at the
  left end of the link.  The other end of the defining link is called
  the "right word".
  
  The domain corresponding to a link is defined to be the set of links
  reachable by starting from the right word, following links and never
  using the root word or any word to its left.
  
  There are some minor exceptions to this.  The "restricted_link" lists
  those connectors that, even if they point back before the root word,
  are included in the domain.  Some of the starting links are included
  in their domain, these are listed in the "domain_contains_links" list.
  
  Such was the way it was.  Now Davy tells me there should be another type
  of domain that's quite different.  Let's call these "urfl" domains.
  Certain types of connectors start urfl domains.  They are listed in the
  knowledge file.
  In a urfl domain, the search includes the root word.  It does a separate
  search to find urfl domains.
  
  Restricted links should work just as they do with ordinary domains. If they
  come out of the right word, or anything to the right of it (that's
  in the domain), they should be included but should not be traced
  further. If they come out of the root word, they should not be
  included. 
  */   

/*
  I also, unfortunately, want to propose a new type of domain. These
  would include everything that can be reached from the root word of the
  link, to the right, that is closer than the right word of the link.
  (They would not include the link itself.)
  
  In the following sentence, then, the "Urfl_Only Domain" of the G link
  would include only the "O" link:
  
  +-----G----+    
  +---O--+   +-AI+
  |      |   |   |
  hitting dogs is fun.a 
  
  In the following sentence it would include the "O", the "TT", the "I",
  the second "O", and the "A".
  
  +----------------G---------------+    
  +-----TT-----+  +-----O-----+    |    
  +---O---+    +-I+    +---A--+    +-AI+
  |       |    |  |    |      |    |   |
  telling people to do stupid things is fun.a 
  
  This would allow us to judge the following:
  
  kicking dogs bores me
  *kicking dogs kicks dogs
  explaining the program is easy
  *explaining the program is running
  
  (These are distinctions that I thought we would never be able to make,
  so I told myself they were semantic rather than syntactic. But with
  domains, they should be easy.)
  */
 
  /* Modifications, 6/96 ALB: 
     1) Rules and link sets are relegated to a separate, user-written
        file(s), herein referred to as the 'knowledge file'
     2) This information is read by a lexer, in pp_lexer.l (lex code)
        whose exported routines are all prefixed by 'pp_lexer'
     3) when postprocessing a sentence, the links of each domain are
        placed in a set for quick lookup, ('contains one' and 'contains none')
     4) Functions which were never called have been eliminated:
        link_inhabits(), match_in_list(), group_type_contains(),
        group_type_contains_one(),  group_type_contains_all()
     5) Some 'one-by-one' initializations have been replaced by faster
        block memory operations (memset etc.)
     6) The above comments are correct but incomplete! (1/97)
     7) observation: the 'contains one' is, empirically, by far the most 
        violated rule, so it should come first in applying the rules.
   */

#include <stdarg.h>
#include <memory.h>
 
#include "pp_lexer.h"
#include "linkset.h"

/* Capacity limits.  PP_MAX_DOMAINS is the bound asserted by build_domains()
   every time it opens a new domain. */
#define PP_MAX_DOMAINS 128            /* max. possible # of unique domains */
#define PP_MAX_UNIQUE_LINK_NAMES 1024  /* just needs to be approximate */

#define _APPLY_GLOBAL_RULES_  /*define this for some 'quick'n'dirty checking*/

/* Module state.  The set_of_* variables are integer handles of the kind
   passed to the linkset_*() routines, not the sets themselves. */
static char *pp_knowledge_file;     /* name of file containing rules, etc. */
static int q_init_called = 0;       /* post_process_init() called yet?     */
static int set_of_links_of_sentence;/* links seen in any linkage of sent   */
static int n_links_seen=0;          /* size of above set (diagnostic)      */
/* NOTE(review): not referenced in the part of the file visible here;
   presumably set up elsewhere -- confirm before relying on it. */
static int set_of_links_starting_bounded_domain;
/* selectors of the rules retained by post_process_prune_irrelevant_rules() */
static int set_of_links_in_an_active_rule;
static int n_contains_one, n_contains_none;  /* # of rules in each set */
/* counters handed out by post_process_stats(); zeroed by
   post_process_setup_for_sentence() */
static int n_global_rules_firing, n_local_rules_firing;

/* Holds a single post-processing rule. Since rules come in many flavors, not 
   all fields of the following are always relevant:
     - read_contains_rules() fills selector, link_set, link_set_size,
       link_array and msg;
     - read_connected_without_rules() fills link_set and msg;
     - read_bounded_rules() fills domain and msg;
     - read_connected_rule() fills msg only.
   Fields a reader does not fill are left as xalloc() returned them.
   Rule arrays are terminated by an entry whose msg is NULL. */
typedef struct rule_record {
    char    *selector;       /* name of link to which rule applies          */
    int      link_set;       /* set of links relevant to rule               */
    int      link_set_size;  /* size of this set                            */
    char     **link_array;   /* array containing the spelled-out names      */
    int      domain;         /* type of domain to which rule applies        */
    char     *msg; }         /* explanation (NULL=end sentinel in array)    */
RULE_RECORD;

/* A node of the lookup table, associating a domain with each starting link.
   The table is allocated and filled by read_starting_link_table() and is
   scanned linearly by find_domain_name(). */
typedef struct starting_link_and_domain {
  char*   starting_link;              /* private copy of the link name */
  int     domain;       /* domain which the link belongs to (-1: terminator)*/
                        /* stored as the character code of the one-letter
                           domain name read from the knowledge file */
} STARTING_LINK_AND_DOMAIN;

/* the table itself; its last entry has domain == -1 */
STARTING_LINK_AND_DOMAIN *starting_link_lookup_table;

/* rule database is here (read from knowledge file at run-time).
   Each pointer refers to an xalloc'ed array of RULE_RECORDs.  The arrays
   built by read_connected_without_rules(), read_bounded_rules() and
   read_contains_rules() end with an entry whose msg is NULL;
   connected_rules holds exactly one record, whose msg is NULL when the
   knowledge file has no CONNECTED_RULES section. */
static RULE_RECORD *connected_rules;
static RULE_RECORD *connected_without_rules;
static RULE_RECORD *bounded_rules;
static RULE_RECORD *contains_one_rules;
static RULE_RECORD *contains_none_rules;

/* Indices into the two "contains" rule arrays that are worth applying to
   the current sentence. */
static int *relevant_contains_one_rules; /* -1-terminated list of indices */
static int *relevant_contains_none_rules;

/* see knowledge file (and above) for an explanation of these.
   Each one is a link-set handle filled in by read_link_set(). */
static int domain_starter_links;
static int urfl_domain_starter_links;
static int urfl_only_domain_starter_links;
static int domain_contains_links;
static int must_be_connected_without;
static int restricted_links;
static int ignore_these_links;

typedef struct d_tree_leaf_struct D_tree_leaf;
typedef struct domain_struct Domain;

/* One domain of the current linkage (see the description at the top of
   the file). */
struct domain_struct {
  char          *string;      /* name of the starting link (not a copy) */
  int            size;        /* number of entries on lol */
  List_o_links  *lol;         /* the links belonging to this domain */
  int            start_link;  /* the link that started this domain */
  int            type;        /* one letter name */
  D_tree_leaf   *child;       /* leaves attached by build_domain_forest() */
  Domain        *parent;      /* containing domain; NULL for a tree root */
};

/* A leaf of the domain forest: one link, hung under the first domain (in
   size-sorted order) whose link list contains it. */
struct d_tree_leaf_struct {
  Domain *parent;                    /* domain this leaf hangs under */
  int    link;                       /* index into pp_link_array */
  struct d_tree_leaf_struct *next;   /* next leaf of the same domain */
};

static Domain domain_array[MAX_LINKS]; /* the domains, sorted by size. */
static int N_domains;                  /* the number of domains */
static int N_domain_trees;             /* the number of domain trees */

/* a pointer to the list of links out of this word */
static List_o_links * word_links[MAX_SENTENCE]; 

static int visited[MAX_SENTENCE];       /* for the depth-first search */

/************************  utility functions ********************************/

static void check_domain_is_legal(char *p) {
  /* A domain name taken from the knowledge file may not be longer than
     one character; a longer one is reported through error(). */
  if (strlen(p) <= 1) return;
  error("post_process: Domain (%s) must be a single character", p);
}

static void copy_string_into(char *source, char **target)
{
    /* Store in *target a copy of source held in storage obtained from
       xalloc() (room for the text plus its terminating NUL). */
    char *copy = (char*) xalloc ((strlen(source)+1)*sizeof(char));
    *target = copy;
    strcpy(copy, source);
}

int string_in_list(char * s, char * a[]) 
{
  /* Returns TRUE when s matches (as judged by post_process_match) at
     least one entry of the NULL-terminated array a, FALSE otherwise.
     The array entries are post-processing symbols. */
  char **entry;
  for (entry = a; *entry != NULL; entry++) {
    if (post_process_match(*entry, s)) return TRUE;
  }
  return FALSE;
}

/********************** code to read knowledge file *************************/

static void read_starting_link_table(char *label) {
  /* read table of [link, domain type]. This tells us what domain type each 
     link belongs to. */
  char *p;
  int n_pairs, i, n_tokens;

  if (!pp_lexer_set_label(label))
    error("\npost_process: Couldn't find starting link table %s in %s",
	  label,pp_knowledge_file);
  n_tokens = pp_lexer_count_tokens_of_label();
  if (n_tokens %2) 
    error("post_process:Link table must be of format [<link> <domain name>]+");
  n_pairs = n_tokens/2;
  starting_link_lookup_table = (STARTING_LINK_AND_DOMAIN *) 
    xalloc ((1+n_pairs) * sizeof(STARTING_LINK_AND_DOMAIN));
  for (i=0; i<n_pairs; i++)
    {
      /* read the link itself */
      p = pp_lexer_get_next_token_of_label();
      copy_string_into(p, &(starting_link_lookup_table[i].starting_link));
      
      /* read (and store in a set) the domain type of the link */
      p = pp_lexer_get_next_token_of_label();
      check_domain_is_legal(p);
      starting_link_lookup_table[i].domain = (int) p[0];
    }

  /* end sentinel */
  starting_link_lookup_table[n_pairs].domain = -1;
}

static void read_link_set(char *label, int *set_unit) {
  /* Open a link set sized for the tokens listed under `label` in the
     knowledge file, add every one of those tokens to it, and hand the
     set's handle back through set_unit. */
  int remaining, handle;

  if (!pp_lexer_set_label(label))
    error("post_process: Couldn't find link set %s in %s",
          label, pp_knowledge_file);

  remaining = pp_lexer_count_tokens_of_label();
  handle = linkset_open(remaining);
  *set_unit = handle;
  while (remaining-- > 0)
    linkset_add_solid(handle, pp_lexer_get_next_token_of_label());
}

static void read_connected_rule() {
  /* This is a degenerate class of rules: either the rule asserting
     connectivity is there, or it isn't. The only information in the
     rule is the error message if the linkage isn't connected. */
    connected_rules = (RULE_RECORD *) xalloc (sizeof(RULE_RECORD));
    if (!pp_lexer_set_label("CONNECTED_RULES")) {
	connected_rules[0].msg=NULL;            /* there is no such rule */
	printf("WARNING: Not using 'link is connected' rule");
	return;
    }
    if (pp_lexer_count_tokens_of_label()>1)
      error("post_process: Invalid syntax in CONNECTED_RULES (%s)",
	    pp_knowledge_file);
    copy_string_into(pp_lexer_get_next_token_of_label(),
		     &(connected_rules[0].msg));
}

static void read_connected_without_rules() {
    int n_rules, n_commas, n_tokens, r, i;
    char **tokens;
    pp_lexer_set_label("CONNECTED_WITHOUT_RULES");
    n_commas = pp_lexer_count_commas_of_label();
    n_rules = (n_commas + 1)/2;
    connected_without_rules = 
      (RULE_RECORD*) xalloc ((1+n_rules)*sizeof(RULE_RECORD));
    for (r=0; r<n_rules; r++)
      {
	  /* read link set */
	  pp_lexer_get_next_group_of_tokens_of_label(&n_tokens, &tokens);
	  claim(n_tokens>0, "syntax error in pp info file");
	  connected_without_rules[r].link_set=linkset_open(n_tokens);
	  for (i=0; i<n_tokens; i++)
	    linkset_add_solid(connected_without_rules[r].link_set, tokens[i]);
	  
	  /* read error message */
	  pp_lexer_get_next_group_of_tokens_of_label(&n_tokens, &tokens);
	  if (n_tokens>1)
	    error("post_process: Invalid syntax in %s (rule %i of CONNECTED_WITHOUT_RULES)", pp_knowledge_file, r+1);
	  copy_string_into(tokens[0], &(connected_without_rules[r].msg));
      }
    
    /* sentinel entry */
    connected_without_rules[n_rules].msg = NULL;
}

static void read_bounded_rules() {
    char **tokens;
    int n_rules, n_commas, n_tokens, r;
    pp_lexer_set_label("BOUNDED_RULES");
    n_commas = pp_lexer_count_commas_of_label();
    n_rules = (n_commas + 1)/2;
    bounded_rules = (RULE_RECORD*) xalloc ((1+n_rules)*sizeof(RULE_RECORD));
    for (r=0; r<n_rules; r++)
      {
	  /* read domain */	
	  pp_lexer_get_next_group_of_tokens_of_label(&n_tokens, &tokens);
	  if (n_tokens!=1)
	    error("post_process:Invalid syntax in %s:rule %i of BOUNDED_RULES",
		  pp_knowledge_file,r+1);
	  bounded_rules[r].domain = (int) tokens[0][0];
	  
	  /* read error message */
	  pp_lexer_get_next_group_of_tokens_of_label(&n_tokens, &tokens);
	  if (n_tokens!=1) 
	    error("post_process:Invalid syntax in %s:rule %i of BOUNDED_RULES",
		  pp_knowledge_file, r+1);
	  copy_string_into(tokens[0], &(bounded_rules[r].msg));
      }
    
    /* sentinel entry */
    bounded_rules[n_rules].msg = NULL;
}

static void read_contains_rules(RULE_RECORD **rules, char *label, int *n_rules) {
  /* Shared reader for the 'contains_one_rules' and 'contains_none_rules'
     sections of the knowledge file.
       rules   - receives the newly allocated rule array, which ends with
                 an entry whose msg is NULL
       label   - name of the section to read
       n_rules - receives the number of rules read
     Every rule is three comma-separated groups: one selector link, a
     list of link names, and one error message.  The link names are kept
     both as a link set and as a NULL-terminated array of copies. */
  int n_commas, n_tokens, i, r;
  char **tokens;
  RULE_RECORD *rule;

  pp_lexer_set_label(label);
  n_commas = pp_lexer_count_commas_of_label();
  *n_rules = (n_commas + 1)/3;
  *rules = (RULE_RECORD*) xalloc ((1+*n_rules)*sizeof(RULE_RECORD));
  for (r=0; r<*n_rules; r++)
    {
      rule = &(*rules)[r];

      /* read link: exactly one token is required, since tokens[0] is
         read unconditionally below.  The message names the file first
         and the section last, as the other rule readers do. */
      pp_lexer_get_next_group_of_tokens_of_label(&n_tokens, &tokens);
      if (n_tokens!=1)
        error("post_process: Invalid syntax in %s (rule %i of %s)",
              pp_knowledge_file, r+1, label);
      copy_string_into(tokens[0], &rule->selector);

      /* read link set */
      pp_lexer_get_next_group_of_tokens_of_label(&n_tokens, &tokens);
      rule->link_set = linkset_open(n_tokens);
      rule->link_set_size = n_tokens;
      rule->link_array = (char **) xalloc((1+n_tokens)*sizeof(char*));
      for (i=0; i<n_tokens; i++)
        {
          linkset_add_solid(rule->link_set, tokens[i]);
          copy_string_into(tokens[i], &rule->link_array[i]);
        }
      rule->link_array[i] = NULL; /* NULL-terminator */

      /* read error message (again exactly one token) */
      pp_lexer_get_next_group_of_tokens_of_label(&n_tokens, &tokens);
      if (n_tokens!=1)
        error("post_process: Invalid syntax in %s (rule %i of %s)",
              pp_knowledge_file, r+1, label);
      copy_string_into(tokens[0], &rule->msg);
    }

  /* sentinel entry */
  (*rules)[*n_rules].msg = NULL;
}


static void read_knowledge_file(char *kfile) {
  /* Load the whole 'knowledge file' into the static tables defined above:
     first the starting-link table, then the link sets, then the rules.
     Keeping this data in a file rather than in the source lets the rule
     database change without recompiling.  The file is located with
     dictopen(), i.e. in the same places as the dictionary. */
  static const struct {
    char *label;   /* section name in the knowledge file */
    int  *set;     /* where the resulting link-set handle goes */
  } link_sets[] = {
    { "DOMAIN_STARTER_LINKS",            &domain_starter_links },
    { "URFL_DOMAIN_STARTER_LINKS",       &urfl_domain_starter_links },
    { "DOMAIN_CONTAINS_LINKS",           &domain_contains_links },
    { "IGNORE_THESE_LINKS",              &ignore_these_links },
    { "RESTRICTED_LINKS",                &restricted_links },
    { "MUST_BE_CONNECTED_WITHOUT_LINKS", &must_be_connected_without },
    { "URFL_ONLY_DOMAIN_STARTER_LINKS",  &urfl_only_domain_starter_links }
  };
  FILE *knowledge;
  int k;

  copy_string_into(kfile, &pp_knowledge_file);
  if (verbosity>1) printf("post_process: parsing knowledge file...\n");

  knowledge = dictopen(pp_knowledge_file, "r");
  if (knowledge==NULL)
     error("Couldn't find post-process knowledge file %s",pp_knowledge_file);
  if (verbosity>1)
     printf("post-process: reading rules from %s\n", pp_knowledge_file);
  pp_lexer_open(knowledge);

  if (verbosity>1) printf("post_process: reading link table...\n");
  read_starting_link_table("STARTING_LINK_TYPE_TABLE");

  if (verbosity>1) printf("post_process: reading link sets...\n");
  for (k=0; k < (int) (sizeof(link_sets)/sizeof(link_sets[0])); k++)
    read_link_set(link_sets[k].label, link_sets[k].set);

  if (verbosity>1) printf("post_process: reading rules...\n");
  read_connected_rule();
  read_connected_without_rules();
  read_bounded_rules();
  read_contains_rules(&contains_one_rules,"CONTAINS_ONE_RULES",&n_contains_one);
  read_contains_rules(&contains_none_rules,"CONTAINS_NONE_RULES",&n_contains_none);
  if (verbosity>1) printf("post_process: done reading rules.\n");
}


static int find_domain_name(char *link) {
  /* Return the name of the domain used for this string. Return -1 if not in
     the list. Does this by looking up string in set of starting connectors,
     built from reading the knowledge file, and then finding the domain name
     appropriate to that connector */
  int i,domain;
  for (i=0;;i++) {
     domain = starting_link_lookup_table[i].domain;
    /* have we reached the end of the array of starting links? */
    if (domain==-1) return -1;
    /* does the i'th starting link match the provided link? */
    if (post_process_match(starting_link_lookup_table[i].starting_link, link))
      return domain;
  }
}

static void build_graph(void) {
  /* Rebuild word_links[]: for every word, the list of links touching it,
     each entry naming the link and the word at its other end.  Links
     whose left end is -1 and links in ignore_these_links are left out.
     The dir fields are not set, since this (after fat-link-extraction)
     is an undirected graph. */
  List_o_links *node;
  int w, link, left, right;

  for (w=0; w<N_words; w++)
    word_links[w] = NULL;

  for (link=0; link<N_links; link++) {
    left = pp_link_array[link].l;
    if (left == -1) continue;
    if (linkset_match(ignore_these_links, pp_link_array[link].name)) continue;
    right = pp_link_array[link].r;

    /* entry on the left word's list, pointing at the right word */
    node = (List_o_links *) xalloc(sizeof(List_o_links));
    node->link = link;
    node->word = right;
    node->next = word_links[left];
    word_links[left] = node;

    /* mirror entry on the right word's list */
    node = (List_o_links *) xalloc(sizeof(List_o_links));
    node->link = link;
    node->word = left;
    node->next = word_links[right];
    word_links[right] = node;
  }
}

static void add_link_to_domain(int link) {
  List_o_links *lol;
  lol = (List_o_links *) xalloc(sizeof(List_o_links));
  lol->next = domain_array[N_domains].lol;
  domain_array[N_domains].lol = lol;
  domain_array[N_domains].size++;
  lol->link = link;
}

static void depth_first_search(int w, int root, int start_link) {
  /* Search used for ordinary domains (called from build_domains with w
     = right word of the starting link and root = its left word).
     Marks w visited, adds w's leftward links to the domain under
     construction, then recurses into w's neighbours. */
  List_o_links *lol;
  visited[w] = TRUE;
  /* Collect every link that goes from w to a word on its left, except
     the starting link itself. */
  for (lol = word_links[w]; lol != NULL; lol = lol->next) {
    if (lol->word < w && lol->link != start_link) {
      add_link_to_domain(lol->link);
    }
  }
  /* Continue through unvisited neighbours.  The root word is never
     entered, and a restricted link is not followed when it leads to a
     word that is left of both the root and w. */
  for (lol = word_links[w]; lol != NULL; lol = lol->next) {
    if (!visited[lol->word] && (lol->word != root) &&
	!(lol->word < root && lol->word < w &&
          linkset_match(restricted_links,pp_link_array[lol->link].name)))
      depth_first_search(lol->word, root, start_link);
  }
}

static void bad_depth_first_search(int w, int root, int start_link) {
  /* Search used for urfl domains (called from build_domains with w =
     right word of the starting link and root = its left word).  Unlike
     depth_first_search(), the root word may be entered. */
  List_o_links * lol;
  visited[w] = TRUE;
  /* Collect every link that goes from w to a word on its left, except
     the starting link; nothing is collected while standing on the root. */
  for (lol = word_links[w]; lol != NULL; lol = lol->next) {
    if ((lol->word < w)  && (lol->link != start_link) && (w != root)) {
      add_link_to_domain(lol->link);
    }
  }
  /* Continue through unvisited neighbours, except that from the root no
     link to a word on its left is followed, and a restricted link is
     not followed when it leads to a word left of both the root and w. */
  for (lol = word_links[w]; lol != NULL; lol = lol->next) {
    if ((!visited[lol->word]) && !(w == root && lol->word < w) &&
	!(lol->word < root && lol->word < w && 
          linkset_match(restricted_links,pp_link_array[lol->link].name)))
      bad_depth_first_search(lol->word, root, start_link);
  }
}

static void d_depth_first_search(int w, int root, int right, int start_link) {
  /* Search used for urfl_only domains.  build_domains starts it on the
     root word itself (w == root == left word of the starting link), with
     `right` being the right word of the starting link. */
  List_o_links * lol;
  visited[w] = TRUE;
  /* Collect every link that goes from w to a word on its left, except
     the starting link; nothing is collected while standing on the root. */
  for (lol = word_links[w]; lol != NULL; lol = lol->next) {
    if ((lol->word < w) && (lol->link != start_link) && (w != root)) {
      add_link_to_domain(lol->link);
    }
  }
  /* Continue through unvisited neighbours.  From the root, links to
     words at or beyond `right` and links to words left of the root are
     not followed; and a restricted link is not followed when it leads
     to a word left of both the root and w. */
  for (lol = word_links[w]; lol != NULL; lol = lol->next) {
    if (!visited[lol->word] && !(w == root && lol->word >= right) &&
	!(w == root && lol->word < root) &&
	!(lol->word < root && lol->word < w && 
          linkset_match(restricted_links,pp_link_array[lol->link].name)))
      d_depth_first_search(lol->word, root, right, start_link);
  }
}    

static int domain_compare(const void *d1, const void *d2) {
  /* qsort() comparison function ordering domains by increasing size.
     d1 and d2 point at Domain objects.  The parameters are declared as
     const void * so that the function has exactly the type qsort() calls
     it through; calling a function through a pointer to an incompatible
     function type is undefined behaviour.
     Returns a negative, zero or positive value when d1's size is
     smaller than, equal to or greater than d2's. */
  int size1 = ((const Domain *) d1)->size;
  int size2 = ((const Domain *) d2)->size;
  return (size1 > size2) - (size1 < size2);
}
    

static void setup_domain_array(int n, char *string, int start_link) {
  memset(visited, 0, N_words*sizeof(int));   /* set visited[i] to FALSE */
  domain_array[n].string = string;
  domain_array[n].lol        = NULL;
  domain_array[n].size       = 0;
  domain_array[n].start_link = start_link;
}

static void build_domains(void) {
  int link, i, d;
  char *string;

  N_domains = 0;
  for (link = 0; link<N_links; link++) {
    if (pp_link_array[link].l == -1) continue;	
    string = pp_link_array[link].name;
    if (linkset_match(ignore_these_links, string)) continue;
    if (linkset_match(domain_starter_links, string)) {
	setup_domain_array(N_domains, string, link);
        if (linkset_match(domain_contains_links, string)) 
	  add_link_to_domain(link);
	depth_first_search(pp_link_array[link].r, 
			   pp_link_array[link].l, 
			   link);
	N_domains++;
        assert(N_domains<PP_MAX_DOMAINS, "raise value of PP_MAX_DOMAINS");
    } 
    else if (linkset_match(urfl_domain_starter_links,string)) {
	setup_domain_array(N_domains, string, link);
	/* always add the starter link to its urfl domain */
	add_link_to_domain(link);
	bad_depth_first_search(pp_link_array[link].r,
			       pp_link_array[link].l,
			       link);
	N_domains++;
        assert(N_domains<PP_MAX_DOMAINS, "raise value of PP_MAX_DOMAINS");
    }
    else if (linkset_match(urfl_only_domain_starter_links,string)) {
	setup_domain_array(N_domains, string, link);
	/* do not add the starter link to its urfl_only domain */
	d_depth_first_search(pp_link_array[link].l,
			     pp_link_array[link].l,
			     pp_link_array[link].r,
			     link);
	N_domains++;
        assert(N_domains<PP_MAX_DOMAINS, "raise value of PP_MAX_DOMAINS");
    }
}
    
  /* sort the domains by size */
  qsort((void *) domain_array, 
	N_domains, 
	sizeof(Domain),
	(int (*)(const void *, const void *)) domain_compare);
  
  /* sanity check: all links in all domains have a legal domain name */
  for (d=0; d<N_domains; d++) {
    i = find_domain_name(domain_array[d].string);
    if (i==-1) 
       error("post_process: Need an entry for %s in LINK_TYPE_TABLE",
           domain_array[d].string);
    domain_array[d].type = i;
  }
}

static int contained_in(Domain * d1, Domain * d2) {
  /* TRUE when every link on d1's list also appears on d2's list,
     FALSE otherwise. */
  char in_d2[MAX_LINKS];
  List_o_links *node;

  memset(in_d2, 0, N_links*sizeof(char));
  for (node = d2->lol; node != NULL; node = node->next)
    in_d2[node->link] = TRUE;

  /* walk d1 until its end or the first link that d2 lacks */
  node = d1->lol;
  while (node != NULL && in_d2[node->link])
    node = node->next;
  if (node != NULL) return FALSE;
  return TRUE;
}

static int link_in_domain(int link, Domain * d) {
  /* TRUE when `link` occurs on the link list of domain d, else FALSE */
  List_o_links *node = d->lol;
  while (node != NULL) {
    if (node->link == link) return TRUE;
    node = node->next;
  }
  return FALSE;
}

static int check_domain_nesting() {
  /* Returns TRUE if the domains form a properly nested structure.
     For every pair of domains each link is classified as being in
     neither (0), only the later one (1), only the earlier one (2) or
     both (3); a pair that has links in all of classes 1, 2 and 3
     overlaps without one containing the other, and yields FALSE. */
  int earlier, later, k;
  int tally[4];
  char membership[MAX_LINKS];
  List_o_links *node;

  for (earlier=0; earlier<N_domains; earlier++) {
    for (later=earlier+1; later<N_domains; later++) {
      memset(membership, 0, N_links*sizeof(char));
      for (node=domain_array[later].lol; node != NULL; node = node->next)
        membership[node->link] = 1;
      for (node=domain_array[earlier].lol; node != NULL; node = node->next)
        membership[node->link] += 2;

      tally[0] = 0; tally[1] = 0; tally[2] = 0; tally[3] = 0;
      for (k=0; k<N_links; k++)
        tally[(int) membership[k]]++;

      if (tally[1] > 0 && tally[2] > 0 && tally[3] > 0)
        return FALSE;
    }
  }
  return TRUE;
}

static void build_domain_forest() {
  int d, d1, link;
  D_tree_leaf * dtl;
  if (N_domains > 0) 
    domain_array[N_domains-1].parent = NULL;
  N_domain_trees = 1;
  for (d=0; d < N_domains-1; d++) {
    for (d1 = d+1; d1 < N_domains; d1++) {
      if (contained_in(&domain_array[d], &domain_array[d1])) {
	domain_array[d].parent = &domain_array[d1];
	break;
      }
    }
    if (d1 == N_domains) {
      /* we know this domain is a root of a new tree */
      domain_array[d].parent = NULL;
      N_domain_trees++;
      /* It's now ok for this to happen.  It used to do:
	 printf("I can't find a parent domain for this domain\n");
	 print_domain(d);
	 exit(1); */
    }
  }
  /* the parent links of domain nodes have been established.
     now do the leaves */
  for (d=0; d < N_domains; d++) {
    domain_array[d].child = NULL;
  }
  for (link=0; link < N_links; link++) {
    if (pp_link_array[link].l == -1) continue; /* probably not necessary */
    for (d=0; d<N_domains; d++) {
      if (link_in_domain(link, &domain_array[d])) {
	dtl = (D_tree_leaf *) xalloc(sizeof(D_tree_leaf));
	dtl->link = link;
	dtl->parent = &domain_array[d];
	dtl->next = domain_array[d].child;
	domain_array[d].child = dtl;
	break;
      }
    }
  }
}


static void free_list_o_links(List_o_links *lol) {
  /* Give every node of the list starting at lol back through xfree().
     Strings are not freed. */
  List_o_links *following;
  for (; lol != NULL; lol = following) {
    following = lol->next;
    xfree((char *)lol, sizeof(List_o_links));
  }
}

static void free_D_tree_leaves(D_tree_leaf *dtl) {
  /* Give every leaf of the list starting at dtl back through xfree(). */
  D_tree_leaf *following;
  for (; dtl != NULL; dtl = following) {
    following = dtl->next;
    xfree((char *)dtl, sizeof(D_tree_leaf));
  }
}

static void free_post_processing_structures(void) {
  int w, d;
  for (w=0; w<N_words; w++) 
    free_list_o_links(word_links[w]);
    
  for (d=0; d<N_domains; d++) {
    free_list_o_links(domain_array[d].lol);
    free_D_tree_leaves(domain_array[d].child);
  }
}

static void connectivity_dfs(int w, int set_unit) {
  /* Mark w visited, then recurse into each neighbour that is still
     unvisited and is joined to w by a link whose name is not in the
     link set set_unit. */
  List_o_links *edge;
  visited[w] = TRUE;
  for (edge = word_links[w]; edge != NULL; edge = edge->next) {
    if (visited[edge->word]) continue;
    if (linkset_match(set_unit, pp_link_array[edge->link].name)) continue;
    connectivity_dfs(edge->word, set_unit);
  }
}

static void mark_reachable_words(int w){
  /* Set visited[] for w and for every word reachable from it through
     word_links[]; words already marked are not expanded again. */
  List_o_links *edge;
  if (!visited[w]) {
    visited[w] = TRUE;
    for (edge = word_links[w]; edge != NULL; edge = edge->next)
      mark_reachable_words(edge->word);
  }
}

static int is_connected(void) {
  /* Returns TRUE if the linkage is connected, looking only at words
     that have at least one link: words without links are pre-marked as
     visited, so they cannot make the test fail.  This keeps conjunctive
     sentences from being thrown out. */
  int w;

  for (w=0; w<N_words; w++)
    visited[w] = (word_links[w] == NULL);
  mark_reachable_words(0);

  /* find the first word that was never reached, if there is one */
  w = 0;
  while (w<N_words && visited[w])
    w++;
  if (w<N_words) return FALSE;
  return TRUE;
}


static D_type_list ** build_type_array(void) {
  D_type_list ** array, * dtl;
  int d, i;
  List_o_links * lol;
    
  array = NULL;  /* stop uninitialized variable compiler warnings */
  if (N_links > 0) array = (D_type_list **) xalloc(N_links * sizeof(D_type_list *));
  for (i=0; i<N_links; i++) {
    array[i] = NULL;
  }
  for (d=0; d<N_domains; d++) {
    for (lol=domain_array[d].lol; lol != NULL; lol = lol->next) {
      dtl = (D_type_list *) xalloc(sizeof(D_type_list));
      dtl->next = array[lol->link];
      array[lol->link] = dtl;
      dtl->type = domain_array[d].type;
    }
  }
  return array;
}

static PP_node * bogus_pp_node(void) {
  /* Construct and return a vacuus pp_node.
     This is for when post processing is turned off.
       */
  PP_node * pp_return;
  int link;
  pp_return = (PP_node *) xalloc(sizeof(PP_node));
  pp_return->d_type_array =
    (D_type_list **) xalloc(N_links * sizeof(D_type_list *));
  for (link=0; link<N_links; link++) {
    pp_return->d_type_array[link] = NULL;
  }
  pp_return->v = NULL;
  return pp_return;
}


/************************* rule application ********************************/

static int apply_connected(RULE_RECORD *rule) {
  /* The single (optional) rule asserting that the linkage is connected.
     Returns 1 when is_connected() holds and 0 otherwise; the rule
     record itself is not consulted. */
  return is_connected() ? 1 : 0;
}

static int apply_connected_without(RULE_RECORD *rule) {
  /* Returns FALSE if removing the links named in the rule's link set
     changes the connectivity of the part of the graph reachable from
     word 0, TRUE otherwise.

     Words not reachable from word 0 at all are first marked as already
     visited; the search from word 0 that skips the rule's links must
     then manage to reach every remaining word. */
  int w;

  memset(visited, 0, N_words*sizeof(int));
  mark_reachable_words(0);
  for (w=0; w<N_words; w++)
    visited[w] = !visited[w];

  connectivity_dfs(0, rule->link_set);

  w = 0;
  while (w<N_words) {
    if (visited[w] == FALSE) return FALSE;
    w++;
  }
  return TRUE;
}


static int apply_bounded(RULE_RECORD *rule) {
  /* For every domain whose type equals the rule's domain, checks that
     no link of the domain has its left word to the left of the left
     word of the domain's starting link.  Returns FALSE at the first
     violation, TRUE if there is none. */
  Domain *dom;
  List_o_links *node;
  int root_word;

  for (dom = domain_array; dom < domain_array + N_domains; dom++) {
    if (dom->type != rule->domain) continue;
    root_word = pp_link_array[dom->start_link].l;
    for (node = dom->lol; node != NULL; node = node->next)
      if (pp_link_array[node->link].l < root_word) return FALSE;
  }
  return TRUE;
}

static int apply_contains_none(RULE_RECORD *rule) 
{
  /* Returns TRUE if and only if no group (the leaves of one domain)
     that contains a link matching the rule's selector also contains a
     link matching an entry of the rule's link_array.  Matching is
     whatever post_process_match() implements. */
  D_tree_leaf *leaf;
  int d, has_selector;

  for (d=0; d<N_domains; d++) {
    /* does the selector occur among this domain's leaves? */
    has_selector = 0;
    for (leaf = domain_array[d].child; leaf != NULL; leaf = leaf->next) {
      if (post_process_match(rule->selector, pp_link_array[leaf->link].name)) {
        has_selector = 1;
        break;
      }
    }
    if (!has_selector) continue;

    /* it does: none of the leaves may be in the forbidden list */
    for (leaf = domain_array[d].child; leaf != NULL; leaf = leaf->next) {
      if (string_in_list(pp_link_array[leaf->link].name, rule->link_array))
        return FALSE;
    }
  }
  return TRUE;
}

int apply_contains_one_globally(RULE_RECORD *rule)
{
  /* Sentence-wide version of the 'contains one' test.  Returns TRUE when
     (1) no link of the linkage matches the rule's selector, or
     (2) one does, and some link of the linkage matches an entry of the
         rule's link_array;
     otherwise returns FALSE. */
  int k;

  /* stop at the first link matching the selector */
  k = 0;
  while (k<N_links && !post_process_match(rule->selector, pp_link_array[k].name))
    k++;
  if (k==N_links) return TRUE;

  /* selector link of rule appears in sentence */
  for (k=0; k<N_links; k++) {
    if (string_in_list(pp_link_array[k].name, rule->link_array))
      return TRUE;
  }
  return FALSE;
}


int apply_contains_one(RULE_RECORD *rule) 
{
  /* Returns TRUE if and only if every group (the leaves of one domain)
     that contains a link matching the rule's selector also contains at
     least one link matching an entry of the rule's link_array.
     Matching is whatever post_process_match() implements. */
  D_tree_leaf *leaf;
  int d, has_selector;

  for (d=0; d<N_domains; d++) {
    /* does the selector occur among this domain's leaves? */
    has_selector = 0;
    for (leaf = domain_array[d].child; leaf != NULL; leaf = leaf->next) {
      if (post_process_match(rule->selector, pp_link_array[leaf->link].name)) {
        has_selector = 1;
        break;
      }
    }
    if (!has_selector) continue;

    /* it does: look for a leaf from the required list */
    leaf = domain_array[d].child;
    while (leaf != NULL &&
           !string_in_list(pp_link_array[leaf->link].name, rule->link_array))
      leaf = leaf->next;
    if (leaf == NULL) return FALSE;
  }
  return TRUE;
}


/* Applies apply_fn to each rule of rule_array in order.  The array is
   terminated by a record whose msg field is NULL.  Before each rule is
   tried, *msg is set to that rule's msg, so when a rule fails *msg names
   the failing rule.  Returns 0 at the first rule for which apply_fn
   returns zero; returns 1 (leaving *msg NULL) when every rule passes. */
static int apply_rules(int (apply_fn) (RULE_RECORD *),
		       RULE_RECORD *rule_array,   
		       char **msg) 
{
  RULE_RECORD *r = rule_array;

  for (;;) {
    *msg = r->msg;
    if (*msg == NULL) return 1;      /* reached the terminator record */
    if (!apply_fn(r)) return 0;      /* rule violated; *msg identifies it */
    r++;
  }
}

/* Applies apply_fn only to the rules of rule_array whose indices are
   listed in relevant_rules (an int array terminated by -1).  Before each
   rule is tried, *msg is set to that rule's msg, so on failure *msg names
   the failing rule.  Returns 0 at the first rule for which apply_fn
   returns zero, 1 when all listed rules pass.

   When n_links_seen is 0, i.e. no link names were accumulated for this
   sentence, the relevant_rules list is not used and every rule in
   rule_array is applied via apply_rules(). */
static int apply_relevant_rules(int (apply_fn) (RULE_RECORD *),
				RULE_RECORD *rule_array,   
				int *relevant_rules,
				char **msg) 
{
  int i, idx;

  /* if we didn't accumulate link sets for this sentence, we need to apply
     all rules */
  if (n_links_seen==0) return apply_rules(apply_fn, rule_array,msg);

  /* we did, so only the pre-selected rules need to be checked */
  for (i=0; (idx=relevant_rules[i])!=-1; i++) {
    /* Store the message outside the assertion: the caller relies on *msg
       being set, so the assignment must not live inside an expression
       that exists only to be checked. */
    *msg = rule_array[idx].msg;
    /* a NULL msg marks the terminator record: the index fell off the end.
       NOTE(review): assert here is the two-argument form used by this
       file, presumably a project macro rather than <assert.h>'s. */
    assert(*msg != NULL, "foo");
    if (!apply_fn(&(rule_array[idx]))) return 0;
  }
  return 1;
}
  
/****************************** exported ******************************/

/* Reports the rule-firing counters through the two out-parameters:
     *n_local_rules   receives n_local_rules_firing
                      (# of times domain rules fired)
     *n_global_rules  receives n_global_rules_firing
                      (# of times global rules fired)
   Both pointers are dereferenced unconditionally. */
void post_process_stats(int *n_local_rules, int *n_global_rules)
{
  *n_local_rules = n_local_rules_firing;
  *n_global_rules = n_global_rules_firing;
}

/* Resets the per-sentence state of the post-processor: clears the set of
   link names seen in the sentence and the set of selector links of active
   rules, and zeroes the seen-link counter and both rule-firing counters. */
void post_process_setup_for_sentence()
{
  linkset_clear(set_of_links_of_sentence);
  linkset_clear(set_of_links_in_an_active_rule);

  n_links_seen = 0;
  n_local_rules_firing = 0;
  n_global_rules_firing = 0;
}


void post_process_scan_linkage()
{
   int i;
   for (i=0; i<N_links; i++) 
     n_links_seen += 
	 linkset_add_solid(set_of_links_of_sentence, pp_link_array[i].name);
}

/* Selects the 'contains one' and 'contains none' rules worth applying to
   the current sentence.  A rule is kept when linkset_match_bw() reports
   that its selector matches something in set_of_links_of_sentence.  The
   indices of the kept rules are written, in order and terminated by -1,
   to relevant_contains_one_rules / relevant_contains_none_rules, and each
   kept rule's selector is added to set_of_links_in_an_active_rule.
   Both rule arrays are terminated by a record whose msg is NULL.
   Progress is printed when verbosity > 1. */
void post_process_prune_irrelevant_rules()
{
   int i, kept;

   if (verbosity>1)
      printf("Saw %i unique link names in all linkages.\n", n_links_seen);

   /* 'contains one' rules */
   kept = 0;
   for (i = 0; contains_one_rules[i].msg != NULL; i++) {
      if (!linkset_match_bw(set_of_links_of_sentence,
                            contains_one_rules[i].selector))
         continue;
      relevant_contains_one_rules[kept++] = i;
      linkset_add(set_of_links_in_an_active_rule,
                  contains_one_rules[i].selector);
   }
   relevant_contains_one_rules[kept] = -1;
   if (verbosity>1)
      printf("Using %i/%i 'contains one' rules\n", kept, n_contains_one);

   /* 'contains none' rules */
   kept = 0;
   for (i = 0; contains_none_rules[i].msg != NULL; i++) {
      if (!linkset_match_bw(set_of_links_of_sentence,
                            contains_none_rules[i].selector))
         continue;
      relevant_contains_none_rules[kept++] = i;
      linkset_add(set_of_links_in_an_active_rule,
                  contains_none_rules[i].selector);
   }
   relevant_contains_none_rules[kept] = -1;
   if (verbosity>1)
      printf("Using %i/%i 'contains none' rules\n", kept, n_contains_none);
}


/* Releases a PP_node and everything hanging off it:
     - if p->d_type_array is non-NULL, each of its N_links D_type_list
       chains, and then (only when N_links > 0) the array itself;
     - the Violation_list chain p->v;
     - the node p.
   Sizes passed to xfree use the current value of the global N_links.
   p itself must not be NULL; it is dereferenced unconditionally. */
void free_PP_node(PP_node * p) {
  D_type_list *node, *node_next;
  Violation_list *viol, *viol_next;
  int link;

  if (p->d_type_array != NULL) {
    for (link = 0; link < N_links; link++) {
      node = p->d_type_array[link];
      while (node != NULL) {
        node_next = node->next;    /* grab the successor before freeing */
        xfree((char *) node, sizeof(D_type_list));
        node = node_next;
      }
    }
    if (N_links > 0)
      xfree((char *) p->d_type_array, N_links * sizeof(D_type_list *));
  }

  viol = p->v;
  while (viol != NULL) {
    viol_next = viol->next;
    xfree((char *) viol, sizeof(Violation_list));
    viol = viol_next;
  }

  xfree((char *) p, sizeof (PP_node));
}


/**********************************************************************/

/* string comparison in postprocessing. The first parameter is a
   post-processing symbol. The second one is a connector name from a link. The
   upper case parts must match. We imagine that the first arg is padded with an
   infinite sequence of "#" and that the 2nd one is padded with "*". "#"
   matches anything, but "*" is just like an ordinary char for matching 
   purposes. For efficiency's sake there are several different versions of
   these functions */

int post_process_match(char *s, char *t) {
/* String comparison used in post-processing.
   s is a post-processing symbol; t is a connector name from a link.
   Returns TRUE if they match, FALSE otherwise:
     - the leading upper-case parts must be identical (same letters and
       same length);
     - after that, s is treated as if padded with an infinite sequence
       of "#" and t as if padded with an infinite sequence of "*";
     - "#" in s matches any character of t, while "*" is compared like
       an ordinary character. */
    char c;

    /* The cast to unsigned char is required: passing a negative plain
       char to isupper() is undefined behaviour. */
    while (isupper((unsigned char) *s) || isupper((unsigned char) *t)) {
	if (*s != *t) return FALSE;
	s++;
	t++;
    }

    /* s running out first is fine: its implicit "#" padding matches the
       rest of t */
    while (*s != '\0') {
	if (*s != '#') {
	    /* past the end of t we compare against its "*" padding */
	    c = (*t == '\0') ? '*' : *t;
	    if (*s != c) return FALSE;
	}
	s++;
	if (*t != '\0') t++;
    }
    return TRUE;
}

/* One-time initialisation of the post-processor.
   kfile is handed to read_knowledge_file().  Afterwards this routine
     - builds set_of_links_starting_bounded_domain from the starting links
       whose domain is the domain of some bounded rule,
     - opens the (empty) per-sentence link sets, and
     - allocates the two "relevant rule" index arrays, each with room for
       every rule of its kind plus a terminating entry.
   Calling it a second time, or failing to allocate the index arrays, is
   reported through error(). */
void post_process_init(char *kfile) {
    int d,i,j,n,domain_of_rule;

    if (q_init_called) 
      error("post_process: only call post_process_init() once!");
    q_init_called=1;
    read_knowledge_file(kfile);

    /* construct set of links which can start a bounded domain */
    set_of_links_starting_bounded_domain=linkset_open(PP_MAX_UNIQUE_LINK_NAMES);
    n=0;
    for (i=0; bounded_rules[i].msg!=NULL; i++) {
	domain_of_rule = bounded_rules[i].domain;
	/* the lookup table is terminated by an entry whose domain is -1 */
	for (j=0; (d=(starting_link_lookup_table[j].domain))!=-1; j++) {
	    if (d==domain_of_rule) 
		n+=linkset_add(set_of_links_starting_bounded_domain,
			       starting_link_lookup_table[j].starting_link);
	}
    }
    if (verbosity>1) printf("%i links can start a bounded domain.\n", n);

    /* this set will store all the 'seen' links in any linkage
       of the current sentence */
    set_of_links_of_sentence=linkset_open(PP_MAX_UNIQUE_LINK_NAMES);

    /* this set  will store the links which are selectors in an "active"
       rule (i.e. one that is used for some linkage of this sentence) */
    set_of_links_in_an_active_rule = linkset_open(PP_MAX_UNIQUE_LINK_NAMES);

    /* init arrays which will contain list of relevant rules for a sent.
       Each array gets one slot per rule plus one for the -1 terminator. */
    for (i=0; contains_one_rules[i].msg != NULL; i++);
    relevant_contains_one_rules = (int *) malloc ((i+1)*sizeof(int));
    for (i=0; contains_none_rules[i].msg != NULL; i++);
    relevant_contains_none_rules = (int *) malloc ((i+1)*sizeof(int));

    /* these arrays are written unconditionally later on, so an allocation
       failure must not go unnoticed */
    if (relevant_contains_one_rules == NULL ||
	relevant_contains_none_rules == NULL)
      error("post_process_init: out of memory allocating rule index arrays");
}


/* Runs the post-processing rules on the current linkage.
   Returns:
     -1  a global 'contains one' rule failed, before the graph and domains
         were built (only possible when _APPLY_GLOBAL_RULES_ is defined);
      1  one of the domain-level rule sets failed;
      0  every rule passed.
   *msg is set by the rule-application helpers to the msg of the rule
   tried last, so on failure it names the violated rule. */
int pp_process(char **msg) 
{
#ifdef _APPLY_GLOBAL_RULES_
  if (!apply_relevant_rules(apply_contains_one_globally, contains_one_rules,
                            relevant_contains_one_rules, msg))
    return -1;
#endif

  /* build graph; confirm that it's legally connected */
  build_graph();
  build_domains();
  build_domain_forest();

#ifndef FOR_RELEASE
  if (!check_domain_nesting() && display_bad)
    printf("WARNING: The domains are not nested.\n");
#endif

  /* the rule sets are tried in this fixed order; the first failure wins */
  if (!apply_relevant_rules(apply_contains_one, contains_one_rules,
                            relevant_contains_one_rules, msg))
    return 1;
  if (!apply_relevant_rules(apply_contains_none, contains_none_rules,
                            relevant_contains_none_rules, msg))
    return 1;
  if (!apply_rules(apply_connected_without, connected_without_rules, msg))
    return 1;
  if (!apply_rules(apply_connected, connected_rules, msg))
    return 1;
  if (!apply_rules(apply_bounded, bounded_rules, msg))
    return 1;

  return 0;
}


PP_node * post_process(void) {
  /* Takes as input:
     N_words (to know how big to make the graph).
     N_links, pp_link_array[]  (of course also uses the connectors, etc.
     that are reachable from the pp_link_array).
     pp_link_array[i].l = -1 means that this connector is to be ignored.
     Returns:
     For each link, the domain structure of that link.
     A list of the violation strings.
     */      

  static char *msg;
  Violation_list *v; 
  PP_node *pp_return;
  if (!q_init_called) 
    error("call post_process_init() before post_process()!");
  if(!postprocess_defined) return bogus_pp_node(); 

  pp_return = (PP_node *) xalloc(sizeof(PP_node));

  switch(pp_process(&msg))
    {
    case -1:
      /* global test failed before we could build the domains, etc. */
      n_global_rules_firing++;
      xfree((char *)pp_return, sizeof(PP_node));
      return NULL;
      break;
    case 1:
      /* one of the tests failed - issue violation and bail */
      n_local_rules_firing++;
      v = (Violation_list *) xalloc(sizeof (Violation_list));
      v->string    = msg;
      v->next      = NULL;
      pp_return->v = v;
      break; 
    case 0:
      /* everything hunky dorey */
      pp_return->v = NULL;
      break;
    }

  pp_return->d_type_array = build_type_array(); 
  free_post_processing_structures();  
  return pp_return;
}







