/* funiq.c * * Written by Bill Vaughn, * CVS, Univ. of Rochester, Rochester, NY * {seismo,allegra,decvax}!rochester!ur-cvsvax!bill * * This program reads one line from the standard input, searches for * the line in an evolving binary tree sorted alphabetically. * If the line is NOT found, it is inserted in the tree and put on * the standard output. No action is taken if the line IS found. * * The program is intended to work with one-word-per-line input, * but doesn't enforce that rule. The input to 'funiq' should be * random, NOT sorted. Sorted input gives worst case running time. * The program dynamically allocates all of its array space and uses * a binary tree storage method which, according to theory, has an * average search time proportional to log(N), where N is the number * of nodes in the tree at the time the search is performed. * * Principal uses: * 1) 'Funiq' can replace the 'sort -u' in the pipeline in /usr/bin/spell. * 'Funiq' will not block the pipe and it will run faster than sort. * The input to /usr/lib/spell doesn't have to be sorted anyway; the * final sort in the pipe does that. The reason for the 'sort -u' at * that point in the pipeline is to limit the calls to the hashing * function in /usr/lib/spell. 'Funiq' accomplishes this task much * more efficiently. * * 2) Also, 'funiq' can replace the default mode of 'uniq' in cases where * the output need not be sorted. The input should not be sorted * (as opposed to 'uniq', where is must be sorted.) * * Possible improvements: * Use a balancing method on the binary tree to improve * performance. See Knuth, 'Searching and Sorting', pp. 451-463 * for an algorithm to balance binary trees during insertion. * It will require an extra byte of data per node however. * (Further analysis reveals that this might not be worth the effort.) * * Compilation: * cc -O funiq.c -o funiq * or cc -O -DSTATS funiq.c -o funiq (produces statistics) */ #include /* The following defines the binary tree node structure */ struct btreenode { char *str; struct btreenode *left, *right; }; struct btreenode *head; /* binary tree head*/ struct btreenode *alloc, *hend; /* allocator pointers */ char *cstore, *cend; /* character array begining and end */ char *nextstr(), *allocstr(); struct btreenode *nextnode(), *allocbtree(); char *calloc(), *malloc(), *gets(); /* Feel free to change the following allocation parameters as desired/needed. * One can spend as much or as little time in 'malloc/calloc' as one wants. */ #define NODES 4096 /* Initial allocation for btree nodes */ #define STR 8192 /* Initial allocation for strings */ #define ADDNOD 1024 /* Additional allocation for btree nodes */ #define ADDSTR 2048 /* Additional allocation for strings */ #ifdef STATS int charsused = 0; char statfile[] = "/usr/dict/bstats"; #endif main(argc,argv) char *argv[]; { register int n; register char *q; register struct btreenode *r; #ifdef STATS register ncmps = 0; register maxdepth = 0; register nodesused = 0; register depth; register total = 0; FILE *f; #endif if (argc > 1) { if (freopen(argv[1],"r",stdin) == NULL) { fprintf(stderr,"funiq: %s not accessible\n",argv[1]); exit(1); } } /* Initialize the binary tree with first string of the input. * Being a little more selective here might help * balance the tree, but who knows? Random input is * supposed to produce a reasonably balanced tree. */ q = cstore = allocstr(STR); /* allocate string space */ cend = cstore + STR; if (gets(q)==NULL) /* get first line */ exit(0); alloc = head = allocbtree(NODES);/* allocate tree nodes */ hend = head + NODES; /* we're depending upon 'calloc' */ alloc->str = q; /* to set all the pointers to NULL */ puts(q); /* output the sucker */ #ifdef STATS total++; nodesused++; #endif q = nextstr(q); /* get new string pointer. */ /* Traverse tree with each input line, inserting and printing * it if it's NOT found and discarding it if it is. */ while(gets(q)!=NULL) { r = head; #ifdef STATS total++; depth=0; while ((ncmps++ , (n = strcmp(q,r->str))) != 0) { #else while ((n = strcmp(q,r->str)) != 0) { #endif if (n < 0) { if (r->left != NULL) { r = r->left; #ifdef STATS depth++; #endif continue; /* traversing left */ } else { /* Creating a left subtree */ r->left = nextnode(); #ifdef STATS nodesused++; #endif puts( (r->left)->str = q ); q = nextstr(q); break; /* get next line */ } } else { if (r->right != NULL) { r = r->right; #ifdef STATS depth++; #endif continue; /* traversing right */ } else { /* Creating a right subtree */ r->right = nextnode(); #ifdef STATS nodesused++; #endif puts( (r->right)->str = q ); q = nextstr(q); break; /* get next line */ } } } #ifdef STATS if (++depth > maxdepth) maxdepth = depth; } f = fopen(statfile,"a"); fprintf(f,"%d\t%d\t%d\t%d\t%d\n", charsused,nodesused,total,ncmps,maxdepth); fclose(f); #else } #endif exit(0); /* That was fairly simple wasn't it? */ } /* * Returns pointer to an array of 'n' btree nodes. */ struct btreenode *allocbtree(n) { register struct btreenode *x; x = (struct btreenode *) calloc((unsigned)n,(unsigned)sizeof(struct btreenode)); if (x == NULL) { fprintf(stderr,"funiq: not enough memory for tree nodes\n"); exit(1); } return(x); } /* * Returns a pointer to the next available btree node. * Gets more space if necessary. */ struct btreenode *nextnode() { if (++alloc >= hend ) { alloc = allocbtree(ADDNOD); hend = alloc + ADDNOD; } return(alloc); } /* * Returns pointer to 'n' bytes for character string storage. */ char *allocstr(n) { register char *x; x = malloc((unsigned)n); if (x == NULL) { fprintf(stderr,"funiq: not enough memory for strings\n"); exit(1); } return(x); } /* * Returns a pointer to the next available string space. * Gets more space if necessary. */ char *nextstr(s) register char *s; { register char *x; register int i; #ifdef STATS charsused += (i = strlen(s) + 1); #else i = strlen(s) + 1; #endif if ((x = s + i) >= cend) { x = allocstr(ADDSTR); cend = x + ADDSTR; } return(x); }