#include #include #include #include #include /* Take stream of words, one per line, interspersed with !newdocum. (The flag !newdocum marks the start of a new document.) Outputs each distinct word in the input, with a count of documents and a count of occurrences. Output is in res.out (or whatever). */ #define WORDLEN 8192 #define true 1 #define false 0 #define OUTFILE "res.out" #define TSIZE 65536 typedef struct hashrec { char *word; unsigned long numocc, curdoc, numdoc; struct hashrec *next; } HASHREC; HASHREC **ht; unsigned int shfthash(char *); int cmpit(); void setxorseed(int); void sorthash(); HASHREC *makerec(); void initrecarray(); unsigned long docno; void insert(char *); void printstats(FILE *); char *makechar(int); void initchararray(); /* Read words until end-of-file, inserting them into the tree, then display the results. */ main(int argc, char *argv[]) { char *gets(); char *outfile = OUTFILE; char *w, word[WORDLEN]; int i; hrtime_t start, end, time; /* nanoseconds */ hrtime_t vstart, vend, vtime; /* nanoseconds */ docno = 1; if( argc == 2 ) outfile = argv[1]; else if( argc > 2 ) { fprintf(stderr, "Usage: getstat [ outfile ]\n"); exit(1); } w = gets(word); ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE ); for( i=0 ; i> 2) ); } return((unsigned int)((h&0x7fffffff) % TSIZE)); } void insert(char *w) { HASHREC *htmp, *hprv; unsigned int hval = shfthash(w); /* memcmp is better than scmp if most comparisons are over the full string */ for( hprv = NULL, htmp=ht[hval] /*; htmp != NULL && memcmp(htmp->word, w, 8192) != 0*/ ; htmp != NULL && scmp(htmp->word, w) != 0 ; hprv = htmp, htmp = htmp->next ) { ; } if( htmp==NULL ) { /* htmp = (HASHREC *) malloc( sizeof(HASHREC) ); */ /* htmp->word = (char *) malloc( strlen(w) + 1 ); */ htmp = makerec(); htmp->word = makechar( strlen(w) + 1 ); strcpy(htmp->word, w); htmp->curdoc = docno; htmp->numocc = htmp->numdoc = 1; htmp->next = NULL; if( hprv==NULL ) ht[hval] = htmp; else hprv->next = htmp; } else { htmp->numocc++; if( docno > htmp->curdoc ) { htmp->curdoc = docno; htmp->numdoc++; } if( hprv!=NULL ) /* move to front on subsequent access */ { hprv->next = htmp->next; htmp->next = ht[hval]; ht[hval] = htmp; } } } #define NUMREC 10000 HASHREC *recarray; int recnum=NUMREC; void initrecarray() { recarray = (HASHREC *) malloc( sizeof(HASHREC) * NUMREC ); recnum = 0; } HASHREC * makerec() { HASHREC *nextrec; if( recnum==NUMREC ) initrecarray(); nextrec = &recarray[recnum]; recnum++; return(nextrec); } HASHREC **hh; int num; /* split printstats functionality to simplify timing */ void sorthash() { int i, j; HASHREC *htmp; for( i=0, num=0 ; inext ) num++; hh = (HASHREC **) malloc( sizeof(HASHREC *) * num ); for( i=0, j=0 ; inext, j++ ) hh[j] = htmp; qsort(hh, num, sizeof(HASHREC *), cmpit); } /* Print contents of hash table */ void printstats(FILE *fp) { int j; for( j=0 ; jword, hh[j]->numocc, hh[j]->numdoc); free(hh); } int cmpit(HASHREC **h1, HASHREC **h2) { return( scmp((*h1)->word, (*h2)->word) ); } int scmp( char *s1, char *s2 ) { while( *s1 != '\0' && *s1 == *s2 ) { s1++; s2++; } return( *s1-*s2 ); } /* Module for space management of strings */ /* Unused space at the end of each buffer is lost - never mind */ #define NUMCHAR 100000 char *chararray; int charnum=NUMCHAR; void initchararray() { chararray = (char *) malloc( sizeof(char) * NUMCHAR ); charnum = 0; } char * makechar(int len) { char *nextchar; if( charnum+len>=NUMCHAR ) initchararray(); nextchar = &chararray[charnum]; charnum += len; return(nextchar); }