From: CSBVAX::MRGATE!schmidt@bonnie.ics.uci.edu@SMTP 11-AUG-1988 19:46 To: ARISIA::EVERHART Subj: Minimum perfect hashing for gcc reserved words, pt. 2 Received: from rome.ics.uci.edu by prep.ai.mit.edu; Thu, 11 Aug 88 00:09:50 EST Received: from ics.uci.edu by ROME.ICS.UCI.EDU id aa18955; 10 Aug 88 22:32 PDT Received: from bonnie.ics.uci.edu by ICS.UCI.EDU id aa01365; 10 Aug 88 22:31 PDT To: info-gcc <@beanie.ICS.UCI.EDU:info-gcc@prep.ai.mit.edu> Cc: schmidt@bonnie.ics.uci.edu Subject: Minimum perfect hashing for gcc reserved words, pt. 2 Date: Wed, 10 Aug 88 22:31:36 -0700 From: "Douglas C. Schmidt" Message-Id: <8808102231.aa01365@ICS.UCI.EDU> Hi, As promised, the following is a "fast, smaller, and more cryptic" version of the minimum perfect hash function diffs for gcc reserved words. This version is slightly faster since it uses no function calls to strcmp, and slightly smaller, since it packs all the reserved words into a single character array (those of you who have received my minimum perfect hash function generator program will recognize the technique). The routines are also slightly more convoluted, since I've simply hard-coded in these funny "magic-numbers," used as offsets into the character array (#defines would help, I suppose). At any rate, here it is. Enjoy it! By the way, I've receive a large number of queries for the minimum perfect hash function generator program itself. Therefore, the program is now available via anonymous FTP from University of California, Irvine. The relevant addresses are: 192.5.19.1 or ics.uci.edu Use the "anonymous" login id, followed by the password "guest." The files have been "tarred" and compressed; you will find them in: pub/perfect-tar.1.0.Z Be sure to read the README file first! If anyone is having trouble getting through, please let me know and I'll send them to you individually. As you will soon find out, there is still some work to be done on the program. However, I decided to release it now so that others would have a chance to see how it works. I'll be adding more later on this summer, once classes are over. Thanks for your interest, I hope it's useful! Doug Schmidt ------------------------------( cut here )------------------------------ *** parse.y.old Fri Aug 5 23:26:21 1988 --- parse.y Wed Aug 10 22:19:59 1988 *************** *** 1219,1281 **** static char *token_buffer; /* Pointer to token buffer. Actual allocated length is maxtoken + 2. */ ! #define MAXRESERVED 9 ! /* frw[I] is index in `reswords' of the first word whose length is I; ! frw[I+1] is one plus the index of the last word whose length is I. ! The length of frw must be MAXRESERVED + 2 so there is an element ! at MAXRESERVED+1 for the case I == MAXRESERVED. */ ! static char frw[MAXRESERVED+2] = ! { 0, 0, 0, 2, 5, 13, 19, 29, 31, 35, 36 }; ! /* Table of reserved words. */ ! struct resword { char *name; short token; enum rid rid;}; ! #define NORID RID_UNUSED ! static struct resword reswords[] ! = {{"if", IF, NORID}, ! {"do", DO, NORID}, ! {"int", TYPESPEC, RID_INT}, ! {"for", FOR, NORID}, ! {"asm", ASM, NORID}, ! {"case", CASE, NORID}, ! {"char", TYPESPEC, RID_CHAR}, ! {"auto", SCSPEC, RID_AUTO}, ! {"goto", GOTO, NORID}, ! {"else", ELSE, NORID}, ! {"long", TYPESPEC, RID_LONG}, ! {"void", TYPESPEC, RID_VOID}, ! {"enum", ENUM, NORID}, ! {"float", TYPESPEC, RID_FLOAT}, ! {"short", TYPESPEC, RID_SHORT}, ! {"union", UNION, NORID}, ! {"break", BREAK, NORID}, ! {"while", WHILE, NORID}, ! {"const", TYPE_QUAL, RID_CONST}, ! {"double", TYPESPEC, RID_DOUBLE}, ! {"static", SCSPEC, RID_STATIC}, ! {"extern", SCSPEC, RID_EXTERN}, ! {"struct", STRUCT, NORID}, ! {"return", RETURN, NORID}, ! {"sizeof", SIZEOF, NORID}, ! {"typeof", TYPEOF, NORID}, ! {"switch", SWITCH, NORID}, ! {"signed", TYPESPEC, RID_SIGNED}, ! {"inline", SCSPEC, RID_INLINE}, ! {"typedef", SCSPEC, RID_TYPEDEF}, ! {"default", DEFAULT, NORID}, ! {"unsigned", TYPESPEC, RID_UNSIGNED}, ! {"continue", CONTINUE, NORID}, ! {"register", SCSPEC, RID_REGISTER}, ! {"volatile", TYPE_QUAL, RID_VOLATILE}, ! {"__alignof", ALIGNOF, NORID}}; /* The elements of `ridpointers' are identifier nodes ! for the reserved type names and storage classes. ! It is indexed by a RID_... value. */ tree ridpointers[(int) RID_MAX]; --- 1219,1387 ---- static char *token_buffer; /* Pointer to token buffer. Actual allocated length is maxtoken + 2. */ ! #define MIN_WORD_SIZE 2 /* minimum size for C reserved words */ ! #define MAX_WORD_SIZE 9 /* maximum size for C reserved words */ ! #define MIN_KEY_SIZE 4 /* range of the hash keys for the */ ! #define MAX_KEY_SIZE 39 /* minimum perfect hash generator */ ! typedef unsigned char TABLE_TYPE; /* should be a type that can hold values ! with range 0..190. A short might ! be more portable.... */ ! struct resword {TABLE_TYPE offset; short token; enum rid rid;}; ! #define NORID RID_UNUSED ! /* All the funny numbers in the offset field are simply indices into the ! word_table. Each one points the beginning of the reserved word ! associated with the "token" and "rid" field. It might be a nice ! idea to #define the numbers and make everything more readable.... */ ! static struct resword reswords[] = ! { ! {0, 0, NORID}, ! {0, 0, NORID}, /* these locations are not used. */ ! {0, 0, NORID}, /* they simplify the hash function. */ ! {0, 0, NORID}, ! {0, ELSE, NORID}, ! {4, ENUM, NORID}, ! {8, WHILE, NORID}, ! {13, SCSPEC, RID_EXTERN}, ! {19, TYPESPEC, RID_DOUBLE}, ! {25, DEFAULT, NORID}, ! {32, DO, NORID}, ! {34, GOTO, NORID}, ! {38, TYPESPEC, RID_SHORT}, ! {43, STRUCT, NORID}, ! {49, RETURN, NORID}, ! {55, TYPESPEC, RID_SIGNED}, ! {61, TYPESPEC, RID_FLOAT}, ! {66, TYPEOF, NORID}, ! {72, SCSPEC, RID_TYPEDEF}, ! {79, SWITCH, NORID}, ! {85, TYPESPEC, RID_INT}, ! {88, FOR, NORID}, ! {91, SCSPEC, RID_REGISTER}, ! {99, SCSPEC, RID_INLINE}, ! {105, SIZEOF, NORID}, ! {111, TYPESPEC, RID_VOID}, ! {115, ALIGNOF, NORID}, ! {124, TYPE_QUAL, RID_VOLATILE}, ! {132, CASE, NORID}, ! {136, TYPE_QUAL, RID_CONST}, ! {141, IF, NORID}, ! {143, TYPESPEC, RID_LONG}, ! {147, CONTINUE, NORID}, ! {155, ASM, NORID}, ! {158, UNION, NORID}, ! {163, TYPESPEC, RID_CHAR}, ! {167, BREAK, NORID}, ! {172, SCSPEC, RID_STATIC}, ! {178, TYPESPEC, RID_UNSIGNED}, ! {186, SCSPEC, RID_AUTO}, ! {190, 0, NORID}, ! }; ! /* The word table is simply a string of the reserved words packed end ! to end. The offset field in struct resword contains the index of ! the beginning of the appropriate keyword. The length of a ! particular keyword K is determined by subtracting its offset value ! from the offset value of K + 1. Thus, there is no need to store ! the length field explicitly, this is implicit in the resword struct. */ + static char *word_table = + "elseenumwhileexterndoubledefaultdogotoshortstructreturn\ + signedfloattypeoftypedefswitchintforregisterinlinesizeofvoid\ + __alignofvolatilecaseconstiflongcontinueasmunioncharbreakstatic\ + unsignedauto"; + + /* This function performs the minimum-perfect hash mapping from input + string to reswords table index. It only looks at the first and + last characters in the string, thus assuring the O(1) lookup time + (this keeps our constant down to an insignificant amount!). Compiling + the following 2 functions as inline removes all overhead of the + function calls. */ + + #ifdef __GNUC__ + inline static int hash(char *str,int len) + #else + static int hash(str,len) + register char *str; + register int len; + #endif + { + /* This table is used to build the hash table index that recognizes + reserved words in 0(1) steps. It is larger than strictly necessary, + but I'm trading off the space for the time-saving luxury of avoiding + subtraction of an offset. */ + + static char hash_table[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 0, 29, 31, 24, + 2, 0, 11, 1, 6, 17, 0, 0, 26, 1, + 1, 6, 0, 0, 7, 7, 0, 28, 19, 1, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + /* The hash function couldn't be simpler: add the length of the string + to the hash_table value of its first and last character. */ + + return len + hash_table[ (int) str[0]] + hash_table[ (int) str[len - 1]]; + } + + /* This routine attempts to match the string found in the word_table with + the one from the input stream. If all the relevant details match an + actual inline comparison is performed. The inline comparison is preferred + to strncmp to avoid the overhead of a function call. However, this is + simple to modify. In practice it seems to make no real difference in + program speed. */ + + #ifdef __GNUC__ + inline static struct resword *is_reserved_word(char *str,int len) + #else + static struct resword *is_reserved_word(str,len) + register char *str; + register int len; + #endif + { + /* First test to see whether the length is in the reserved word range. */ + + if (len <= MAX_WORD_SIZE && len >= MIN_WORD_SIZE) + { register int key = hash(str,len); + + /* Then make sure the hash value is within the table range. */ + + if (key >= MIN_KEY_SIZE && key <= MAX_KEY_SIZE) + { register int str_start = reswords[key].offset; + register int str_end = reswords[key+1].offset; + + while (str_start < str_end) + { + if (*str++ != word_table[str_start++]) + { + return(NULL); /* bail if no match */ + } + } + + if (*str == '\0') + { + return(&reswords[key]); /* got it! a reserved word */ + } + } + } + return(NULL); /* no luck */ + } + /* The elements of `ridpointers' are identifier nodes ! for the reserved type names and storage classes. ! It is indexed by a RID_... value. */ tree ridpointers[(int) RID_MAX]; *************** *** 1810,1844 **** value = IDENTIFIER; yylval.itype = 0; ! /* Try to recognize a keyword. */ ! if (p - token_buffer <= MAXRESERVED) ! { ! register int lim = frw [p - token_buffer + 1]; ! register int i = frw[p - token_buffer]; ! register struct resword *p = &reswords[i]; ! ! for (; i < lim; i++, p++) ! if (p->name[0] == token_buffer[0] ! && !strcmp (p->name, token_buffer)) ! { ! if (p->rid) ! yylval.ttype = ridpointers[(int) p->rid]; ! if ((! flag_no_asm ! || ((int) p->token != ASM ! && (int) p->token != TYPEOF ! && strcmp (p->name, "inline"))) ! /* -ftraditional means don't recognize ! typeof, const, volatile, signed or inline. */ ! && (! flag_traditional ! || ((int) p->token != TYPE_QUAL ! && (int) p->token != TYPEOF ! && strcmp (p->name, "signed") ! && strcmp (p->name, "inline")))) ! value = (int) p->token; ! break; ! } ! } /* If we did not find a keyword, look for an identifier (or a typename). */ --- 1916,1944 ---- value = IDENTIFIER; yylval.itype = 0; ! /* Try to recognize a keyword. Uses minimum-perfect hash function */ ! ! { ! register struct resword *ptr; ! if (ptr = is_reserved_word (token_buffer,p - token_buffer)) ! { ! if (ptr->rid) ! yylval.ttype = ridpointers[(int) ptr->rid]; ! if ((! flag_no_asm ! || ((int) ptr->token != ASM ! && (int) ptr->token != TYPEOF ! && ptr->rid != RID_INLINE)) /* why waste a strcmp? */ ! /* -ftraditional means don't recognize ! typeof, const, volatile, signed or inline. */ ! && (! flag_traditional ! || ((int) ptr->token != TYPE_QUAL ! && (int) ptr->token != TYPEOF ! && ptr->rid != RID_SIGNED /* again, no need for */ ! && ptr->rid != RID_INLINE))) /* a function call. */ ! value = (int) ptr->token; ! } ! } /* If we did not find a keyword, look for an identifier (or a typename). */