C++/hash tables, fast hash lookup
Expert: vijayan - 6/15/2009
QuestionQUESTION: In follow up to the question I asked you earlier ("fast vector lookup"):
Yes, I was able to get both of your sample programs to successfully compile, although I had to do some tweaking to do so. I made the VC++ project in the directory you told me to, but VC++ still couldn't find the files. So I moved all the dependencies and their dependencies into the project's source directory and changed #include </google/dense_hash_set> to #include "dense_hash_set", and so on. So both md5_lookup.cc and string_lookup.cc compile, and I have been able to benchmark their performance and graph it.
now I just need to learn how to insert elements into the hash table (I'm assuming binary is the best format? as done in my pseudo code?) and return a bool TRUE or some type of confirmation when a hash table lookup finds a specified word in the hash table.
Thank you.
ANSWER: First, use a structure to hold your 16 byte (128 bit) digests.
enum { NBYTES = 16, NWORDS = NBYTES / sizeof(uint32_t) };
// put the 128 bit hash value in this structure
struct sixteen_byte_hash
{
char bytes[NBYTES] ;
};
this means that one of your functions to get a 16 byte digest would be something like:
size_t getHash( char szHash[], sixteen_byte_hash& hash )
{
hash.bytes[0] = hash.bytes[NBYTES-1] = 0xff ;
unsigned int number;
size_t nHashLen;
if(((nHashLen = strlen(szHash)) % 2) != 0)
return(0);
for(size_t i = 0;i < nHashLen;i++) {
if(isxdigit(szHash[i]) == 0)
return(0);
}
nHashLen /= 2;
for(size_t i = 0;i < nHashLen;i++) {
sscanf(&szHash[i*2],"%2x",&number);
hash.bytes[i] = (unsigned char)number;
}
return(nHashLen);
}
The rest of it is straightforward:
#include <google/dense_hash_set>
#include <stdint.h> // posix
// for VC++ 2008 Express 32 bit
// replace #include <stdint.h> withe the next two lines
// typedef unsigned char uint8_t ;
// typedef unsigned int uint32_t ;
inline uint32_t rotate( uint32_t x, uint32_t k )
{
return ( x << k ) | ( x >> ( 32U - k ) ) ;
}
inline void prelim_mix( uint32_t& a, uint32_t& b, uint32_t& c )
{
a -= c ; a ^= rotate( c, 4U ) ; c += b ;
b -= a ; b ^= rotate( a, 6U ) ; a += c ;
c -= b ; c ^= rotate( b, 8U ) ; b += a ;
a -= c ; a ^= rotate( c, 16U ) ; c += b ;
b -= a ; b ^= rotate( a, 19U ) ; a += c ;
c -= b ; c ^= rotate( b, 4U ) ; b += a ;
}
inline void final_mix( uint32_t& a, uint32_t& b, uint32_t& c )
{
c ^= b ; c -= rotate( b, 14U ) ;
a ^= c ; a -= rotate( c, 11U ) ;
b ^= a ; b -= rotate( a, 25U ) ;
c ^= b ; c -= rotate( b, 16U ) ;
a ^= c ; a -= rotate( c, 4U ) ;
b ^= a ; b -= rotate( a, 14U ) ;
c ^= b ; c -= rotate( b, 24U ) ;
}
enum { NBYTES = 16, NWORDS = NBYTES / sizeof(uint32_t) };
// put the 128 bit hash value in this structure
struct sixteen_byte_hash
{
char bytes[NBYTES] ;
};
// hash a 128 bit digest to generate a 32 bit hash
struct hash_sixteen_byte_hash
{
uint32_t operator() ( const sixteen_byte_hash& value ) const ;
};
// hash a 128 bit digest to generate a 32 bit hash
uint32_t hash_sixteen_byte_hash::operator() ( const sixteen_byte_hash& value ) const
{
static uint32_t seed = 0U ;
const uint32_t* words = reinterpret_cast< const uint32_t* > ( value.bytes ) ;
uint32_t a,b,c ;
a = b = c = 0xdeadbeef + ( NWORDS << 2U ) + seed ;
a += words[0] ;
b += words[1] ;
c += words[2] ;
prelim_mix( a, b, c) ;
a += words[3] ;
final_mix( a, b, c ) ;
return seed = c ;
}
// compare two 128 bit hash values to see if they are equal
struct equal_sixteen_byte_hash
{
inline bool operator() ( const sixteen_byte_hash& first,
const sixteen_byte_hash& second ) const
{ return std::memcmp( first.bytes, second.bytes, NBYTES ) == 0 ; }
};
typedef google::dense_hash_set< sixteen_byte_hash, hash_sixteen_byte_hash,
equal_sixteen_byte_hash > hash_set ;
inline void insert_into_hash_set( hash_set& hs, const sixteen_byte_hash& value )
{ hs.insert( value ) ; }
inline bool look_up_in_hash_set( const hash_set& hs, const sixteen_byte_hash& value )
{ return hs.find( value ) != hs.end() ; }
int main()
{
// initialize the hash_set with these two lines
hash_set hs ;
hs.set_empty_key( sixteen_byte_hash() ) ; // set empty key to all nulls
// rest of code for main
}
---------- FOLLOW-UP ----------
QUESTION: I'm still a little lost, sorry. I tried to implement your functions in code below, but I'm sure it's not correct. What I am attempting to do is read hashes from a dump .txt file into the hash table. I used the string 'hashstring' to demonstrate such a hash string read from the file. I use your modified getHash to convert that string to binary and store in 'bufferone'. bufferone was originally of type 'unsigned char bufferone[16]', but you said something about putting the hash in the sixteen_byte_hash structure. Once the binary of the hash is stored in bufferone, I store bufferone in the hash table using the insert_into_hash_set function. Then I look it up using the bool look_up_in_hash_set function. Here is the code:
#include "dense_hash_set"
typedef unsigned char uint8_t ;
typedef unsigned int uint32_t ;
inline uint32_t rotate( uint32_t x, uint32_t k )
{
return ( x << k ) | ( x >> ( 32U - k ) ) ;
}
inline void prelim_mix( uint32_t& a, uint32_t& b, uint32_t& c )
{
a -= c ; a ^= rotate( c, 4U ) ; c += b ;
b -= a ; b ^= rotate( a, 6U ) ; a += c ;
c -= b ; c ^= rotate( b, 8U ) ; b += a ;
a -= c ; a ^= rotate( c, 16U ) ; c += b ;
b -= a ; b ^= rotate( a, 19U ) ; a += c ;
c -= b ; c ^= rotate( b, 4U ) ; b += a ;
}
inline void final_mix( uint32_t& a, uint32_t& b, uint32_t& c )
{
c ^= b ; c -= rotate( b, 14U ) ;
a ^= c ; a -= rotate( c, 11U ) ;
b ^= a ; b -= rotate( a, 25U ) ;
c ^= b ; c -= rotate( b, 16U ) ;
a ^= c ; a -= rotate( c, 4U ) ;
b ^= a ; b -= rotate( a, 14U ) ;
c ^= b ; c -= rotate( b, 24U ) ;
}
enum { NBYTES = 16, NWORDS = NBYTES / sizeof(uint32_t) };
// put the 128 bit hash value in this structure
struct sixteen_byte_hash
{
char bytes[NBYTES] ;
};
// hash a 128 bit digest to generate a 32 bit hash
struct hash_sixteen_byte_hash
{
uint32_t operator() ( const sixteen_byte_hash& value ) const ;
};
// hash a 128 bit digest to generate a 32 bit hash
uint32_t hash_sixteen_byte_hash::operator() ( const sixteen_byte_hash& value ) const
{
static uint32_t seed = 0U ;
const uint32_t* words = reinterpret_cast< const uint32_t* > ( value.bytes ) ;
uint32_t a,b,c ;
a = b = c = 0xdeadbeef + ( NWORDS << 2U ) + seed ;
a += words[0] ;
b += words[1] ;
c += words[2] ;
prelim_mix( a, b, c) ;
a += words[3] ;
final_mix( a, b, c ) ;
return seed = c ;
}
// compare two 128 bit hash values to see if they are equal
struct equal_sixteen_byte_hash
{
inline bool operator() ( const sixteen_byte_hash& first,
const sixteen_byte_hash& second ) const
{ return std::memcmp( first.bytes, second.bytes, NBYTES ) == 0 ; }
};
typedef google::dense_hash_set< sixteen_byte_hash, hash_sixteen_byte_hash,
equal_sixteen_byte_hash > hash_set ;
inline void insert_into_hash_set( hash_set& hs, const sixteen_byte_hash& value )
{ hs.insert( value ) ; }
inline bool look_up_in_hash_set( const hash_set& hs, const sixteen_byte_hash& value )
{ return hs.find( value ) != hs.end() ; }
size_t getHash( char szHash[], sixteen_byte_hash& hash )
{
hash.bytes[0] = hash.bytes[NBYTES-1] = 0xff ;
unsigned int number;
size_t nHashLen;
if(((nHashLen = strlen(szHash)) % 2) != 0)
return(0);
for(size_t i = 0;i < nHashLen;i++) {
if(isxdigit(szHash[i]) == 0)
return(0);
}
nHashLen /= 2;
for(size_t i = 0;i < nHashLen;i++) {
sscanf(&szHash[i*2],"%2x",&number);
hash.bytes[i] = (unsigned char)number;
}
return(nHashLen);
}
int main()
{
// initialize the hash_set with these two lines
hash_set hs ;
hs.set_empty_key( sixteen_byte_hash() ) ; // set empty key to all nulls
sixteen_byte_hash bufferone;//buffer to store binary of hashstring in?
char hashstring[]="CDD1C328071B79D9CEEEC33F663EC965";//md4 hash string read //from 'hashses.txt' dump file
getHash(hashstring,bufferone);//convert string 'hashstring' to binary stored in 'bufferone'
insert_into_hash_set(hs,bufferone);//insert bufferone in the hash_table
//this is the format of hash that my program produces (stored in unsigned char[16]):
uint8_t md4_hash[16]={ 0xCD, 0xD1, 0xC3, 0x28, 0x07, 0x1B, 0x79, 0xD9, 0xCE, 0xEE, 0xC3, 0x3F, 0x66, 0x3E, 0xC9, 0x65 };
//I want to check hash_table for hash 'md4_hash'
if(look_up_in_hash_set(hs,md4_hash))
{
std::cout << "found hash in hash_table!\n";
}
// rest of code for main
}
What am I doing wrong? I'm still very new to programming on this level. Your help is greatly appreciated. Thank you.
AnswerYou need to use the struct sixteen_byte_hash for *both* insert and lookup. To make your task easier, I have added a few constructors for the struct.
There was a typo in the earlier code; in the function
hash_sixteen_byte_hash::operator().
The return should be just return c ; (instead of return seed = c ;)
The full code which should compile now:
#include <google/dense_hash_set>
typedef unsigned char uint8_t ;
typedef unsigned int uint32_t ;
inline uint32_t rotate( uint32_t x, uint32_t k )
{
return ( x << k ) | ( x >> ( 32U - k ) ) ;
}
inline void prelim_mix( uint32_t& a, uint32_t& b, uint32_t& c )
{
a -= c ; a ^= rotate( c, 4U ) ; c += b ;
b -= a ; b ^= rotate( a, 6U ) ; a += c ;
c -= b ; c ^= rotate( b, 8U ) ; b += a ;
a -= c ; a ^= rotate( c, 16U ) ; c += b ;
b -= a ; b ^= rotate( a, 19U ) ; a += c ;
c -= b ; c ^= rotate( b, 4U ) ; b += a ;
}
inline void final_mix( uint32_t& a, uint32_t& b, uint32_t& c )
{
c ^= b ; c -= rotate( b, 14U ) ;
a ^= c ; a -= rotate( c, 11U ) ;
b ^= a ; b -= rotate( a, 25U ) ;
c ^= b ; c -= rotate( b, 16U ) ;
a ^= c ; a -= rotate( c, 4U ) ;
b ^= a ; b -= rotate( a, 14U ) ;
c ^= b ; c -= rotate( b, 24U ) ;
}
enum { NBYTES = 16, NWORDS = NBYTES / sizeof(uint32_t) };
// put the 128 bit hash value in this structure
struct sixteen_byte_hash
{
sixteen_byte_hash()
{ std::memset( bytes, 0, NBYTES ) ; }
sixteen_byte_hash( const uint8_t* array )
{ std::memcpy( bytes, array, NBYTES ) ; }
sixteen_byte_hash( const char* array )
{ std::memcpy( bytes, array, NBYTES ) ; }
char bytes[NBYTES] ;
};
// hash a 128 bit digest to generate a 32 bit hash
struct hash_sixteen_byte_hash
{
uint32_t operator() ( const sixteen_byte_hash& value ) const ;
};
// hash a 128 bit digest to generate a 32 bit hash
uint32_t hash_sixteen_byte_hash::operator() ( const sixteen_byte_hash& value ) const
{
static uint32_t seed = 0U ;
const uint32_t* words = reinterpret_cast< const uint32_t* > ( value.bytes ) ;
uint32_t a,b,c ;
a = b = c = 0xdeadbeef + ( NWORDS << 2U ) + seed ;
a += words[0] ;
b += words[1] ;
c += words[2] ;
prelim_mix( a, b, c) ;
a += words[3] ;
final_mix( a, b, c ) ;
return c ;
}
// compare two 128 bit hash values to see if they are equal
struct equal_sixteen_byte_hash
{
inline bool operator() ( const sixteen_byte_hash& first,
const sixteen_byte_hash& second ) const
{ return std::memcmp( first.bytes, second.bytes, NBYTES ) == 0 ; }
};
typedef google::dense_hash_set< sixteen_byte_hash, hash_sixteen_byte_hash,
equal_sixteen_byte_hash > hash_set ;
inline void insert_into_hash_set( hash_set& hs, const sixteen_byte_hash& value )
{ hs.insert( value ) ; }
inline bool look_up_in_hash_set( const hash_set& hs, const sixteen_byte_hash& value )
{ return hs.find( value ) != hs.end() ; }
size_t getHash( char szHash[], sixteen_byte_hash& hash )
{
hash.bytes[0] = hash.bytes[NBYTES-1] = 0xff ;
unsigned int number;
size_t nHashLen;
if(((nHashLen = strlen(szHash)) % 2) != 0)
return(0);
for(size_t i = 0;i < nHashLen;i++) {
if(isxdigit(szHash[i]) == 0)
return(0);
}
nHashLen /= 2;
for(size_t i = 0;i < nHashLen;i++) {
sscanf(&szHash[i*2],"%2x",&number);
hash.bytes[i] = (unsigned char)number;
}
return(nHashLen);
}
int main()
{
// initialize the hash_set with these two lines
hash_set hs ;
hs.set_empty_key( sixteen_byte_hash() ) ; // set empty key to all nulls
sixteen_byte_hash bufferone;//buffer to store binary of hashstring in?
char hashstring[]="CDD1C328071B79D9CEEEC33F663EC965";//md4 hash string read //from 'hashses.txt' dump file
getHash(hashstring,bufferone);//convert string 'hashstring' to binary stored in 'bufferone'
//std::memcpy(
insert_into_hash_set(hs,bufferone);//insert bufferone in the hash_table
//this is the format of hash that my program produces (stored in unsigned char[16]):
uint8_t md4_hash[16]={ 0xCD, 0xD1, 0xC3, 0x28, 0x07, 0x1B, 0x79, 0xD9, 0xCE, 0xEE, 0xC3, 0x3F, 0x66, 0x3E, 0xC9, 0x65 };
//I want to check hash_table for hash 'md4_hash'
if(look_up_in_hash_set(hs,md4_hash))
{
std::cout << "found hash in hash_table!\n";
}
// rest of code for main
}