Thursday, August 25, 2011

Code Snippet: Murmur hash inverse / pre-image

Today's caring by sharing. I needed this non-trivial code snippet today and couldn't find it anywhere on the internet, so here it is for future reference. It computes the inverse / pre-image of a murmur hash. I. e., given a 32 bit murmur hash value, it computes a 32 bit value that when hashed produces that hash value:

/// Inverts a (h ^= h >> s) operation with 8 <= s <= 16
unsigned int invert_shift_xor(unsigned int hs, unsigned int s)
{
	XENSURE(s >= 8 && s <= 16);
	unsigned hs0 = hs >> 24;
	unsigned hs1 = (hs >> 16) & 0xff;
	unsigned hs2 = (hs >> 8) & 0xff;
	unsigned hs3 = hs & 0xff;

	unsigned h0 = hs0;
	unsigned h1 = hs1 ^ (h0 >> (s-8));
	unsigned h2 = (hs2 ^ (h0 << (16-s)) ^ (h1 >> (s-8))) & 0xff;
	unsigned h3 = (hs3 ^ (h1 << (16-s)) ^ (h2 >> (s-8))) & 0xff;
	return (h0<<24) + (h1<<16) + (h2<<8) + h3;
}

unsigned int murmur_hash_inverse(unsigned int h, unsigned int seed)
{
	const unsigned int m = 0x5bd1e995;
	const unsigned int minv = 0xe59b19bd;	// Multiplicative inverse of m under % 2^32
	const int r = 24;

	h = invert_shift_xor(h,15);
	h *= minv;
	h = invert_shift_xor(h,13);

	unsigned int hforward = seed ^ 4;
	hforward *= m;
	unsigned int k = hforward ^ h;
	k *= minv;
	k ^= k >> r;
	k *= minv;

	#ifdef PLATFORM_BIG_ENDIAN
		char *data = (char *)&k;
		k = (data[0]) + (data[1] << 8) + (data[2] << 16) + (data[3] << 24);
	#endif

	return k;
}
And for reference, here is the full code, with both the regular murmur hash and the inverses for 32- and 64-bit hashes:
unsigned int murmur_hash ( const void * key, int len, unsigned int seed )
{
	// 'm' and 'r' are mixing constants generated offline.
	// They're not really 'magic', they just happen to work well.

	const unsigned int m = 0x5bd1e995;
	const int r = 24;

	// Initialize the hash to a 'random' value

	unsigned int h = seed ^ len;

	// Mix 4 bytes at a time into the hash

	const unsigned char * data = (const unsigned char *)key;

	while(len >= 4)
	{
		#ifdef PLATFORM_BIG_ENDIAN
			unsigned int k = (data[0]) + (data[1] << 8) + (data[2] << 16) + (data[3] << 24);
		#else
			unsigned int k = *(unsigned int *)data;
		#endif

		k *= m;
		k ^= k >> r;
		k *= m;

		h *= m;
		h ^= k;

		data += 4;
		len -= 4;
	}

	// Handle the last few bytes of the input array

	switch(len)
	{
	case 3: h ^= data[2] << 16;
	case 2: h ^= data[1] << 8;
	case 1: h ^= data[0];
		h *= m;
	};

	// Do a few final mixes of the hash to ensure the last few
	// bytes are well-incorporated.

	h ^= h >> 13;
	h *= m;
	h ^= h >> 15;

	return h;
}

/// Inverts a (h ^= h >> s) operation with 8 <= s <= 16
unsigned int invert_shift_xor(unsigned int hs, unsigned int s)
{
	XENSURE(s >= 8 && s <= 16);
	unsigned hs0 = hs >> 24;
	unsigned hs1 = (hs >> 16) & 0xff;
	unsigned hs2 = (hs >> 8) & 0xff;
	unsigned hs3 = hs & 0xff;

	unsigned h0 = hs0;
	unsigned h1 = hs1 ^ (h0 >> (s-8));
	unsigned h2 = (hs2 ^ (h0 << (16-s)) ^ (h1 >> (s-8))) & 0xff;
	unsigned h3 = (hs3 ^ (h1 << (16-s)) ^ (h2 >> (s-8))) & 0xff;
	return (h0<<24) + (h1<<16) + (h2<<8) + h3;
}

unsigned int murmur_hash_inverse(unsigned int h, unsigned int seed)
{
	const unsigned int m = 0x5bd1e995;
	const unsigned int minv = 0xe59b19bd;	// Multiplicative inverse of m under % 2^32
	const int r = 24;

	h = invert_shift_xor(h,15);
	h *= minv;
	h = invert_shift_xor(h,13);

	unsigned int hforward = seed ^ 4;
	hforward *= m;
	unsigned int k = hforward ^ h;
	k *= minv;
	k ^= k >> r;
	k *= minv;

	#ifdef PLATFORM_BIG_ENDIAN
		char *data = (char *)&k;
		k = (data[0]) + (data[1] << 8) + (data[2] << 16) + (data[3] << 24);
	#endif

	return k;
}

uint64 murmur_hash_64(const void * key, int len, uint64 seed)
{
	const uint64 m = 0xc6a4a7935bd1e995ULL;
	const int r = 47;

	uint64 h = seed ^ (len * m);

	const uint64 * data = (const uint64 *)key;
	const uint64 * end = data + (len/8);

	while(data != end)
	{
		#ifdef PLATFORM_BIG_ENDIAN
			uint64 k = *data++;
			char *p = (char *)&k;
			char c;
			c = p[0]; p[0] = p[7]; p[7] = c;
			c = p[1]; p[1] = p[6]; p[6] = c;
			c = p[2]; p[2] = p[5]; p[5] = c;
			c = p[3]; p[3] = p[4]; p[4] = c;
		#else
			uint64 k = *data++;
		#endif

		k *= m;
		k ^= k >> r;
		k *= m;
		
		h ^= k;
		h *= m;
	}

	const unsigned char * data2 = (const unsigned char*)data;

	switch(len & 7)
	{
	case 7: h ^= uint64(data2[6]) << 48;
	case 6: h ^= uint64(data2[5]) << 40;
	case 5: h ^= uint64(data2[4]) << 32;
	case 4: h ^= uint64(data2[3]) << 24;
	case 3: h ^= uint64(data2[2]) << 16;
	case 2: h ^= uint64(data2[1]) << 8;
	case 1: h ^= uint64(data2[0]);
			h *= m;
	};
 
	h ^= h >> r;
	h *= m;
	h ^= h >> r;

	return h;
}

uint64 murmur_hash_64_inverse(uint64 h, uint64 seed)
{
	const uint64 m = 0xc6a4a7935bd1e995ULL;
	const uint64 minv = 0x5f7a0ea7e59b19bdULL; // Multiplicative inverse of m under % 2^64
	const int r = 47;

	h ^= h >> r;
	h *= minv;
	h ^= h >> r;
	h *= minv;

	uint64 hforward = seed ^ (8 * m);
	uint64 k = h ^ hforward;

	k *= minv;
	k ^= k >> r;
	k *= minv;

	#ifdef PLATFORM_BIG_ENDIAN
		char *p = (char *)&k;
		char c;
		c = p[0]; p[0] = p[7]; p[7] = c;
		c = p[1]; p[1] = p[6]; p[6] = c;
		c = p[2]; p[2] = p[5]; p[5] = c;
		c = p[3]; p[3] = p[4]; p[4] = c;
	#endif
	
	return k;
}

4 comments:

  1. I just started using hashes instead of strings/guids, and I'm using murmurhash. I'm using murmurhash3 right now, but I don't think I could manage making my own inverse hash function in a decent amount of time.

    Might as well change to murmurhash2. I tried this code, but I seem to be missing the function/macro XENSURE.

    ReplyDelete
  2. XENSURE is just our internal assert function, you can replace it with your own assert.

    ReplyDelete
  3. What are the use cases of this?

    ReplyDelete