In this program, I present a naive and clumsy equivalent of HyperLogLog. It might not, strictly speaking, be a qualified cardinality counter. Through a serious experiments, its performance is barely satisfactory and stable.
I build it not for practical use, but for teaching purpose. The idea of this program mainly comes from HyperLogLog, but my version is much easier with several lines. I recommend you read it before going through the source code of HyperLogLog.
The first time I come across HyperLogLog, its mechanism reminds me of the bitcoin. I mean for bitcoin miners, you have to work out the hash that with n leading 0s which will be a legal digest for the transactions that will be accepted by miner and ledger. We can always estimate the difficulty of generating such kind of hash sequence and the times we need to compute before reaching the desired outcome.
But you need to be aware that no matter you compute n or 2*n-1 times, there is probability that the length of longest leading 0s is the same one. Consequently, we need to use several buckets to make the statistics more uniformly distributed.
// a hash util helps generates more equally distributed outcomes. static MurmurHash hasher = null; // the leading bits of binary representation of the hash sequence that represent the bucket id int bucketBits = 8; // the total quantity of buckets int bucketNum = 1 << bucketBits; // the bucket array used to keep track of the length of longest leading 0s int[] buckets = newint[bucketNum];
publicstaticvoidmain(String[] args){ hasher = new TinyHyperLogLog().new MurmurHash();
TinyHyperLogLog hll = new TinyHyperLogLog();
for (int i = 0; i < 1500000; i++) { hll.add(i + ""); }
System.out.println(hll.getCardinality()); }
publicvoidadd(String val){ // hash function has significant impact on the final performance, it's recommended to use MurmurHash int hash = hasher.hash32(val);
// bucket id, be careful using >>>, rather than >> which will take the sign into consideration int bid = hash >>> (32 - bucketBits); int zeros = Integer.numberOfLeadingZeros(hash << bucketBits);
// update the length of longest leading 0s buckets[bid] = Math.max(buckets[bid], zeros); }
publicintgetCardinality(){ // harmonic mean, this one is recommended. double avg1 = 0;
// arithmetic average, not recommended. double avg2 = 0;
for (int i = 0; i < bucketNum; i++) { if (buckets[i] == 0) { avg1 += 1.0 / 1; } else { avg1 += 1.0 / buckets[i]; }
/** * murmur hash 2.0. * <p> * The murmur hash is a relatively fast hash function from * http://murmurhash.googlepages.com/ for platforms with efficient * multiplication. * <p> * This is a re-implementation of the original C code plus some * additional features. * <p> * Public domain. * * @author Viliam Holub * @version 1.0.2 */ finalclassMurmurHash{ privateMurmurHash(){ }
/** * Generates 32 bit hash from byte array of the given length and * seed. * * @param data byte array to hash * @param length length of the array to hash * @param seed initial seed value * @return 32 bit hash of the given array */ publicinthash32(finalbyte[] data, int length, int seed){ // 'm' and 'r' are mixing constants generated offline. // They're not really 'magic', they just happen to work well. finalint m = 0x5bd1e995; finalint r = 24;
// Initialize the hash to a random value int h = seed ^ length; int length4 = length / 4; for (int i = 0; i < length4; i++) { finalint i4 = i * 4; int k = (data[i4 + 0] & 0xff) + ((data[i4 + 1] & 0xff) << 8) + ((data[i4 + 2] & 0xff) << 16) + ((data[i4 + 3] & 0xff) << 24); k *= m; k ^= k >>> r; k *= m; h *= m; h ^= k; }
// Handle the last few bytes of the input array switch (length % 4) { case3: h ^= (data[(length & ~3) + 2] & 0xff) << 16; case2: h ^= (data[(length & ~3) + 1] & 0xff) << 8; case1: h ^= (data[length & ~3] & 0xff); h *= m; } h ^= h >>> 13; h *= m; h ^= h >>> 15; return h; }
/** * Generates 32 bit hash from byte array with default seed value. * * @param data byte array to hash * @param length length of the array to hash * @return 32 bit hash of the given array */ publicinthash32(finalbyte[] data, int length){ return hash32(data, length, 0x9747b28c); }
/** * Generates 32 bit hash from a string. * * @param text string to hash * @return 32 bit hash of the given string */ publicinthash32(final String text){ finalbyte[] bytes = text.getBytes(); return hash32(bytes, bytes.length); }
/** * Generates 32 bit hash from a substring. * * @param text string to hash * @param from starting index * @param length length of the substring to hash * @return 32 bit hash of the given string */ publicinthash32(final String text, int from, int length){ return hash32(text.substring(from, from + length)); }
/** * Generates 64 bit hash from byte array of the given length and seed. * * @param data byte array to hash * @param length length of the array to hash * @param seed initial seed value * @return 64 bit hash of the given array */ publiclonghash64(finalbyte[] data, int length, int seed){ finallong m = 0xc6a4a7935bd1e995L; finalint r = 47; long h = (seed & 0xffffffffl) ^ (length * m); int length8 = length / 8; for (int i = 0; i < length8; i++) { finalint i8 = i * 8; long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 0xff) << 8) + (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 + 3] & 0xff) << 24) + (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 + 5] & 0xff) << 40) + (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 + 7] & 0xff) << 56); k *= m; k ^= k >>> r; k *= m; h ^= k; h *= m; } switch (length % 8) { case7: h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48; case6: h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40; case5: h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32; case4: h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24; case3: h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16; case2: h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8; case1: h ^= (long) (data[length & ~7] & 0xff); h *= m; } ; h ^= h >>> r; h *= m; h ^= h >>> r; return h; }
/** * Generates 64 bit hash from byte array with default seed value. * * @param data byte array to hash * @param length length of the array to hash * @return 64 bit hash of the given string */ publiclonghash64(finalbyte[] data, int length){ return hash64(data, length, 0xe17a1465); }
/** * Generates 64 bit hash from a string. * * @param text string to hash * @return 64 bit hash of the given string */ publiclonghash64(final String text){ finalbyte[] bytes = text.getBytes(); return hash64(bytes, bytes.length); }
/** * Generates 64 bit hash from a substring. * * @param text string to hash * @param from starting index * @param length length of the substring to hash * @return 64 bit hash of the given array */ publiclonghash64(final String text, int from, int length){ return hash64(text.substring(from, from + length)); } } }