A Tiny HyperLogLog

In this program, I present a naive and clumsy equivalent of HyperLogLog. It might not, strictly speaking, be a qualified cardinality counter. Through a serious experiments, its performance is barely satisfactory and stable.

I build it not for practical use, but for teaching purpose. The idea of this program mainly comes from HyperLogLog, but my version is much easier with several lines. I recommend you read it before going through the source code of HyperLogLog.

The first time I come across HyperLogLog, its mechanism reminds me of the bitcoin. I mean for bitcoin miners, you have to work out the hash that with n leading 0s which will be a legal digest for the transactions that will be accepted by miner and ledger. We can always estimate the difficulty of generating such kind of hash sequence and the times we need to compute before reaching the desired outcome.

But you need to be aware that no matter you compute n or 2*n-1 times, there is probability that the length of longest leading 0s is the same one. Consequently, we need to use several buckets to make the statistics more uniformly distributed.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
public class TinyHyperLogLog {

// a hash util helps generates more equally distributed outcomes.
static MurmurHash hasher = null;
// the leading bits of binary representation of the hash sequence that represent the bucket id
int bucketBits = 8;
// the total quantity of buckets
int bucketNum = 1 << bucketBits;
// the bucket array used to keep track of the length of longest leading 0s
int[] buckets = new int[bucketNum];

public static void main(String[] args) {
hasher = new TinyHyperLogLog().new MurmurHash();

TinyHyperLogLog hll = new TinyHyperLogLog();

for (int i = 0; i < 1500000; i++) {
hll.add(i + "");
}

System.out.println(hll.getCardinality());
}

public void add(String val) {
// hash function has significant impact on the final performance, it's recommended to use MurmurHash
int hash = hasher.hash32(val);

// bucket id, be careful using >>>, rather than >> which will take the sign into consideration
int bid = hash >>> (32 - bucketBits);
int zeros = Integer.numberOfLeadingZeros(hash << bucketBits);

// update the length of longest leading 0s
buckets[bid] = Math.max(buckets[bid], zeros);
}

public int getCardinality() {
// harmonic mean, this one is recommended.
double avg1 = 0;

// arithmetic average, not recommended.
double avg2 = 0;

for (int i = 0; i < bucketNum; i++) {
if (buckets[i] == 0) {
avg1 += 1.0 / 1;
} else {
avg1 += 1.0 / buckets[i];
}

avg2 += buckets[i];
}

return (int) (bucketNum * Math.pow(2, bucketNum / avg1));
// return (int)(bucketNum * Math.pow(2, avg2/ bucketNum));
}

/**
* murmur hash 2.0.
* <p>
* The murmur hash is a relatively fast hash function from
* http://murmurhash.googlepages.com/ for platforms with efficient
* multiplication.
* <p>
* This is a re-implementation of the original C code plus some
* additional features.
* <p>
* Public domain.
*
* @author Viliam Holub
* @version 1.0.2
*/
final class MurmurHash {
private MurmurHash() {
}

/**
* Generates 32 bit hash from byte array of the given length and
* seed.
*
* @param data byte array to hash
* @param length length of the array to hash
* @param seed initial seed value
* @return 32 bit hash of the given array
*/
public int hash32(final byte[] data, int length, int seed) {
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
final int m = 0x5bd1e995;
final int r = 24;

// Initialize the hash to a random value
int h = seed ^ length;
int length4 = length / 4;
for (int i = 0; i < length4; i++) {
final int i4 = i * 4;
int k = (data[i4 + 0] & 0xff) + ((data[i4 + 1] & 0xff) << 8)
+ ((data[i4 + 2] & 0xff) << 16) + ((data[i4 + 3] & 0xff) << 24);
k *= m;
k ^= k >>> r;
k *= m;
h *= m;
h ^= k;
}

// Handle the last few bytes of the input array
switch (length % 4) {
case 3:
h ^= (data[(length & ~3) + 2] & 0xff) << 16;
case 2:
h ^= (data[(length & ~3) + 1] & 0xff) << 8;
case 1:
h ^= (data[length & ~3] & 0xff);
h *= m;
}
h ^= h >>> 13;
h *= m;
h ^= h >>> 15;
return h;
}

/**
* Generates 32 bit hash from byte array with default seed value.
*
* @param data byte array to hash
* @param length length of the array to hash
* @return 32 bit hash of the given array
*/
public int hash32(final byte[] data, int length) {
return hash32(data, length, 0x9747b28c);
}

/**
* Generates 32 bit hash from a string.
*
* @param text string to hash
* @return 32 bit hash of the given string
*/
public int hash32(final String text) {
final byte[] bytes = text.getBytes();
return hash32(bytes, bytes.length);
}

/**
* Generates 32 bit hash from a substring.
*
* @param text string to hash
* @param from starting index
* @param length length of the substring to hash
* @return 32 bit hash of the given string
*/
public int hash32(final String text, int from, int length) {
return hash32(text.substring(from, from + length));
}

/**
* Generates 64 bit hash from byte array of the given length and seed.
*
* @param data byte array to hash
* @param length length of the array to hash
* @param seed initial seed value
* @return 64 bit hash of the given array
*/
public long hash64(final byte[] data, int length, int seed) {
final long m = 0xc6a4a7935bd1e995L;
final int r = 47;
long h = (seed & 0xffffffffl) ^ (length * m);
int length8 = length / 8;
for (int i = 0; i < length8; i++) {
final int i8 = i * 8;
long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 0xff) << 8)
+ (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 + 3] & 0xff) << 24)
+ (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 + 5] & 0xff) << 40)
+ (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 + 7] & 0xff) << 56);
k *= m;
k ^= k >>> r;
k *= m;
h ^= k;
h *= m;
}
switch (length % 8) {
case 7:
h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48;
case 6:
h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40;
case 5:
h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32;
case 4:
h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24;
case 3:
h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16;
case 2:
h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8;
case 1:
h ^= (long) (data[length & ~7] & 0xff);
h *= m;
}
;
h ^= h >>> r;
h *= m;
h ^= h >>> r;
return h;
}

/**
* Generates 64 bit hash from byte array with default seed value.
*
* @param data byte array to hash
* @param length length of the array to hash
* @return 64 bit hash of the given string
*/
public long hash64(final byte[] data, int length) {
return hash64(data, length, 0xe17a1465);
}

/**
* Generates 64 bit hash from a string.
*
* @param text string to hash
* @return 64 bit hash of the given string
*/
public long hash64(final String text) {
final byte[] bytes = text.getBytes();
return hash64(bytes, bytes.length);
}

/**
* Generates 64 bit hash from a substring.
*
* @param text string to hash
* @param from starting index
* @param length length of the substring to hash
* @return 64 bit hash of the given array
*/
public long hash64(final String text, int from, int length) {
return hash64(text.substring(from, from + length));
}
}
}

References

  1. https://odino.org/my-favorite-data-structure-hyperloglog/
  2. http://content.research.neustar.biz/blog/hll.html
  3. https://www.jianshu.com/p/55defda6dcd2
  4. https://github.com/tnm/murmurhash-java