@
目录hllhdr
struct hllhdr { char magic[4]; /* "HYLL" */ uint8_t encoding; /* HLL_DENSE or HLL_SPARSE. */ uint8_t notused[3]; /* Reserved for future use, must be zero. */ uint8_t card[8]; /* Cached cardinality, little endian. */ uint8_t registers[]; /* Data bytes. */ };
一些宏定义
/* The cached cardinality MSB is used to signal validity of the cached value. */ #define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[0] |= (1<<7) #define HLL_VALID_CACHE(hdr) (((hdr)->card[0] & (1<<7)) == 0) #define HLL_P 14 /* The greater is P, the smaller the error. */ #define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */ #define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */ #define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */ #define HLL_REGISTER_MAX ((1<<HLL_BITS)-1) #define HLL_HDR_SIZE sizeof(struct hllhdr) #define HLL_DENSE_SIZE (HLL_HDR_SIZE+((HLL_REGISTERS*HLL_BITS+7)/8)) #define HLL_DENSE 0 /* Dense encoding. */ #define HLL_SPARSE 1 /* Sparse encoding. */ #define HLL_RAW 255 /* Only used internally, never exposed. */ #define HLL_MAX_ENCODING 1 static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected\r\n"; ...
哈希函数 O(1)
uint64_t MurmurHash64A (const void * key, int len, unsigned int seed) { const uint64_t m = 0xc6a4a7935bd1e995; const int r = 47; uint64_t h = seed ^ (len * m); const uint8_t *data = (const uint8_t *)key; const uint8_t *end = data + (len-(len&7)); while(data != end) { uint64_t k; #if (BYTE_ORDER == LITTLE_ENDIAN) k = *((uint64_t*)data); #else k = (uint64_t) data[0]; k |= (uint64_t) data[1] << 8; k |= (uint64_t) data[2] << 16; k |= (uint64_t) data[3] << 24; k |= (uint64_t) data[4] << 32; k |= (uint64_t) data[5] << 40; k |= (uint64_t) data[6] << 48; k |= (uint64_t) data[7] << 56; #endif k *= m; k ^= k >> r; k *= m; h ^= k; h *= m; data += 8; } switch(len & 7) { case 7: h ^= (uint64_t)data[6] << 48; case 6: h ^= (uint64_t)data[5] << 40; case 5: h ^= (uint64_t)data[4] << 32; case 4: h ^= (uint64_t)data[3] << 24; case 3: h ^= (uint64_t)data[2] << 16; case 2: h ^= (uint64_t)data[1] << 8; case 1: h ^= (uint64_t)data[0]; h *= m; }; h ^= h >> r; h *= m; h ^= h >> r; return h; }
// 返回000...1的长度 // regp 保存哈希桶index int hllPatLen(unsigned char *ele, size_t elesize, long *regp) { uint64_t hash, bit, index; int count; // 获取哈希值 hash = MurmurHash64A(ele,elesize,0xadc83b19ULL); index = hash & HLL_P_MASK; // (1 << 14)个桶的index hash |= ((uint64_t)1<<63);// 确保至少一个1 bit = HLL_REGISTERS; // 第15位开始算起 count = 1; // 第一个出现1的rank while((hash & bit) == 0) {// 直到找到1 count++; bit <<= 1; } *regp = (int) index;// (1 << 14)个桶的index return count;// 返回000...1的长度 }
int hllDenseAdd(uint8_t *registers, unsigned char *ele, size_t elesize) { uint8_t oldcount, count; long index; // 元素哈希处理 count = hllPatLen(ele,elesize,&index); // 000...1的长度更长则添加 HLL_DENSE_GET_REGISTER(oldcount,registers,index); if (count > oldcount) { HLL_DENSE_SET_REGISTER(registers,index,count); return 1; } else { return 0; } }
关于HyperLogLog可以详见本人之前的博客解析