HyperLogLog 에 대해 잠깐 테스트를 해 보았습니다.

BS가 근무하는 팀에서 이런 저런 스터디를 하는데 Redis 에 대해서 스터디 중입니다.
이번 스터디에 Redis 의 PFADD, PFCOUNT, PFMERGE 를 다루게 되어 조금 찾아 보았습니다.

참고자료 #1: [성태의 닷넷 이야기]
참고자료 #2: [Naver D2]
참고자료 #3: [Adnan.Korkmaz Blog]
참고자료 #4: [Microsoft CardinalityEstimation]
참고자료 #5: [Wikipedia]

위 자료에서 3, 4번의 소스와 라이브러리를 받아 간단하게 테스트 프로그램을 작성하였습니다.

using CardinalityEstimation;
using System;
using System.Collections.Generic;
using System.Security.Cryptography;
using System.Text;

namespace HyperLogLogTest {

    // from http://adnan-korkmaz.blogspot.com/2012/06/hyperloglog-c-implementation.html

    public class HyperLogLog {
        private readonly double mapSize, alpha_m, k;
        private readonly int kComplement;
        private readonly Dictionary<int, int> Lookup = new Dictionary<int, int>();
        private const double pow_2_32 = 4294967297;
        private readonly Func<object, uint> hashFunc;

        public HyperLogLog(double stdError, Func<object, uint> hashFunc) {
            this.hashFunc = hashFunc;

            mapSize = 1.04 / stdError;
            k = (long)Math.Ceiling(Log2(mapSize * mapSize));

            kComplement = 32 - (int)k;
            mapSize = (long)Math.Pow(2, k);

            alpha_m = mapSize == 16 ? 0.673
                  : mapSize == 32 ? 0.697
                  : mapSize == 64 ? 0.709
                  : 0.7213 / (1 + 1.079 / mapSize);
            for (int i = 0; i < mapSize; i++)
                Lookup[i] = 0;
        }

        private static double Log2(double x) {
            return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2 
        }
        private static int GetRank(uint hash, int max) {
            int r = 1;
            uint one = 1;
            while ((hash & one) == 0 && r <= max) { ++r; hash >>= 1;
            }
            return r;
        }

        public int Count() {
            double c = 0, E;

            for (var i = 0; i < mapSize; i++)
                c += 1d / Math.Pow(2, Lookup[i]);

            E = alpha_m * mapSize * mapSize / c;

            // Make corrections & smoothen things. 
            if (E <= (5 / 2) * mapSize) {
                double V = 0;
                for (var i = 0; i < mapSize; i++) if (Lookup[i] == 0) V++; if (V > 0)
                    E = mapSize * Math.Log(mapSize / V);
            }
            else
                if (E > (1 / 30) * pow_2_32)
                E = -pow_2_32 * Math.Log(1 - E / pow_2_32);
            // Made corrections & smoothen things, or not. 

            return (int)E;
        }

        public void Add(object val) {
            uint hashCode = hashFunc(val);
            int j = (int)(hashCode >> kComplement);

            Lookup[j] = Math.Max(Lookup[j], GetRank(hashCode, kComplement));
        }
    }

    class Program {

        static MD5 md5 = MD5.Create();
        static readonly string outputFormat = "{0,-40}: {1,10}: {2,10}";

        static uint CustomHash(object val) {
            string text = val.ToString();
            uint hash = 0;

            for (int i = 0, l = text.Length; i < l; i++) {
                hash += text[i];
                hash += hash << 10; hash ^= hash >> 6;
            }
            hash += hash << 3; hash ^= hash >> 6;
            hash += hash << 16;

            return hash;
        }

        static uint DefaultHash(object val) {
            return (uint)val.GetHashCode();
        }

        static uint MD5Hash(object val) {
            var bt = md5.ComputeHash(Encoding.UTF8.GetBytes(val.ToString()));
            return BitConverter.ToUInt32(bt, 0) ^ BitConverter.ToUInt32(bt, 4) ^ BitConverter.ToUInt32(bt, 8) ^ BitConverter.ToUInt32(bt, 12);
        }

        static void HyperLogLogTest(Func<object, uint> hashFunc, string name, int testCount) {
            var hll = new HyperLogLog(0.01, hashFunc);
            for (var i = 0; i < testCount; i++)
                hll.Add(i);
            Console.WriteLine(outputFormat, name, testCount, hll.Count());
        }

        // from https://github.com/Microsoft/CardinalityEstimation
        static void MSHyperLogLogTestFnv1A(Func<object, uint> hashFunc, string name, int testCount) {
            var hllMS = new CardinalityEstimator(hashFunctionId: CardinalityEstimation.Hash.HashFunctionId.Fnv1A);
            for (var i = 0; i < testCount; i++)
                hllMS.Add(i);
            Console.WriteLine(outputFormat, name, testCount, hllMS.Count());
        }

        // from https://github.com/Microsoft/CardinalityEstimation
        static void MSHyperLogLogTestMurmur3(Func<object, uint> hashFunc, string name, int testCount) {
            var hllMS = new CardinalityEstimator(hashFunctionId: CardinalityEstimation.Hash.HashFunctionId.Murmur3);
            for (var i = 0; i < testCount; i++)
                hllMS.Add(i);
            Console.WriteLine(outputFormat, name, testCount, hllMS.Count());
        }

        static void Main(string[] args) {
            HyperLogLogTest(DefaultHash, "Adnan.Korkmaz: DefaultHash", 10);
            HyperLogLogTest(DefaultHash, "Adnan.Korkmaz: DefaultHash", 10000);
            HyperLogLogTest(MD5Hash, "Adnan.Korkmaz: MD5 + XOR", 10);
            HyperLogLogTest(MD5Hash, "Adnan.Korkmaz: MD5 + XOR", 10000);
            HyperLogLogTest(CustomHash, "Adnan.Korkmaz: Adnan.Korkmaz", 10);
            HyperLogLogTest(CustomHash, "Adnan.Korkmaz: Adnan.Korkmaz", 10000);
            MSHyperLogLogTestFnv1A(CustomHash, "CardinalityEstimation with Fnv1A", 10);
            MSHyperLogLogTestFnv1A(CustomHash, "CardinalityEstimation with Fnv1A", 10000);
            MSHyperLogLogTestMurmur3(CustomHash, "CardinalityEstimation with Murmur3", 10);
            MSHyperLogLogTestMurmur3(CustomHash, "CardinalityEstimation with Murmur3", 10000);
        }
    }
}

최대한 비슷한 결과를 얻기 위해 두 라이브러리 모두 레지스터 수량을 16K 개(pow(2, 14)) 로 하였습니다.
그리고 결과는 아래와 같이 나왔습니다.

Adnan.Korkmaz: DefaultHash              :         10:          1
Adnan.Korkmaz: DefaultHash              :      10000:          1
Adnan.Korkmaz: MD5 + XOR                :         10:         10
Adnan.Korkmaz: MD5 + XOR                :      10000:      10068
Adnan.Korkmaz: Adnan.Korkmaz            :         10:         10
Adnan.Korkmaz: Adnan.Korkmaz            :      10000:       9853
CardinalityEstimation with Fnv1A        :         10:         10
CardinalityEstimation with Fnv1A        :      10000:      11419
CardinalityEstimation with Murmur3      :         10:         10
CardinalityEstimation with Murmur3      :      10000:       9980

여기에서 확인되는 것은 동일한 알고리즘으로 구현된 것이라도 어떤 Hash 함수를 넣는가에 따라서 굉장히 다른 결과를 얻게 된다는 것입니다.
C#의 GetHashCode() 의 경우에는… 쩝…

Redis 의 PFADD, PFCOUNT 를 쓰면

var redis = require('redis');
var client = redis.createClient();

function test(n) {
  client.flushall();
  for (i = 0; i < n; i++)
    client.pfadd("hll", i);
  client.pfcount("hll", function(e, r) {
    console.log(n, r);
  });
}

test(10);
test(10000);

client.flushall();
client.quit();
10 10
10000 9999

이렇게 나오네요.