diff --git a/oss/libpopcnt/cgmanifest.json b/oss/libpopcnt/cgmanifest.json
index 02ac4729e0..b65c9c29b0 100644
--- a/oss/libpopcnt/cgmanifest.json
+++ b/oss/libpopcnt/cgmanifest.json
@@ -6,7 +6,7 @@
       "type": "git",
       "git": {
         "repositoryUrl": "https://github.com/kimwalisch/libpopcnt",
-        "commitHash": "043a99fba31121a70bcb2f589faa17f534ae6085"
+        "commitHash": "c49987e90e56191c399cab881ab87b5daecc9b8e"
       }
     }
   }
diff --git a/oss/libpopcnt/libpopcnt.h b/oss/libpopcnt/libpopcnt.h
index e23b8e4081..ffcd976b03 100644
--- a/oss/libpopcnt/libpopcnt.h
+++ b/oss/libpopcnt/libpopcnt.h
@@ -1,841 +1,798 @@
-/*
- * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
- * population count) in an array as quickly as possible using
- * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
- *
- * Copyright (c) 2016 - 2019, Kim Walisch
- * Copyright (c) 2016 - 2018, Wojciech Muła
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */ - -#ifndef LIBPOPCNT_H -#define LIBPOPCNT_H - -#include - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -#ifndef __has_attribute - #define __has_attribute(x) 0 -#endif - -#ifdef __GNUC__ - #define GNUC_PREREQ(x, y) \ - (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y)) -#else - #define GNUC_PREREQ(x, y) 0 -#endif - -#ifdef __clang__ - #define CLANG_PREREQ(x, y) \ - (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y)) -#else - #define CLANG_PREREQ(x, y) 0 -#endif - -#if (_MSC_VER < 1900) && \ - !defined(__cplusplus) - #define inline __inline -#endif - -#if (defined(__i386__) || \ - defined(__x86_64__) || \ - defined(_M_IX86) || \ - defined(_M_X64)) - #define X86_OR_X64 -#endif - -#if defined(X86_OR_X64) && \ - (defined(__cplusplus) || \ - defined(_MSC_VER) || \ - (GNUC_PREREQ(4, 2) || \ - __has_builtin(__sync_val_compare_and_swap))) - #define HAVE_CPUID -#endif - -#if GNUC_PREREQ(4, 2) || \ - __has_builtin(__builtin_popcount) - #define HAVE_BUILTIN_POPCOUNT -#endif - -#if GNUC_PREREQ(4, 2) || \ - CLANG_PREREQ(3, 0) - #define HAVE_ASM_POPCNT -#endif - -#if defined(HAVE_CPUID) && \ - (defined(HAVE_ASM_POPCNT) || \ - defined(_MSC_VER)) - #define HAVE_POPCNT -#endif - -#if defined(HAVE_CPUID) && \ - GNUC_PREREQ(4, 9) - #define HAVE_AVX2 -#endif - -#if defined(HAVE_CPUID) && \ - GNUC_PREREQ(5, 0) - #define HAVE_AVX512 -#endif - -#if defined(HAVE_CPUID) && \ - defined(_MSC_VER) && \ - defined(__AVX2__) - #define HAVE_AVX2 -#endif - -#if defined(HAVE_CPUID) && \ - defined(_MSC_VER) && \ - defined(__AVX512__) - #define HAVE_AVX512 -#endif - -#if defined(HAVE_CPUID) && \ - CLANG_PREREQ(3, 8) && \ - __has_attribute(target) && \ - (!defined(_MSC_VER) || defined(__AVX2__)) && \ - (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) - #define HAVE_AVX2 - #define HAVE_AVX512 -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * This uses fewer arithmetic operations than any other known - * implementation on machines with fast multiplication. - * It uses 12 arithmetic operations, one of which is a multiply. 
- * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation - */ -static inline uint64_t popcount64(uint64_t x) -{ - uint64_t m1 = 0x5555555555555555ll; - uint64_t m2 = 0x3333333333333333ll; - uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; - uint64_t h01 = 0x0101010101010101ll; - - x -= (x >> 1) & m1; - x = (x & m2) + ((x >> 2) & m2); - x = (x + (x >> 4)) & m4; - - return (x * h01) >> 56; -} - -#if defined(HAVE_ASM_POPCNT) && \ - defined(__x86_64__) - -static inline uint64_t popcnt64(uint64_t x) -{ - __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); - return x; -} - -#elif defined(HAVE_ASM_POPCNT) && \ - defined(__i386__) - -static inline uint32_t popcnt32(uint32_t x) -{ - __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); - return x; -} - -static inline uint64_t popcnt64(uint64_t x) -{ - return popcnt32((uint32_t) x) + - popcnt32((uint32_t)(x >> 32)); -} - -#elif defined(_MSC_VER) && \ - defined(_M_X64) - -#include - -static inline uint64_t popcnt64(uint64_t x) -{ - return _mm_popcnt_u64(x); -} - -#elif defined(_MSC_VER) && \ - defined(_M_IX86) - -#include - -static inline uint64_t popcnt64(uint64_t x) -{ - return _mm_popcnt_u32((uint32_t) x) + - _mm_popcnt_u32((uint32_t)(x >> 32)); -} - -/* non x86 CPUs */ -#elif defined(HAVE_BUILTIN_POPCOUNT) - -static inline uint64_t popcnt64(uint64_t x) -{ - return __builtin_popcountll(x); -} - -/* no hardware POPCNT, - * use pure integer algorithm */ -#else - -static inline uint64_t popcnt64(uint64_t x) -{ - return popcount64(x); -} - -#endif - -static inline uint64_t popcnt64_unrolled(const uint64_t* data, uint64_t size) -{ - uint64_t i = 0; - uint64_t limit = size - size % 4; - uint64_t cnt = 0; - - for (; i < limit; i += 4) - { - cnt += popcnt64(data[i+0]); - cnt += popcnt64(data[i+1]); - cnt += popcnt64(data[i+2]); - cnt += popcnt64(data[i+3]); - } - - for (; i < size; i++) - cnt += popcnt64(data[i]); - - return cnt; -} - -#if defined(HAVE_CPUID) - -#if defined(_MSC_VER) - #include - #include -#endif - -/* %ecx bit flags */ -#define bit_POPCNT (1 << 23) - -/* %ebx bit flags */ -#define bit_AVX2 (1 << 5) -#define bit_AVX512 (1 << 30) - -/* xgetbv bit flags */ -#define XSTATE_SSE (1 << 1) -#define XSTATE_YMM (1 << 2) -#define XSTATE_ZMM (7 << 5) - -static inline void run_cpuid(int eax, int ecx, int* abcd) -{ -#if defined(_MSC_VER) - __cpuidex(abcd, eax, ecx); -#else - int ebx = 0; - int edx = 0; - - #if defined(__i386__) && \ - defined(__PIC__) - /* in case of PIC under 32-bit EBX cannot be clobbered */ - __asm__ ("movl %%ebx, %%edi;" - "cpuid;" - "xchgl %%ebx, %%edi;" - : "=D" (ebx), - "+a" (eax), - "+c" (ecx), - "=d" (edx)); - #else - __asm__ ("cpuid;" - : "+b" (ebx), - "+a" (eax), - "+c" (ecx), - "=d" (edx)); - #endif - - abcd[0] = eax; - abcd[1] = ebx; - abcd[2] = ecx; - abcd[3] = edx; -#endif -} - -#if defined(HAVE_AVX2) || \ - defined(HAVE_AVX512) - -static inline int get_xcr0() -{ - int xcr0; - -#if defined(_MSC_VER) - xcr0 = (int) _xgetbv(0); -#else - __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); -#endif - - return xcr0; -} - -#endif - -static inline int get_cpuid() -{ - int flags = 0; - int abcd[4]; - - run_cpuid(1, 0, abcd); - - if ((abcd[2] & bit_POPCNT) == bit_POPCNT) - flags |= bit_POPCNT; - -#if defined(HAVE_AVX2) || \ - defined(HAVE_AVX512) - - int osxsave_mask = (1 << 27); - - /* ensure OS supports extended processor state management */ - if ((abcd[2] & osxsave_mask) != osxsave_mask) - return 0; - - int ymm_mask = XSTATE_SSE | XSTATE_YMM; - int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; - - int xcr0 = get_xcr0(); - - if 
((xcr0 & ymm_mask) == ymm_mask) - { - run_cpuid(7, 0, abcd); - - if ((abcd[1] & bit_AVX2) == bit_AVX2) - flags |= bit_AVX2; - - if ((xcr0 & zmm_mask) == zmm_mask) - { - if ((abcd[1] & bit_AVX512) == bit_AVX512) - flags |= bit_AVX512; - } - } - -#endif - - return flags; -} - -#endif /* cpuid */ - -#if defined(HAVE_AVX2) - -#include - -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif -static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) -{ - __m256i u = _mm256_xor_si256(a, b); - *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); - *l = _mm256_xor_si256(u, c); -} - -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif -static inline __m256i popcnt256(__m256i v) -{ - __m256i lookup1 = _mm256_setr_epi8( - 4, 5, 5, 6, 5, 6, 6, 7, - 5, 6, 6, 7, 6, 7, 7, 8, - 4, 5, 5, 6, 5, 6, 6, 7, - 5, 6, 6, 7, 6, 7, 7, 8 - ); - - __m256i lookup2 = _mm256_setr_epi8( - 4, 3, 3, 2, 3, 2, 2, 1, - 3, 2, 2, 1, 2, 1, 1, 0, - 4, 3, 3, 2, 3, 2, 2, 1, - 3, 2, 2, 1, 2, 1, 1, 0 - ); - - __m256i low_mask = _mm256_set1_epi8(0x0f); - __m256i lo = _mm256_and_si256(v, low_mask); - __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); - __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); - __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); - - return _mm256_sad_epu8(popcnt1, popcnt2); -} - -/* - * AVX2 Harley-Seal popcount (4th iteration). - * The algorithm is based on the paper "Faster Population Counts - * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and - * Wojciech Mula (23 Nov 2016). - * @see https://arxiv.org/abs/1611.07612 - */ -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif -static inline uint64_t popcnt_avx2(const __m256i* data, uint64_t size) -{ - __m256i cnt = _mm256_setzero_si256(); - __m256i ones = _mm256_setzero_si256(); - __m256i twos = _mm256_setzero_si256(); - __m256i fours = _mm256_setzero_si256(); - __m256i eights = _mm256_setzero_si256(); - __m256i sixteens = _mm256_setzero_si256(); - __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; - - uint64_t i = 0; - uint64_t limit = size - size % 16; - uint64_t* cnt64; - - for(; i < limit; i += 16) - { - CSA256(&twosA, &ones, ones, data[i+0], data[i+1]); - CSA256(&twosB, &ones, ones, data[i+2], data[i+3]); - CSA256(&foursA, &twos, twos, twosA, twosB); - CSA256(&twosA, &ones, ones, data[i+4], data[i+5]); - CSA256(&twosB, &ones, ones, data[i+6], data[i+7]); - CSA256(&foursB, &twos, twos, twosA, twosB); - CSA256(&eightsA, &fours, fours, foursA, foursB); - CSA256(&twosA, &ones, ones, data[i+8], data[i+9]); - CSA256(&twosB, &ones, ones, data[i+10], data[i+11]); - CSA256(&foursA, &twos, twos, twosA, twosB); - CSA256(&twosA, &ones, ones, data[i+12], data[i+13]); - CSA256(&twosB, &ones, ones, data[i+14], data[i+15]); - CSA256(&foursB, &twos, twos, twosA, twosB); - CSA256(&eightsB, &fours, fours, foursA, foursB); - CSA256(&sixteens, &eights, eights, eightsA, eightsB); - - cnt = _mm256_add_epi64(cnt, popcnt256(sixteens)); - } - - cnt = _mm256_slli_epi64(cnt, 4); - cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3)); - cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2)); - cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1)); - cnt = _mm256_add_epi64(cnt, popcnt256(ones)); - - for(; i < size; i++) - cnt = _mm256_add_epi64(cnt, popcnt256(data[i])); - - cnt64 = (uint64_t*) &cnt; - - return cnt64[0] + - cnt64[1] + - cnt64[2] + - cnt64[3]; -} - -/* Align memory to 32 bytes boundary */ -static inline void 
align_avx2(const uint8_t** p, uint64_t* size, uint64_t* cnt) -{ - for (; (uintptr_t) *p % 8; (*p)++) - { - *cnt += popcnt64(**p); - *size -= 1; - } - for (; (uintptr_t) *p % 32; (*p) += 8) - { - *cnt += popcnt64( - *(const uint64_t*) *p); - *size -= 8; - } -} - -#endif - -#if defined(HAVE_AVX512) - -#include - -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif -static inline __m512i popcnt512(__m512i v) -{ - __m512i m1 = _mm512_set1_epi8(0x55); - __m512i m2 = _mm512_set1_epi8(0x33); - __m512i m4 = _mm512_set1_epi8(0x0F); - __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1)); - __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2)); - __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4; - - return _mm512_sad_epu8(t3, _mm512_setzero_si512()); -} - -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif -static inline void CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) -{ - *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); - *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); -} - -/* - * AVX512 Harley-Seal popcount (4th iteration). - * The algorithm is based on the paper "Faster Population Counts - * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and - * Wojciech Mula (23 Nov 2016). - * @see https://arxiv.org/abs/1611.07612 - */ -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif -static inline uint64_t popcnt_avx512(const __m512i* data, const uint64_t size) -{ - __m512i cnt = _mm512_setzero_si512(); - __m512i ones = _mm512_setzero_si512(); - __m512i twos = _mm512_setzero_si512(); - __m512i fours = _mm512_setzero_si512(); - __m512i eights = _mm512_setzero_si512(); - __m512i sixteens = _mm512_setzero_si512(); - __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; - - uint64_t i = 0; - uint64_t limit = size - size % 16; - uint64_t* cnt64; - - for(; i < limit; i += 16) - { - CSA512(&twosA, &ones, ones, data[i+0], data[i+1]); - CSA512(&twosB, &ones, ones, data[i+2], data[i+3]); - CSA512(&foursA, &twos, twos, twosA, twosB); - CSA512(&twosA, &ones, ones, data[i+4], data[i+5]); - CSA512(&twosB, &ones, ones, data[i+6], data[i+7]); - CSA512(&foursB, &twos, twos, twosA, twosB); - CSA512(&eightsA, &fours, fours, foursA, foursB); - CSA512(&twosA, &ones, ones, data[i+8], data[i+9]); - CSA512(&twosB, &ones, ones, data[i+10], data[i+11]); - CSA512(&foursA, &twos, twos, twosA, twosB); - CSA512(&twosA, &ones, ones, data[i+12], data[i+13]); - CSA512(&twosB, &ones, ones, data[i+14], data[i+15]); - CSA512(&foursB, &twos, twos, twosA, twosB); - CSA512(&eightsB, &fours, fours, foursA, foursB); - CSA512(&sixteens, &eights, eights, eightsA, eightsB); - - cnt = _mm512_add_epi64(cnt, popcnt512(sixteens)); - } - - cnt = _mm512_slli_epi64(cnt, 4); - cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(eights), 3)); - cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(fours), 2)); - cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(twos), 1)); - cnt = _mm512_add_epi64(cnt, popcnt512(ones)); - - for(; i < size; i++) - cnt = _mm512_add_epi64(cnt, popcnt512(data[i])); - - cnt64 = (uint64_t*) &cnt; - - return cnt64[0] + - cnt64[1] + - cnt64[2] + - cnt64[3] + - cnt64[4] + - cnt64[5] + - cnt64[6] + - cnt64[7]; -} - -/* Align memory to 64 bytes boundary */ -static inline void align_avx512(const uint8_t** p, uint64_t* size, uint64_t* cnt) -{ - for (; (uintptr_t) *p % 8; (*p)++) - { - *cnt += popcnt64(**p); - *size -= 1; - } - for (; (uintptr_t) *p % 64; (*p) += 8) - { - *cnt += popcnt64( - *(const 
uint64_t*) *p); - *size -= 8; - } -} - -#endif - -/* x86 CPUs */ -#if defined(X86_OR_X64) - -/* Align memory to 8 bytes boundary */ -static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt) -{ - for (; *size > 0 && (uintptr_t) *p % 8; (*p)++) - { - *cnt += popcount64(**p); - *size -= 1; - } -} - -static inline uint64_t popcount64_unrolled(const uint64_t* data, uint64_t size) -{ - uint64_t i = 0; - uint64_t limit = size - size % 4; - uint64_t cnt = 0; - - for (; i < limit; i += 4) - { - cnt += popcount64(data[i+0]); - cnt += popcount64(data[i+1]); - cnt += popcount64(data[i+2]); - cnt += popcount64(data[i+3]); - } - - for (; i < size; i++) - cnt += popcount64(data[i]); - - return cnt; -} - -/* - * Count the number of 1 bits in the data array - * @data: An array - * @size: Size of data in bytes - */ -static inline uint64_t popcnt(const void* data, uint64_t size) -{ - const uint8_t* ptr = (const uint8_t*) data; - uint64_t cnt = 0; - uint64_t i; - -#if defined(HAVE_CPUID) - #if defined(__cplusplus) - /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); - #else - static int cpuid_ = -1; - int cpuid = cpuid_; - if (cpuid == -1) - { - cpuid = get_cpuid(); - - #if defined(_MSC_VER) - _InterlockedCompareExchange(&cpuid_, cpuid, -1); - #else - __sync_val_compare_and_swap(&cpuid_, -1, cpuid); - #endif - } - #endif -#endif - -#if defined(HAVE_AVX512) - - /* AVX512 requires arrays >= 1024 bytes */ - if ((cpuid & bit_AVX512) && - size >= 1024) - { - align_avx512(&ptr, &size, &cnt); - cnt += popcnt_avx512((const __m512i*) ptr, size / 64); - ptr += size - size % 64; - size = size % 64; - } - -#endif - -#if defined(HAVE_AVX2) - - /* AVX2 requires arrays >= 512 bytes */ - if ((cpuid & bit_AVX2) && - size >= 512) - { - align_avx2(&ptr, &size, &cnt); - cnt += popcnt_avx2((const __m256i*) ptr, size / 32); - ptr += size - size % 32; - size = size % 32; - } - -#endif - -#if defined(HAVE_POPCNT) - - if (cpuid & bit_POPCNT) - { - cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8); - ptr += size - size % 8; - size = size % 8; - for (i = 0; i < size; i++) - cnt += popcnt64(ptr[i]); - - return cnt; - } - -#endif - - /* pure integer popcount algorithm */ - if (size >= 8) - { - align_8(&ptr, &size, &cnt); - cnt += popcount64_unrolled((const uint64_t*) ptr, size / 8); - ptr += size - size % 8; - size = size % 8; - } - - /* pure integer popcount algorithm */ - for (i = 0; i < size; i++) - cnt += popcount64(ptr[i]); - - return cnt; -} - -#elif defined(__ARM_NEON) || \ - defined(__aarch64__) - -#include - -/* Align memory to 8 bytes boundary */ -static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt) -{ - for (; *size > 0 && (uintptr_t) *p % 8; (*p)++) - { - *cnt += popcnt64(**p); - *size -= 1; - } -} - -static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t) -{ - return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t))); -} - -/* - * Count the number of 1 bits in the data array - * @data: An array - * @size: Size of data in bytes - */ -static inline uint64_t popcnt(const void* data, uint64_t size) -{ - uint64_t cnt = 0; - uint64_t chunk_size = 64; - const uint8_t* ptr = (const uint8_t*) data; - - if (size >= chunk_size) - { - uint64_t i = 0; - uint64_t iters = size / chunk_size; - uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0)); - uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0)); - - do - { - uint8x16_t t0 = zero; - uint8x16_t t1 = zero; - uint8x16_t t2 = zero; - uint8x16_t t3 = zero; - - /* - * After every 31 iterations we need 
to add the - * temporary sums (t0, t1, t2, t3) to the total sum. - * We must ensure that the temporary sums <= 255 - * and 31 * 8 bits = 248 which is OK. - */ - uint64_t limit = (i + 31 < iters) ? i + 31 : iters; - - /* Each iteration processes 64 bytes */ - for (; i < limit; i++) - { - uint8x16x4_t input = vld4q_u8(ptr); - ptr += chunk_size; - - t0 = vaddq_u8(t0, vcntq_u8(input.val[0])); - t1 = vaddq_u8(t1, vcntq_u8(input.val[1])); - t2 = vaddq_u8(t2, vcntq_u8(input.val[2])); - t3 = vaddq_u8(t3, vcntq_u8(input.val[3])); - } - - sum = vpadalq(sum, t0); - sum = vpadalq(sum, t1); - sum = vpadalq(sum, t2); - sum = vpadalq(sum, t3); - } - while (i < iters); - - uint64_t tmp[2]; - vst1q_u64(tmp, sum); - cnt += tmp[0]; - cnt += tmp[1]; - } - - size %= chunk_size; - align_8(&ptr, &size, &cnt); - const uint64_t* ptr64 = (const uint64_t*) ptr; - uint64_t iters = size / 8; - - for (uint64_t i = 0; i < iters; i++) - cnt += popcnt64(ptr64[i]); - - ptr += size - size % 8; - size = size % 8; - - for (uint64_t i = 0; i < size; i++) - cnt += popcnt64(ptr[i]); - - return cnt; -} - -/* all other CPUs */ -#else - -/* Align memory to 8 bytes boundary */ -static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt) -{ - for (; *size > 0 && (uintptr_t) *p % 8; (*p)++) - { - *cnt += popcnt64(**p); - *size -= 1; - } -} - -/* - * Count the number of 1 bits in the data array - * @data: An array - * @size: Size of data in bytes - */ -static inline uint64_t popcnt(const void* data, uint64_t size) -{ - const uint8_t* ptr = (const uint8_t*) data; - uint64_t cnt = 0; - uint64_t i; - - align_8(&ptr, &size, &cnt); - cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8); - ptr += size - size % 8; - size = size % 8; - for (i = 0; i < size; i++) - cnt += popcnt64(ptr[i]); - - return cnt; -} - -#endif - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* LIBPOPCNT_H */ +/* + * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit + * population count) in an array as quickly as possible using + * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON. + * + * Copyright (c) 2016 - 2020, Kim Walisch + * Copyright (c) 2016 - 2018, Wojciech Muła + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef LIBPOPCNT_H +#define LIBPOPCNT_H + +#include +#include + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif + +#ifdef __GNUC__ + #define GNUC_PREREQ(x, y) \ + (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y)) +#else + #define GNUC_PREREQ(x, y) 0 +#endif + +#ifdef __clang__ + #define CLANG_PREREQ(x, y) \ + (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y)) +#else + #define CLANG_PREREQ(x, y) 0 +#endif + +#if (_MSC_VER < 1900) && \ + !defined(__cplusplus) + #define inline __inline +#endif + +#if (defined(__i386__) || \ + defined(__x86_64__) || \ + defined(_M_IX86) || \ + defined(_M_X64)) + #define X86_OR_X64 +#endif + +#if GNUC_PREREQ(4, 2) || \ + __has_builtin(__builtin_popcount) + #define HAVE_BUILTIN_POPCOUNT +#endif + +#if GNUC_PREREQ(4, 2) || \ + CLANG_PREREQ(3, 0) + #define HAVE_ASM_POPCNT +#endif + +#if defined(X86_OR_X64) && \ + (defined(HAVE_ASM_POPCNT) || \ + defined(_MSC_VER)) + #define HAVE_POPCNT +#endif + +#if defined(X86_OR_X64) && \ + GNUC_PREREQ(4, 9) + #define HAVE_AVX2 +#endif + +#if defined(X86_OR_X64) && \ + GNUC_PREREQ(5, 0) + #define HAVE_AVX512 +#endif + +#if defined(X86_OR_X64) + /* MSVC compatible compilers (Windows) */ + #if defined(_MSC_VER) + /* clang-cl (LLVM 10 from 2020) requires /arch:AVX2 or + * /arch:AVX512 to enable vector instructions */ + #if defined(__clang__) + #if defined(__AVX2__) + #define HAVE_AVX2 + #endif + #if defined(__AVX512__) + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif + /* MSVC 2017 or later does not require + * /arch:AVX2 or /arch:AVX512 */ + #elif _MSC_VER >= 1910 + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif + /* Clang (Unix-like OSes) */ + #elif CLANG_PREREQ(3, 8) && \ + __has_attribute(target) && \ + (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) + #define HAVE_AVX2 + #define HAVE_AVX512 + #endif +#endif + +/* + * Only enable CPUID runtime checks if this is really + * needed. E.g. do not enable if user has compiled + * using -march=native on a CPU that supports AVX512. + */ +#if defined(X86_OR_X64) && \ + (defined(__cplusplus) || \ + defined(_MSC_VER) || \ + (GNUC_PREREQ(4, 2) || \ + __has_builtin(__sync_val_compare_and_swap))) && \ + ((defined(HAVE_AVX512) && !(defined(__AVX512__) || defined(__AVX512BW__))) || \ + (defined(HAVE_AVX2) && !defined(__AVX2__)) || \ + (defined(HAVE_POPCNT) && !defined(__POPCNT__))) + #define HAVE_CPUID +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This uses fewer arithmetic operations than any other known + * implementation on machines with fast multiplication. + * It uses 12 arithmetic operations, one of which is a multiply. 
+ * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation + */ +static inline uint64_t popcount64(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ll; + uint64_t m2 = 0x3333333333333333ll; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; + uint64_t h01 = 0x0101010101010101ll; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; +} + +#if defined(HAVE_ASM_POPCNT) && \ + defined(__x86_64__) + +static inline uint64_t popcnt64(uint64_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +#elif defined(HAVE_ASM_POPCNT) && \ + defined(__i386__) + +static inline uint32_t popcnt32(uint32_t x) +{ + __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x)); + return x; +} + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcnt32((uint32_t) x) + + popcnt32((uint32_t)(x >> 32)); +} + +#elif defined(_MSC_VER) && \ + defined(_M_X64) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u64(x); +} + +#elif defined(_MSC_VER) && \ + defined(_M_IX86) + +#include + +static inline uint64_t popcnt64(uint64_t x) +{ + return _mm_popcnt_u32((uint32_t) x) + + _mm_popcnt_u32((uint32_t)(x >> 32)); +} + +/* non x86 CPUs */ +#elif defined(HAVE_BUILTIN_POPCOUNT) + +static inline uint64_t popcnt64(uint64_t x) +{ + return __builtin_popcountll(x); +} + +/* no hardware POPCNT, + * use pure integer algorithm */ +#else + +static inline uint64_t popcnt64(uint64_t x) +{ + return popcount64(x); +} + +#endif + +#if defined(HAVE_CPUID) + +#if defined(_MSC_VER) + #include + #include +#endif + +/* %ecx bit flags */ +#define bit_POPCNT (1 << 23) + +/* %ebx bit flags */ +#define bit_AVX2 (1 << 5) +#define bit_AVX512 (1 << 30) + +/* xgetbv bit flags */ +#define XSTATE_SSE (1 << 1) +#define XSTATE_YMM (1 << 2) +#define XSTATE_ZMM (7 << 5) + +static inline void run_cpuid(int eax, int ecx, int* abcd) +{ +#if defined(_MSC_VER) + __cpuidex(abcd, eax, ecx); +#else + int ebx = 0; + int edx = 0; + + #if defined(__i386__) && \ + defined(__PIC__) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + __asm__ ("movl %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "=D" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #else + __asm__ ("cpuid;" + : "+b" (ebx), + "+a" (eax), + "+c" (ecx), + "=d" (edx)); + #endif + + abcd[0] = eax; + abcd[1] = ebx; + abcd[2] = ecx; + abcd[3] = edx; +#endif +} + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + +static inline int get_xcr0() +{ + int xcr0; + +#if defined(_MSC_VER) + xcr0 = (int) _xgetbv(0); +#else + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); +#endif + + return xcr0; +} + +#endif + +static inline int get_cpuid() +{ + int flags = 0; + int abcd[4]; + + run_cpuid(1, 0, abcd); + + if ((abcd[2] & bit_POPCNT) == bit_POPCNT) + flags |= bit_POPCNT; + +#if defined(HAVE_AVX2) || \ + defined(HAVE_AVX512) + + int osxsave_mask = (1 << 27); + + /* ensure OS supports extended processor state management */ + if ((abcd[2] & osxsave_mask) != osxsave_mask) + return 0; + + int ymm_mask = XSTATE_SSE | XSTATE_YMM; + int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; + + int xcr0 = get_xcr0(); + + if ((xcr0 & ymm_mask) == ymm_mask) + { + run_cpuid(7, 0, abcd); + + if ((abcd[1] & bit_AVX2) == bit_AVX2) + flags |= bit_AVX2; + + if ((xcr0 & zmm_mask) == zmm_mask) + { + if ((abcd[1] & bit_AVX512) == bit_AVX512) + flags |= bit_AVX512; + } + } + +#endif + + return flags; +} + +#endif /* cpuid */ + +#if defined(HAVE_AVX2) + +#include + +#if !defined(_MSC_VER) + __attribute__ ((target 
("avx2"))) +#endif +static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) +{ + __m256i u = _mm256_xor_si256(a, b); + *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); + *l = _mm256_xor_si256(u, c); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline __m256i popcnt256(__m256i v) +{ + __m256i lookup1 = _mm256_setr_epi8( + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8 + ); + + __m256i lookup2 = _mm256_setr_epi8( + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0, + 4, 3, 3, 2, 3, 2, 2, 1, + 3, 2, 2, 1, 2, 1, 1, 0 + ); + + __m256i low_mask = _mm256_set1_epi8(0x0f); + __m256i lo = _mm256_and_si256(v, low_mask); + __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); + __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo); + __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi); + + return _mm256_sad_epu8(popcnt1, popcnt2); +} + +/* + * AVX2 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). + * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx2"))) +#endif +static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size) +{ + __m256i cnt = _mm256_setzero_si256(); + __m256i ones = _mm256_setzero_si256(); + __m256i twos = _mm256_setzero_si256(); + __m256i fours = _mm256_setzero_si256(); + __m256i eights = _mm256_setzero_si256(); + __m256i sixteens = _mm256_setzero_si256(); + __m256i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 0), _mm256_loadu_si256(ptr + i + 1)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 2), _mm256_loadu_si256(ptr + i + 3)); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 4), _mm256_loadu_si256(ptr + i + 5)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 6), _mm256_loadu_si256(ptr + i + 7)); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsA, &fours, fours, foursA, foursB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 8), _mm256_loadu_si256(ptr + i + 9)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 10), _mm256_loadu_si256(ptr + i + 11)); + CSA256(&foursA, &twos, twos, twosA, twosB); + CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 12), _mm256_loadu_si256(ptr + i + 13)); + CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 14), _mm256_loadu_si256(ptr + i + 15)); + CSA256(&foursB, &twos, twos, twosA, twosB); + CSA256(&eightsB, &fours, fours, foursA, foursB); + CSA256(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm256_add_epi64(cnt, popcnt256(sixteens)); + } + + cnt = _mm256_slli_epi64(cnt, 4); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3)); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2)); + cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1)); + cnt = _mm256_add_epi64(cnt, popcnt256(ones)); + + for(; i < size; i++) + cnt = _mm256_add_epi64(cnt, popcnt256(_mm256_loadu_si256(ptr + i))); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3]; +} + +#endif + +#if defined(HAVE_AVX512) + +#include + +#if !defined(_MSC_VER) + 
__attribute__ ((target ("avx512bw"))) +#endif +static inline __m512i popcnt512(__m512i v) +{ + __m512i m1 = _mm512_set1_epi8(0x55); + __m512i m2 = _mm512_set1_epi8(0x33); + __m512i m4 = _mm512_set1_epi8(0x0F); + __m512i vm = _mm512_and_si512(_mm512_srli_epi16(v, 1), m1); + __m512i t1 = _mm512_sub_epi8(v, vm); + __m512i tm = _mm512_and_si512(t1, m2); + __m512i tm2 = _mm512_and_si512(_mm512_srli_epi16(t1, 2), m2); + __m512i t2 = _mm512_add_epi8(tm, tm2); + __m512i tt = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)); + __m512i t3 = _mm512_and_si512(tt, m4); + + return _mm512_sad_epu8(t3, _mm512_setzero_si512()); +} + +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline void CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) +{ + *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); + *h = _mm512_ternarylogic_epi32(c, b, a, 0xe8); +} + +/* + * AVX512 Harley-Seal popcount (4th iteration). + * The algorithm is based on the paper "Faster Population Counts + * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and + * Wojciech Mula (23 Nov 2016). + * @see https://arxiv.org/abs/1611.07612 + */ +#if !defined(_MSC_VER) + __attribute__ ((target ("avx512bw"))) +#endif +static inline uint64_t popcnt_avx512(const __m512i* ptr, const uint64_t size) +{ + __m512i cnt = _mm512_setzero_si512(); + __m512i ones = _mm512_setzero_si512(); + __m512i twos = _mm512_setzero_si512(); + __m512i fours = _mm512_setzero_si512(); + __m512i eights = _mm512_setzero_si512(); + __m512i sixteens = _mm512_setzero_si512(); + __m512i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t* cnt64; + + for(; i < limit; i += 16) + { + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 0), _mm512_loadu_si512(ptr + i + 1)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 2), _mm512_loadu_si512(ptr + i + 3)); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 4), _mm512_loadu_si512(ptr + i + 5)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 6), _mm512_loadu_si512(ptr + i + 7)); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsA, &fours, fours, foursA, foursB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 8), _mm512_loadu_si512(ptr + i + 9)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 10), _mm512_loadu_si512(ptr + i + 11)); + CSA512(&foursA, &twos, twos, twosA, twosB); + CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 12), _mm512_loadu_si512(ptr + i + 13)); + CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 14), _mm512_loadu_si512(ptr + i + 15)); + CSA512(&foursB, &twos, twos, twosA, twosB); + CSA512(&eightsB, &fours, fours, foursA, foursB); + CSA512(&sixteens, &eights, eights, eightsA, eightsB); + + cnt = _mm512_add_epi64(cnt, popcnt512(sixteens)); + } + + cnt = _mm512_slli_epi64(cnt, 4); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(eights), 3)); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(fours), 2)); + cnt = _mm512_add_epi64(cnt, _mm512_slli_epi64(popcnt512(twos), 1)); + cnt = _mm512_add_epi64(cnt, popcnt512(ones)); + + for(; i < size; i++) + cnt = _mm512_add_epi64(cnt, popcnt512(_mm512_loadu_si512(ptr + i))); + + cnt64 = (uint64_t*) &cnt; + + return cnt64[0] + + cnt64[1] + + cnt64[2] + + cnt64[3] + + cnt64[4] + + cnt64[5] + + cnt64[6] + + cnt64[7]; +} + +#endif + +/* x86 CPUs */ +#if defined(X86_OR_X64) + +/* + * Count the number of 1 bits in the data array + * 
@data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + const uint8_t* ptr = (const uint8_t*) data; + +/* + * CPUID runtime checks are only enabled if this is needed. + * E.g. CPUID is disabled when a user compiles his + * code using -march=native on a CPU with AVX512. + */ +#if defined(HAVE_CPUID) + #if defined(__cplusplus) + /* C++11 thread-safe singleton */ + static const int cpuid = get_cpuid(); + #else + static int cpuid_ = -1; + int cpuid = cpuid_; + if (cpuid == -1) + { + cpuid = get_cpuid(); + + #if defined(_MSC_VER) + _InterlockedCompareExchange(&cpuid_, cpuid, -1); + #else + __sync_val_compare_and_swap(&cpuid_, -1, cpuid); + #endif + } + #endif +#endif + +#if defined(HAVE_AVX512) + #if defined(__AVX512__) || defined(__AVX512BW__) + /* AVX512 requires arrays >= 1024 bytes */ + if (i + 1024 <= size) + #else + if ((cpuid & bit_AVX512) && + i + 1024 <= size) + #endif + { + const __m512i* ptr512 = (const __m512i*)(ptr + i); + cnt += popcnt_avx512(ptr512, (size - i) / 64); + i = size - size % 64; + } +#endif + +#if defined(HAVE_AVX2) + #if defined(__AVX2__) + /* AVX2 requires arrays >= 512 bytes */ + if (i + 512 <= size) + #else + if ((cpuid & bit_AVX2) && + i + 512 <= size) + #endif + { + const __m256i* ptr256 = (const __m256i*)(ptr + i); + cnt += popcnt_avx2(ptr256, (size - i) / 32); + i = size - size % 32; + } +#endif + +#if defined(HAVE_POPCNT) + /* + * The user has compiled without -mpopcnt. + * Unfortunately the MSVC compiler does not have + * a POPCNT macro so we cannot get rid of the + * runtime check for MSVC. + */ + #if !defined(__POPCNT__) + if (cpuid & bit_POPCNT) + #endif + { + /* We use unaligned memory accesses here to improve performance */ + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + for (; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; + } +#endif + +#if !defined(HAVE_POPCNT) || \ + !defined(__POPCNT__) + /* + * Pure integer popcount algorithm. + * We use unaligned memory accesses here to improve performance. + */ + for (; i < size - size % 8; i += 8) + cnt += popcount64(*(const uint64_t*)(ptr + i)); + + if (i < size) + { + uint64_t val = 0; + size_t bytes = (size_t)(size - i); + memcpy(&val, &ptr[i], bytes); + cnt += popcount64(val); + } + + return cnt; +#endif +} + +#elif defined(__ARM_NEON) || \ + defined(__aarch64__) + +#include + +static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t) +{ + return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t))); +} + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + uint64_t chunk_size = 64; + const uint8_t* ptr = (const uint8_t*) data; + + if (size >= chunk_size) + { + uint64_t iters = size / chunk_size; + uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0)); + uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0)); + + do + { + uint8x16_t t0 = zero; + uint8x16_t t1 = zero; + uint8x16_t t2 = zero; + uint8x16_t t3 = zero; + + /* + * After every 31 iterations we need to add the + * temporary sums (t0, t1, t2, t3) to the total sum. + * We must ensure that the temporary sums <= 255 + * and 31 * 8 bits = 248 which is OK. + */ + uint64_t limit = (i + 31 < iters) ? 
i + 31 : iters; + + /* Each iteration processes 64 bytes */ + for (; i < limit; i++) + { + uint8x16x4_t input = vld4q_u8(ptr); + ptr += chunk_size; + + t0 = vaddq_u8(t0, vcntq_u8(input.val[0])); + t1 = vaddq_u8(t1, vcntq_u8(input.val[1])); + t2 = vaddq_u8(t2, vcntq_u8(input.val[2])); + t3 = vaddq_u8(t3, vcntq_u8(input.val[3])); + } + + sum = vpadalq(sum, t0); + sum = vpadalq(sum, t1); + sum = vpadalq(sum, t2); + sum = vpadalq(sum, t3); + } + while (i < iters); + + i = 0; + size %= chunk_size; + + uint64_t tmp[2]; + vst1q_u64(tmp, sum); + cnt += tmp[0]; + cnt += tmp[1]; + } + +#if defined(__ARM_FEATURE_UNALIGNED) + /* We use unaligned memory accesses here to improve performance */ + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); +#else + if (i + 8 <= size) + { + /* Align memory to an 8 byte boundary */ + for (; (uintptr_t)(ptr + i) % 8; i++) + cnt += popcnt64(ptr[i]); + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + } +#endif + + if (i < size) + { + uint64_t val = 0; + size_t bytes = (size_t)(size - i); + memcpy(&val, &ptr[i], bytes); + cnt += popcount64(val); + } + + return cnt; +} + +/* all other CPUs */ +#else + +/* + * Count the number of 1 bits in the data array + * @data: An array + * @size: Size of data in bytes + */ +static inline uint64_t popcnt(const void* data, uint64_t size) +{ + uint64_t i = 0; + uint64_t cnt = 0; + const uint8_t* ptr = (const uint8_t*) data; + + if (size >= 8) + { + /* + * Since we don't know whether this CPU architecture + * supports unaligned memory accesses we align + * memory to an 8 byte boundary. + */ + for (; (uintptr_t)(ptr + i) % 8; i++) + cnt += popcnt64(ptr[i]); + for (; i < size - size % 8; i += 8) + cnt += popcnt64(*(const uint64_t*)(ptr + i)); + } + + for (; i < size; i++) + cnt += popcnt64(ptr[i]); + + return cnt; +} + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LIBPOPCNT_H */
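
For quick validation of the updated header, a minimal usage sketch of the popcnt() API it exposes. This test program is hypothetical and not part of the vendored sources; build with e.g. cc -O2 example.c.

/* example.c - hypothetical smoke test, not part of the vendored sources. */
#include "libpopcnt.h"
#include <stdio.h>

int main(void)
{
  uint8_t data[256];
  for (int i = 0; i < 256; i++)
    data[i] = (uint8_t) i;                 /* bytes 0x00 .. 0xFF */

  /* popcnt(buf, n) counts the 1 bits in the first n bytes of buf and
   * dispatches to AVX512/AVX2/POPCNT/NEON kernels when available. */
  uint64_t bits = popcnt(data, sizeof(data));
  printf("1 bits: %llu (expected 1024)\n", (unsigned long long) bits);
  return 0;
}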
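
The portable fallback in the header is the classic 12-operation SWAR popcount. A standalone sketch of the same bit-trick, cross-checked against the compiler builtin (assumes a GCC/Clang-compatible compiler):

/* swar_check.c - standalone sketch of the SWAR fallback technique. */
#include <stdint.h>
#include <assert.h>

static uint64_t swar_popcount64(uint64_t x)
{
  /* Pairwise sums, then 4-bit sums; the multiply by 0x0101..01
   * accumulates all byte counts into the most significant byte. */
  x -= (x >> 1) & 0x5555555555555555ull;
  x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull);
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full;
  return (x * 0x0101010101010101ull) >> 56;
}

int main(void)
{
  uint64_t x = 0x123456789ABCDEF0ull;
  for (int i = 0; i < 1000; i++)
  {
    assert(swar_popcount64(x) == (uint64_t) __builtin_popcountll(x));
    x = x * 6364136223846793005ull + 1442695040888963407ull;  /* LCG step */
  }
  return 0;
}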
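
The AVX2 and AVX512 kernels are Harley-Seal reductions built from carry-save adders (CSA), the construction described in the Lemire/Kurz/Mula paper referenced in the header. The same idea in plain 64-bit scalar code, as an illustrative sketch only; the real kernels work on 256/512-bit lanes and unroll 16 vectors per iteration.

/* harley_seal_sketch.c - scalar illustration of the CSA reduction. */
#include <assert.h>
#include <stdint.h>

/* Carry-save adder: per bit position, h:l is the 2-bit sum of a, b, c. */
static void csa64(uint64_t* h, uint64_t* l, uint64_t a, uint64_t b, uint64_t c)
{
  uint64_t u = a ^ b;
  *h = (a & b) | (u & c);   /* majority(a, b, c)  -> carry, weight 2 */
  *l = u ^ c;               /* a ^ b ^ c          -> sum,   weight 1 */
}

static uint64_t word_popcount(uint64_t x)
{
  x -= (x >> 1) & 0x5555555555555555ull;
  x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull);
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full;
  return (x * 0x0101010101010101ull) >> 56;
}

/* Combine 4 words per iteration so that only the weight-4 carries need
 * a full popcount inside the loop; n must be a multiple of 4 here. */
static uint64_t harley_seal4(const uint64_t* data, uint64_t n)
{
  uint64_t fours_total = 0, ones = 0, twos = 0;
  uint64_t twosA, twosB, fours;

  for (uint64_t i = 0; i < n; i += 4)
  {
    csa64(&twosA, &ones, ones, data[i + 0], data[i + 1]);
    csa64(&twosB, &ones, ones, data[i + 2], data[i + 3]);
    csa64(&fours, &twos, twos, twosA, twosB);
    fours_total += word_popcount(fours);
  }

  return 4 * fours_total + 2 * word_popcount(twos) + word_popcount(ones);
}

int main(void)
{
  uint64_t data[8];
  uint64_t expected = 0;
  for (int i = 0; i < 8; i++)
  {
    data[i] = 0x0123456789ABCDEFull * (uint64_t)(i + 1);
    expected += word_popcount(data[i]);
  }
  assert(harley_seal4(data, 8) == expected);
  return 0;
}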
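
The updated popcnt() finishes the last 1-7 bytes by memcpy-ing them into a zeroed uint64_t rather than looping byte by byte; the copy is well defined for any alignment and the zero padding adds no 1 bits. A minimal sketch of that tail-handling pattern (the helper name is illustrative; __builtin_popcountll assumes GCC/Clang):

/* tail_sketch.c - tail handling pattern used by the updated popcnt(). */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Count the 1 bits in bytes [i, size) of ptr, where size - i < 8,
 * without reading past the end of the buffer. */
static uint64_t popcount_tail(const uint8_t* ptr, uint64_t i, uint64_t size)
{
  uint64_t val = 0;
  if (i < size)
    memcpy(&val, &ptr[i], (size_t)(size - i));     /* at most 7 bytes */
  return (uint64_t) __builtin_popcountll(val);
}

int main(void)
{
  uint8_t buf[11] = { 0, 1, 2, 3, 4, 5, 6, 7, 0x0F, 0xF0, 0xAA };
  /* Pretend the first 8 bytes were already counted word-wise. */
  printf("tail bits: %llu (expected 12)\n",
         (unsigned long long) popcount_tail(buf, 8, sizeof(buf)));
  return 0;
}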
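
For plain C builds the header caches the CPUID feature probe with a compare-and-swap (C++ builds use a C++11 static local instead). A self-contained sketch of that caching pattern, with the real CPUID/XGETBV probe replaced by a stub (assumes GCC/Clang __sync builtins):

/* cpuid_cache_sketch.c - lazy feature-flag cache, probe stubbed out. */
#include <stdio.h>

#define bit_POPCNT (1 << 23)

/* Stub standing in for the header's get_cpuid() probe. */
static int probe_cpu_features(void)
{
  return bit_POPCNT;
}

static int cached_cpu_features(void)
{
  static int cache = -1;    /* -1 means "not probed yet" */
  int flags = cache;
  if (flags == -1)
  {
    flags = probe_cpu_features();
    /* Concurrent callers all compute the same value, so one CAS from
     * -1 publishes it; losers simply keep their identical local copy. */
    __sync_val_compare_and_swap(&cache, -1, flags);
  }
  return flags;
}

int main(void)
{
  printf("POPCNT supported: %d\n", (cached_cpu_features() & bit_POPCNT) != 0);
  return 0;
}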