mirror of
https://github.com/gosticks/prefix-filter.git
synced 2025-10-16 11:55:40 +00:00
454 lines
15 KiB
C++
454 lines
15 KiB
C++
/* Taken from https://github.com/FastFilter/fastfilter_cpp */
|
|
#ifndef BLOOM_FILTER_BLOOM_FILTER_H_
|
|
#define BLOOM_FILTER_BLOOM_FILTER_H_
|
|
|
|
#include <algorithm>
|
|
#include <assert.h>
|
|
#include <sstream>
|
|
|
|
#include "../hashutil.h"
|
|
|
|
using namespace std;
|
|
using namespace hashing;
|
|
|
|
namespace bloomfilter {
|
|
// status returned by a Bloom filter operation
|
|
enum Status {
|
|
Ok = 0,
|
|
NotFound = 1,
|
|
NotEnoughSpace = 2,
|
|
NotSupported = 3,
|
|
};
|
|
|
|
inline uint32_t reduce(uint32_t hash, uint32_t n) {
|
|
// http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
|
|
return (uint32_t) (((uint64_t) hash * n) >> 32);
|
|
}
|
|
|
|
/**
|
|
* Given a value "word", produces an integer in [0,p) without division.
|
|
* The function is as fair as possible in the sense that if you iterate
|
|
* through all possible values of "word", then you will generate all
|
|
* possible outputs as uniformly as possible.
|
|
*/
|
|
static inline uint32_t fastrange32(uint32_t word, uint32_t p) {
|
|
// http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
|
|
return (uint32_t) (((uint64_t) word * (uint64_t) p) >> 32);
|
|
}
|
|
|
|
|
|
/**
|
|
* Given a value "word", produces an integer in [0,p) without division.
|
|
* The function is as fair as possible in the sense that if you iterate
|
|
* through all possible values of "word", then you will generate all
|
|
* possible outputs as uniformly as possible.
|
|
*/
|
|
static inline uint64_t fastrange64(uint64_t word, uint64_t p) {
|
|
// http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
|
|
// #ifdef __SIZEOF_INT128__// then we know we have a 128-bit int
|
|
return (uint64_t) (((__uint128_t) word * (__uint128_t) p) >> 64);
|
|
}
|
|
|
|
|
|
#ifndef UINT32_MAX
|
|
#define UINT32_MAX (0xffffffff)
|
|
#endif// UINT32_MAX
|
|
|
|
/**
|
|
* Given a value "word", produces an integer in [0,p) without division.
|
|
* The function is as fair as possible in the sense that if you iterate
|
|
* through all possible values of "word", then you will generate all
|
|
* possible outputs as uniformly as possible.
|
|
*/
|
|
static inline size_t fastrangesize(uint64_t word, size_t p) {
|
|
#if (SIZE_MAX == UINT32_MAX)
|
|
return (size_t) fastrange32(word, p);
|
|
#else // assume 64-bit
|
|
return (size_t) fastrange64(word, p);
|
|
#endif// SIZE_MAX == UINT32_MAX
|
|
}
|
|
|
|
static inline size_t getBestK(size_t bitsPerItem) {
|
|
return max(1, (int) round((double) bitsPerItem * log(2)));
|
|
}
|
|
|
|
inline uint64_t getBit(uint32_t index) { return 1L << (index & 63); }
|
|
|
|
template<typename ItemType, size_t bits_per_item, bool branchless,
|
|
typename HashFamily = TwoIndependentMultiplyShift,
|
|
int k = (int) ((double) bits_per_item * 0.693147180559945 + 0.5)>
|
|
class BloomFilter {
|
|
public:
|
|
uint64_t *data;
|
|
size_t size;
|
|
size_t arrayLength;
|
|
size_t bitCount;
|
|
int kk;
|
|
HashFamily hasher;
|
|
|
|
double BitsPerItem() const { return k; }
|
|
|
|
explicit BloomFilter(const size_t n) : hasher() {
|
|
this->size = 0;
|
|
this->kk = getBestK(bits_per_item);
|
|
this->bitCount = n * bits_per_item;
|
|
this->arrayLength = (bitCount + 63) / 64;
|
|
data = new uint64_t[arrayLength];
|
|
std::fill_n(data, arrayLength, 0);
|
|
}
|
|
|
|
~BloomFilter() { delete[] data; }
|
|
|
|
// Add an item to the filter.
|
|
Status Add(const ItemType &item);
|
|
|
|
// Add multiple items to the filter.
|
|
Status AddAll(const vector<ItemType> &data, const size_t start,
|
|
const size_t end) {
|
|
return AddAll(data.data(), start, end);
|
|
}
|
|
Status AddAll(const ItemType *data, const size_t start,
|
|
const size_t end);
|
|
// Report if the item is inserted, with false positive rate.
|
|
Status Contain(const ItemType &item) const;
|
|
|
|
/* methods for providing stats */
|
|
// summary infomation
|
|
std::string Info() const;
|
|
|
|
// number of current inserted items;
|
|
size_t Size() const { return size; }
|
|
|
|
// size of the filter in bytes.
|
|
size_t SizeInBytes() const { return arrayLength * 8; }
|
|
|
|
std::string get_name() const {
|
|
size_t BPI = bits_per_item;
|
|
size_t hash_k = k;
|
|
std::string name = "Lem-BF-" + std::to_string(BPI) + "[k=" + std::to_string(hash_k) + "]";
|
|
return name;
|
|
// if (branchless) {
|
|
// return name + "-{branchless}";
|
|
// } else {
|
|
// return name + "-{with-branch}";
|
|
// }
|
|
}
|
|
};
|
|
|
|
template<typename ItemType, size_t bits_per_item, bool branchless,
|
|
typename HashFamily, int k>
|
|
Status BloomFilter<ItemType, bits_per_item, branchless, HashFamily, k>::Add(
|
|
const ItemType &key) {
|
|
uint64_t hash = hasher(key);
|
|
uint64_t a = (hash >> 32) | (hash << 32);
|
|
uint64_t b = hash;
|
|
for (int i = 0; i < k; i++) {
|
|
// int index = reduce(a, this->bitCount);
|
|
// data[index >> 6] |= getBit(index);
|
|
// reworked to avoid overflows
|
|
// use the fact that reduce is not very sensitive to lower bits of a
|
|
data[fastrangesize(a, this->arrayLength)] |= getBit(a);
|
|
a += b;
|
|
}
|
|
return Ok;
|
|
}
|
|
|
|
const int blockShift = 15;
|
|
const int blockLen = 1 << blockShift;
|
|
|
|
inline void applyBlock(uint32_t *tmp, int block, int len, uint64_t *data) {
|
|
for (int i = 0; i < len; i++) {
|
|
uint32_t index = tmp[(block << blockShift) + i];
|
|
data[index >> 6] |= getBit(index);
|
|
}
|
|
}
|
|
|
|
template<typename ItemType, size_t bits_per_item, bool branchless,
|
|
typename HashFamily, int k>
|
|
Status BloomFilter<ItemType, bits_per_item, branchless, HashFamily, k>::AddAll(
|
|
const ItemType *keys, const size_t start, const size_t end) {
|
|
// we have that AddAll assumes that arrayLength << 6 is a
|
|
// 32-bit integer
|
|
if (arrayLength > 0x3ffffff) {
|
|
for (size_t i = start; i < end; i++) {
|
|
Add(keys[i]);
|
|
}
|
|
return Ok;
|
|
}
|
|
int blocks = 1 + arrayLength / blockLen;
|
|
uint32_t *tmp = new uint32_t[blocks * blockLen];
|
|
int *tmpLen = new int[blocks]();
|
|
for (size_t i = start; i < end; i++) {
|
|
uint64_t key = keys[i];
|
|
uint64_t hash = hasher(key);
|
|
uint64_t a = (hash >> 32) | (hash << 32);
|
|
uint64_t b = hash;
|
|
for (int j = 0; j < k; j++) {
|
|
int index = fastrangesize(a, this->arrayLength);
|
|
int block = index >> blockShift;
|
|
int len = tmpLen[block];
|
|
tmp[(block << blockShift) + len] = (index << 6) + (a & 63);
|
|
tmpLen[block] = len + 1;
|
|
if (len + 1 == blockLen) {
|
|
applyBlock(tmp, block, len + 1, data);
|
|
tmpLen[block] = 0;
|
|
}
|
|
a += b;
|
|
}
|
|
}
|
|
for (int block = 0; block < blocks; block++) {
|
|
applyBlock(tmp, block, tmpLen[block], data);
|
|
}
|
|
delete[] tmp;
|
|
delete[] tmpLen;
|
|
return Ok;
|
|
}
|
|
|
|
inline char bittest64(const uint64_t *t, uint64_t bit) {
|
|
return (*t & (1L << (bit & 63))) != 0;
|
|
}
|
|
template<typename ItemType, size_t bits_per_item, bool branchless,
|
|
typename HashFamily, int k>
|
|
Status BloomFilter<ItemType, bits_per_item, branchless, HashFamily, k>::Contain(
|
|
const ItemType &key) const {
|
|
uint64_t hash = hasher(key);
|
|
uint64_t a = (hash >> 32) | (hash << 32);
|
|
uint64_t b = hash;
|
|
if (branchless && k >= 3) {
|
|
int b0 = data[fastrangesize(a, this->arrayLength)] >> (a & 63);
|
|
a += b;
|
|
int b1 = data[fastrangesize(a, this->arrayLength)] >> (a & 63);
|
|
a += b;
|
|
int b2 = data[fastrangesize(a, this->arrayLength)] >> (a & 63);
|
|
if ((b0 & b1 & b2 & 1) == 0) {
|
|
return NotFound;
|
|
}
|
|
for (int i = 3; i < k; i++) {
|
|
a += b;
|
|
if (((data[fastrangesize(a, this->arrayLength)] >> (a & 63)) & 1) == 0) {
|
|
return NotFound;
|
|
}
|
|
}
|
|
return Ok;
|
|
}
|
|
for (int i = 0; i < k; i++) {
|
|
if ((data[fastrangesize(a, this->arrayLength)] & getBit(a)) == 0) {
|
|
return NotFound;
|
|
}
|
|
a += b;
|
|
}
|
|
return Ok;
|
|
}
|
|
|
|
template<typename ItemType, size_t bits_per_item, bool branchless,
|
|
typename HashFamily, int k>
|
|
std::string
|
|
BloomFilter<ItemType, bits_per_item, branchless, HashFamily, k>::Info() const {
|
|
std::stringstream ss;
|
|
ss << "BloomFilter Status:\n"
|
|
<< "\t\tKeys stored: " << Size() << "\n";
|
|
if (Size() > 0) {
|
|
ss << "\t\tk: " << BitsPerItem() << "\n";
|
|
} else {
|
|
ss << "\t\tk: N/A\n";
|
|
}
|
|
return ss.str();
|
|
}
|
|
|
|
template<size_t m_up, size_t m_down, size_t k, bool branchless,
|
|
typename HashFamily = TwoIndependentMultiplyShift>
|
|
class BF_MA {
|
|
public:
|
|
uint64_t *data;
|
|
// const size_t m;
|
|
size_t cap = 0;
|
|
size_t arrayLength;
|
|
size_t bitCount;
|
|
int kk = k;
|
|
HashFamily hasher;
|
|
|
|
double BitsPerItem() const { return k; }
|
|
|
|
size_t get_m(size_t n) {
|
|
size_t res = (n * m_up + m_down - 1) / m_down;
|
|
assert(res > 0);
|
|
assert(res < (1ULL << 60u));
|
|
return res;
|
|
}
|
|
|
|
explicit BF_MA(const size_t n) : bitCount(get_m(n)),
|
|
arrayLength((get_m(n) + 63) / 64), hasher() {
|
|
//
|
|
// arrayLength((bitCount + 63) / 64),bitCount(n * bits_per_item),kk(k),hasher() {
|
|
// this->cap = 0;
|
|
// this->kk = getBestK(bits_per_item);
|
|
// this->bitCount = n * bits_per_item;
|
|
// bitCount(n * bits_per_item) this->arrayLength = (bitCount + 63) / 64;
|
|
data = new uint64_t[arrayLength];
|
|
std::fill_n(data, arrayLength, 0);
|
|
}
|
|
|
|
~BF_MA() { delete[] data; }
|
|
|
|
// Add an item to the filter.
|
|
Status Add(const uint64_t &item);
|
|
|
|
|
|
// Report if the item is inserted, with false positive rate.
|
|
Status Contain(const uint64_t &item) const;
|
|
|
|
Status contain_branchless(const uint64_t &item) const;
|
|
|
|
/* methods for providing stats */
|
|
// summary infomation
|
|
std::string Info() const;
|
|
|
|
// number of current inserted items;
|
|
size_t get_cap() const { return cap; }
|
|
|
|
// size of the filter in bytes.
|
|
size_t SizeInBytes() const { return arrayLength * 8; }
|
|
|
|
std::string get_name() const {
|
|
float BPI = (100 * m_up / m_down) / 100.0;
|
|
// double BPI = BPI_full * 100
|
|
|
|
size_t hash_k = k;
|
|
std::string name = "BF-MA-[bpi=10,k=" + std::to_string(k) + "]";
|
|
// std::string name = "BF-MA-" + std::to_string(BPI) + "[k=" + std::to_string(k) + "]";
|
|
|
|
if (branchless) {
|
|
return name + "-{branchless}";
|
|
} else {
|
|
return name + "-{with-branch}";
|
|
}
|
|
}
|
|
};
|
|
|
|
template<size_t m_up, size_t m_down, size_t k, bool branchless, typename HashFamily>
|
|
Status BF_MA<m_up, m_down, k, branchless, HashFamily>::Add(const uint64_t &key) {
|
|
cap++;
|
|
uint64_t hash = hasher(key);
|
|
uint64_t a = (hash >> 32) | (hash << 32);
|
|
uint64_t b = hash;
|
|
for (int i = 0; i < k; i++) {
|
|
// int index = reduce(a, this->bitCount);
|
|
// data[index >> 6] |= getBit(index);
|
|
// reworked to avoid overflows
|
|
// use the fact that reduce is not very sensitive to lower bits of a
|
|
data[fastrangesize(a, this->arrayLength)] |= getBit(a);
|
|
a += b;
|
|
}
|
|
return Ok;
|
|
}
|
|
template<size_t m_up, size_t m_down, size_t k, bool branchless, typename HashFamily>
|
|
Status BF_MA<m_up, m_down, k, branchless, HashFamily>::Contain(
|
|
const uint64_t &key) const {
|
|
uint64_t hash = hasher(key);
|
|
uint64_t a = (hash >> 32) | (hash << 32);
|
|
uint64_t b = hash;
|
|
|
|
for (int i = 0; i < k; i++) {
|
|
if ((data[fastrangesize(a, this->arrayLength)] & getBit(a)) == 0) {
|
|
return NotFound;
|
|
}
|
|
a += b;
|
|
}
|
|
return Ok;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ((k == 2) and branchless) {
|
|
int b0 = data[fastrangesize(a, this->arrayLength)] >> (a & 63);
|
|
a += b;
|
|
int b1 = data[fastrangesize(a, this->arrayLength)] >> (a & 63);
|
|
if ((b0 & b1 & 1) == 0)
|
|
return NotFound;
|
|
return Ok;
|
|
} else if (k == 2) {
|
|
if (!(data[fastrangesize(a, this->arrayLength)] >> (a & 63)))
|
|
return NotFound;
|
|
|
|
a += b;
|
|
if (data[fastrangesize(a, this->arrayLength)] >> (a & 63)) {
|
|
return Ok;
|
|
}
|
|
return NotFound;
|
|
}
|
|
|
|
if (branchless && k >= 3) {
|
|
int b0 = data[fastrangesize(a, this->arrayLength)] >> (a & 63);
|
|
a += b;
|
|
int b1 = data[fastrangesize(a, this->arrayLength)] >> (a & 63);
|
|
a += b;
|
|
int b2 = data[fastrangesize(a, this->arrayLength)] >> (a & 63);
|
|
if ((b0 & b1 & b2 & 1) == 0) {
|
|
return NotFound;
|
|
}
|
|
for (int i = 3; i < k; i++) {
|
|
a += b;
|
|
if (((data[fastrangesize(a, this->arrayLength)] >> (a & 63)) & 1) == 0) {
|
|
return NotFound;
|
|
}
|
|
}
|
|
return Ok;
|
|
}
|
|
for (int i = 0; i < k; i++) {
|
|
if ((data[fastrangesize(a, this->arrayLength)] & getBit(a)) == 0) {
|
|
return NotFound;
|
|
}
|
|
a += b;
|
|
}
|
|
return Ok;
|
|
}
|
|
|
|
/* template<size_t m_up, size_t m_down, size_t k, bool branchless, typename HashFamily>
|
|
Status BF_MA<m_up, m_down, k, branchless, HashFamily>::contain_branchless(
|
|
const uint64_t &key) const {
|
|
uint64_t hash = hasher(key);
|
|
uint64_t a = (hash >> 32) | (hash << 32);
|
|
uint64_t b = hash;
|
|
size_t locations[k] = {0};
|
|
for (size_t i = 0; i < k; i++)
|
|
{
|
|
locations[i] = fastrangesize(a, this->arrayLength);
|
|
a += b;
|
|
}
|
|
|
|
int res = 1u;
|
|
for (size_t i = 0; i < k; i++)
|
|
{
|
|
res &= data[locations[i]] >> ()
|
|
}
|
|
|
|
|
|
}
|
|
|
|
*/
|
|
template<size_t m_up, size_t m_down, size_t k, bool branchless, typename HashFamily>
|
|
std::string
|
|
BF_MA<m_up, m_down, k, branchless, HashFamily>::Info() const {
|
|
std::stringstream ss;
|
|
ss << "BloomFilter Status:\n"
|
|
<< "\t\tKeys stored: " << get_cap() << "\n";
|
|
if (get_cap() > 0) {
|
|
ss << "\t\tk: " << BitsPerItem() << "\n";
|
|
} else {
|
|
ss << "\t\tk: N/A\n";
|
|
}
|
|
return ss.str();
|
|
}
|
|
|
|
|
|
}// namespace bloomfilter
|
|
|
|
#endif// BLOOM_FILTER_BLOOM_FILTER_H_
|