/* Taken from https://github.com/FastFilter/fastfilter_cpp */ #ifndef BLOOM_FILTER_BLOOM_FILTER_H_ #define BLOOM_FILTER_BLOOM_FILTER_H_ #include #include #include #include "../hashutil.h" using namespace std; using namespace hashing; namespace bloomfilter { // status returned by a Bloom filter operation enum Status { Ok = 0, NotFound = 1, NotEnoughSpace = 2, NotSupported = 3, }; inline uint32_t reduce(uint32_t hash, uint32_t n) { // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ return (uint32_t) (((uint64_t) hash * n) >> 32); } /** * Given a value "word", produces an integer in [0,p) without division. * The function is as fair as possible in the sense that if you iterate * through all possible values of "word", then you will generate all * possible outputs as uniformly as possible. */ static inline uint32_t fastrange32(uint32_t word, uint32_t p) { // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ return (uint32_t) (((uint64_t) word * (uint64_t) p) >> 32); } /** * Given a value "word", produces an integer in [0,p) without division. * The function is as fair as possible in the sense that if you iterate * through all possible values of "word", then you will generate all * possible outputs as uniformly as possible. */ static inline uint64_t fastrange64(uint64_t word, uint64_t p) { // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ // #ifdef __SIZEOF_INT128__// then we know we have a 128-bit int return (uint64_t) (((__uint128_t) word * (__uint128_t) p) >> 64); } #ifndef UINT32_MAX #define UINT32_MAX (0xffffffff) #endif// UINT32_MAX /** * Given a value "word", produces an integer in [0,p) without division. * The function is as fair as possible in the sense that if you iterate * through all possible values of "word", then you will generate all * possible outputs as uniformly as possible. */ static inline size_t fastrangesize(uint64_t word, size_t p) { #if (SIZE_MAX == UINT32_MAX) return (size_t) fastrange32(word, p); #else // assume 64-bit return (size_t) fastrange64(word, p); #endif// SIZE_MAX == UINT32_MAX } static inline size_t getBestK(size_t bitsPerItem) { return max(1, (int) round((double) bitsPerItem * log(2))); } inline uint64_t getBit(uint32_t index) { return 1L << (index & 63); } template class BloomFilter { public: uint64_t *data; size_t size; size_t arrayLength; size_t bitCount; int kk; HashFamily hasher; double BitsPerItem() const { return k; } explicit BloomFilter(const size_t n) : hasher() { this->size = 0; this->kk = getBestK(bits_per_item); this->bitCount = n * bits_per_item; this->arrayLength = (bitCount + 63) / 64; data = new uint64_t[arrayLength]; std::fill_n(data, arrayLength, 0); } ~BloomFilter() { delete[] data; } // Add an item to the filter. Status Add(const ItemType &item); // Add multiple items to the filter. Status AddAll(const vector &data, const size_t start, const size_t end) { return AddAll(data.data(), start, end); } Status AddAll(const ItemType *data, const size_t start, const size_t end); // Report if the item is inserted, with false positive rate. Status Contain(const ItemType &item) const; /* methods for providing stats */ // summary infomation std::string Info() const; // number of current inserted items; size_t Size() const { return size; } // size of the filter in bytes. size_t SizeInBytes() const { return arrayLength * 8; } std::string get_name() const { size_t BPI = bits_per_item; size_t hash_k = k; std::string name = "Lem-BF-" + std::to_string(BPI) + "[k=" + std::to_string(hash_k) + "]"; return name; // if (branchless) { // return name + "-{branchless}"; // } else { // return name + "-{with-branch}"; // } } }; template Status BloomFilter::Add( const ItemType &key) { uint64_t hash = hasher(key); uint64_t a = (hash >> 32) | (hash << 32); uint64_t b = hash; for (int i = 0; i < k; i++) { // int index = reduce(a, this->bitCount); // data[index >> 6] |= getBit(index); // reworked to avoid overflows // use the fact that reduce is not very sensitive to lower bits of a data[fastrangesize(a, this->arrayLength)] |= getBit(a); a += b; } return Ok; } const int blockShift = 15; const int blockLen = 1 << blockShift; inline void applyBlock(uint32_t *tmp, int block, int len, uint64_t *data) { for (int i = 0; i < len; i++) { uint32_t index = tmp[(block << blockShift) + i]; data[index >> 6] |= getBit(index); } } template Status BloomFilter::AddAll( const ItemType *keys, const size_t start, const size_t end) { // we have that AddAll assumes that arrayLength << 6 is a // 32-bit integer if (arrayLength > 0x3ffffff) { for (size_t i = start; i < end; i++) { Add(keys[i]); } return Ok; } int blocks = 1 + arrayLength / blockLen; uint32_t *tmp = new uint32_t[blocks * blockLen]; int *tmpLen = new int[blocks](); for (size_t i = start; i < end; i++) { uint64_t key = keys[i]; uint64_t hash = hasher(key); uint64_t a = (hash >> 32) | (hash << 32); uint64_t b = hash; for (int j = 0; j < k; j++) { int index = fastrangesize(a, this->arrayLength); int block = index >> blockShift; int len = tmpLen[block]; tmp[(block << blockShift) + len] = (index << 6) + (a & 63); tmpLen[block] = len + 1; if (len + 1 == blockLen) { applyBlock(tmp, block, len + 1, data); tmpLen[block] = 0; } a += b; } } for (int block = 0; block < blocks; block++) { applyBlock(tmp, block, tmpLen[block], data); } delete[] tmp; delete[] tmpLen; return Ok; } inline char bittest64(const uint64_t *t, uint64_t bit) { return (*t & (1L << (bit & 63))) != 0; } template Status BloomFilter::Contain( const ItemType &key) const { uint64_t hash = hasher(key); uint64_t a = (hash >> 32) | (hash << 32); uint64_t b = hash; if (branchless && k >= 3) { int b0 = data[fastrangesize(a, this->arrayLength)] >> (a & 63); a += b; int b1 = data[fastrangesize(a, this->arrayLength)] >> (a & 63); a += b; int b2 = data[fastrangesize(a, this->arrayLength)] >> (a & 63); if ((b0 & b1 & b2 & 1) == 0) { return NotFound; } for (int i = 3; i < k; i++) { a += b; if (((data[fastrangesize(a, this->arrayLength)] >> (a & 63)) & 1) == 0) { return NotFound; } } return Ok; } for (int i = 0; i < k; i++) { if ((data[fastrangesize(a, this->arrayLength)] & getBit(a)) == 0) { return NotFound; } a += b; } return Ok; } template std::string BloomFilter::Info() const { std::stringstream ss; ss << "BloomFilter Status:\n" << "\t\tKeys stored: " << Size() << "\n"; if (Size() > 0) { ss << "\t\tk: " << BitsPerItem() << "\n"; } else { ss << "\t\tk: N/A\n"; } return ss.str(); } template class BF_MA { public: uint64_t *data; // const size_t m; size_t cap = 0; size_t arrayLength; size_t bitCount; int kk = k; HashFamily hasher; double BitsPerItem() const { return k; } size_t get_m(size_t n) { size_t res = (n * m_up + m_down - 1) / m_down; assert(res > 0); assert(res < (1ULL << 60u)); return res; } explicit BF_MA(const size_t n) : bitCount(get_m(n)), arrayLength((get_m(n) + 63) / 64), hasher() { // // arrayLength((bitCount + 63) / 64),bitCount(n * bits_per_item),kk(k),hasher() { // this->cap = 0; // this->kk = getBestK(bits_per_item); // this->bitCount = n * bits_per_item; // bitCount(n * bits_per_item) this->arrayLength = (bitCount + 63) / 64; data = new uint64_t[arrayLength]; std::fill_n(data, arrayLength, 0); } ~BF_MA() { delete[] data; } // Add an item to the filter. Status Add(const uint64_t &item); // Report if the item is inserted, with false positive rate. Status Contain(const uint64_t &item) const; Status contain_branchless(const uint64_t &item) const; /* methods for providing stats */ // summary infomation std::string Info() const; // number of current inserted items; size_t get_cap() const { return cap; } // size of the filter in bytes. size_t SizeInBytes() const { return arrayLength * 8; } std::string get_name() const { float BPI = (100 * m_up / m_down) / 100.0; // double BPI = BPI_full * 100 size_t hash_k = k; std::string name = "BF-MA-[bpi=10,k=" + std::to_string(k) + "]"; // std::string name = "BF-MA-" + std::to_string(BPI) + "[k=" + std::to_string(k) + "]"; if (branchless) { return name + "-{branchless}"; } else { return name + "-{with-branch}"; } } }; template Status BF_MA::Add(const uint64_t &key) { cap++; uint64_t hash = hasher(key); uint64_t a = (hash >> 32) | (hash << 32); uint64_t b = hash; for (int i = 0; i < k; i++) { // int index = reduce(a, this->bitCount); // data[index >> 6] |= getBit(index); // reworked to avoid overflows // use the fact that reduce is not very sensitive to lower bits of a data[fastrangesize(a, this->arrayLength)] |= getBit(a); a += b; } return Ok; } template Status BF_MA::Contain( const uint64_t &key) const { uint64_t hash = hasher(key); uint64_t a = (hash >> 32) | (hash << 32); uint64_t b = hash; for (int i = 0; i < k; i++) { if ((data[fastrangesize(a, this->arrayLength)] & getBit(a)) == 0) { return NotFound; } a += b; } return Ok; if ((k == 2) and branchless) { int b0 = data[fastrangesize(a, this->arrayLength)] >> (a & 63); a += b; int b1 = data[fastrangesize(a, this->arrayLength)] >> (a & 63); if ((b0 & b1 & 1) == 0) return NotFound; return Ok; } else if (k == 2) { if (!(data[fastrangesize(a, this->arrayLength)] >> (a & 63))) return NotFound; a += b; if (data[fastrangesize(a, this->arrayLength)] >> (a & 63)) { return Ok; } return NotFound; } if (branchless && k >= 3) { int b0 = data[fastrangesize(a, this->arrayLength)] >> (a & 63); a += b; int b1 = data[fastrangesize(a, this->arrayLength)] >> (a & 63); a += b; int b2 = data[fastrangesize(a, this->arrayLength)] >> (a & 63); if ((b0 & b1 & b2 & 1) == 0) { return NotFound; } for (int i = 3; i < k; i++) { a += b; if (((data[fastrangesize(a, this->arrayLength)] >> (a & 63)) & 1) == 0) { return NotFound; } } return Ok; } for (int i = 0; i < k; i++) { if ((data[fastrangesize(a, this->arrayLength)] & getBit(a)) == 0) { return NotFound; } a += b; } return Ok; } /* template Status BF_MA::contain_branchless( const uint64_t &key) const { uint64_t hash = hasher(key); uint64_t a = (hash >> 32) | (hash << 32); uint64_t b = hash; size_t locations[k] = {0}; for (size_t i = 0; i < k; i++) { locations[i] = fastrangesize(a, this->arrayLength); a += b; } int res = 1u; for (size_t i = 0; i < k; i++) { res &= data[locations[i]] >> () } } */ template std::string BF_MA::Info() const { std::stringstream ss; ss << "BloomFilter Status:\n" << "\t\tKeys stored: " << get_cap() << "\n"; if (get_cap() > 0) { ss << "\t\tk: " << BitsPerItem() << "\n"; } else { ss << "\t\tk: N/A\n"; } return ss.str(); } }// namespace bloomfilter #endif// BLOOM_FILTER_BLOOM_FILTER_H_