diff --git a/.gitmodules b/.gitmodules index 80bc7e5..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "TurboPFor-Integer-Compression"] - path = TurboPFor-Integer-Compression - url = https://github.com/powturbo/TurboPFor-Integer-Compression diff --git a/Makefile b/Makefile index 702c530..dc96d75 100644 --- a/Makefile +++ b/Makefile @@ -13,18 +13,15 @@ endif all: plocate plocate-build -plocate: plocate.o io_uring_engine.o TurboPFor-Integer-Compression/libic.a +plocate: plocate.o io_uring_engine.o $(CXX) -o $@ $^ -lzstd $(URING_LIBS) $(LDFLAGS) -plocate-build: plocate-build.o TurboPFor-Integer-Compression/libic.a +plocate-build: plocate-build.o $(CXX) -o $@ $^ -lzstd $(LDFLAGS) -TurboPFor-Integer-Compression/libic.a: - cd TurboPFor-Integer-Compression/ && $(MAKE) - clean: $(RM) plocate.o plocate-build.o io_uring_engine.o bench.o plocate plocate-build bench - cd TurboPFor-Integer-Compression/ && $(MAKE) clean + ! [ -d TurboPFor-Integer-Compression/ ] || ( cd TurboPFor-Integer-Compression/ && $(MAKE) clean ) install: all $(INSTALL) -m 2755 -g mlocate plocate $(PREFIX)/bin/ @@ -33,6 +30,9 @@ install: all bench.o: bench.cpp turbopfor.h +TurboPFor-Integer-Compression/libic.a: + cd TurboPFor-Integer-Compression/ && $(MAKE) + bench: bench.o io_uring_engine.o TurboPFor-Integer-Compression/libic.a $(CXX) -o $@ $^ $(URING_LIBS) $(LDFLAGS) diff --git a/README b/README index 8202052..bae2170 100644 --- a/README +++ b/README @@ -2,10 +2,13 @@ plocate, a locate based on posting lists, consuming mlocate inputs and making a much faster index. Does not support querying by regex, case-insensitivity or really any options. -Alpha stage; file format is subject to change. To build: +Alpha stage; file format is subject to change. To build, run make. 
- git submodule init - make -j8 +If you wish to run some tests of the TurboPFor implementation against +the reference implementation, you can run: + + git clone https://github.com/powturbo/TurboPFor-Integer-Compression + make -j8 bench Copyright 2020 Steinar H. Gunderson . Licensed under the GNU General Public License, either version 2, diff --git a/TurboPFor-Integer-Compression b/TurboPFor-Integer-Compression deleted file mode 160000 index 4ab9f5b..0000000 --- a/TurboPFor-Integer-Compression +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4ab9f5b0e023e836c5d7f31aa67440916889570a diff --git a/bench.cpp b/bench.cpp index 78fa339..afec897 100644 --- a/bench.cpp +++ b/bench.cpp @@ -10,6 +10,7 @@ #include "db.h" #include "io_uring_engine.h" #include "turbopfor.h" +#include "turbopfor-encode.h" #include "vp4.h" using namespace std; @@ -29,7 +30,7 @@ int main(void) unique_ptr ht(new Trigram[hdr.hashtable_size + hdr.extra_ht_slots + 1]); complete_pread(fd, ht.get(), (hdr.hashtable_size + hdr.extra_ht_slots + 1) * sizeof(Trigram), hdr.hash_table_offset_bytes); - size_t posting_list_bytes = 0, total_elements = 0; + size_t posting_list_bytes = 0, own_posting_list_bytes = 0, total_elements = 0, most_bytes_pl = 0; uint32_t longest_pl = 0; vector> posting_lists; for (unsigned i = 0; i < hdr.hashtable_size + hdr.extra_ht_slots; ++i) { @@ -42,13 +43,17 @@ int main(void) complete_pread(fd, &str[0], len, ht[i].offset); posting_lists.emplace_back(move(str), ht[i].num_docids); longest_pl = std::max(ht[i].num_docids, longest_pl); + most_bytes_pl = std::max(len, most_bytes_pl); posting_list_bytes += len; total_elements += ht[i].num_docids; } ht.reset(); fprintf(stderr, "Read %zu posting lists.\n", posting_lists.size()); - size_t num_errors = 0; + string encoded_pl; + encoded_pl.resize(longest_pl * 2 + 16384); // Lots of margin. 
+ + size_t num_decode_errors = 0, num_encode_errors = 0; for (auto &[pl, num_docids] : posting_lists) { //fprintf(stderr, "%zu bytes, %u docids\n", pl.size(), num_docids); vector out1, out2; @@ -59,7 +64,38 @@ int main(void) decode_pfor_delta1<128>(pldata, num_docids, /*interleaved=*/true, &out2[0]); for (unsigned i = 0; i < num_docids; ++i) { if (out1[i] != out2[i]) { - if (++num_errors < 10) { + if (++num_decode_errors < 10) { + fprintf(stderr, "Decode error:\n"); + for (unsigned j = 0; j < num_docids; ++j) { + fprintf(stderr, "%3u: reference=%u ours=%u (diff=%d)\n", j, out1[j], out2[j], out1[j] - out2[j]); + } + } + break; + } + } + + // Test encoding, by encoding with our own implementation + // and checking that decoding with the reference gives + // the same result. We do not measure performance (we're slow). + uint32_t deltas[128]; + unsigned char *ptr = reinterpret_cast(&encoded_pl[0]); + ptr = write_baseval(out1[0], ptr); + for (unsigned i = 1; i < num_docids; i += 128) { + unsigned num_docids_this_block = std::min(num_docids - i, 128u); + for (unsigned j = 0; j < num_docids_this_block; ++j) { + deltas[j] = out1[i + j] - out1[i + j - 1] - 1; + } + bool interleaved = (num_docids_this_block == 128); + ptr = encode_pfor_single_block<128>(deltas, num_docids_this_block, interleaved, ptr); + } + own_posting_list_bytes += ptr - reinterpret_cast(&encoded_pl[0]); + + pldata = reinterpret_cast(&encoded_pl[0]); + p4nd1dec128v32(pldata, num_docids, &out2[0]); + for (unsigned i = 0; i < num_docids; ++i) { + if (out1[i] != out2[i]) { + if (++num_encode_errors < 10) { + fprintf(stderr, "Encode error:\n"); for (unsigned j = 0; j < num_docids; ++j) { fprintf(stderr, "%3u: reference=%u ours=%u (diff=%d)\n", j, out1[j], out2[j], out1[j] - out2[j]); } @@ -68,8 +104,10 @@ int main(void) } } } - fprintf(stderr, "%zu/%zu posting lists had errors in decoding.\n", num_errors, posting_lists.size()); + fprintf(stderr, "%zu/%zu posting lists had errors in decoding.\n", 
num_decode_errors, posting_lists.size()); + fprintf(stderr, "%zu/%zu posting lists had errors in encoding.\n", num_encode_errors, posting_lists.size()); + // Benchmark. vector dummy; dummy.resize(longest_pl + 128); steady_clock::time_point start = steady_clock::now(); @@ -89,6 +127,7 @@ int main(void) end = steady_clock::now(); double own_sec = duration(end - start).count(); fprintf(stderr, "Decoding with own implementation: %.3f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec); + fprintf(stderr, "Size with own implementation: %.1f MB (%.2f%% of reference, %+d bytes)\n", own_posting_list_bytes / 1048576.0, 100.0 * own_posting_list_bytes / posting_list_bytes, int(own_posting_list_bytes) - int(posting_list_bytes)); // Three numbers giving rules of thumb for judging our own implementation: // diff --git a/plocate-build.cpp b/plocate-build.cpp index 90141ff..2f8bf31 100644 --- a/plocate-build.cpp +++ b/plocate-build.cpp @@ -1,5 +1,4 @@ #include "db.h" -#include "vp4.h" #include #include @@ -19,6 +18,8 @@ #include #include +#include "turbopfor-encode.h" + #define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t)) #define dprintf(...) //#define dprintf(...) fprintf(stderr, __VA_ARGS__); @@ -122,7 +123,7 @@ void PostingListBuilder::finish() // No interleaving for partial blocks. 
unsigned char buf[P4NENC_BOUND(128)]; - unsigned char *end = p4enc32(pending_deltas.data(), pending_deltas.size(), buf); + unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, buf); encoded.append(reinterpret_cast(buf), reinterpret_cast(end)); } @@ -130,15 +131,15 @@ void PostingListBuilder::append_block() { unsigned char buf[P4NENC_BOUND(128)]; assert(pending_deltas.size() == 128); - unsigned char *end = p4enc128v32(pending_deltas.data(), 128, buf); + unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), 128, /*interleaved=*/true, buf); encoded.append(reinterpret_cast(buf), reinterpret_cast(end)); } void PostingListBuilder::write_header(uint32_t docid) { unsigned char buf[P4NENC_BOUND(1)]; - size_t bytes = p4nd1enc128v32(&docid, 1, buf); - encoded.append(reinterpret_cast(buf), bytes); + unsigned char *end = write_baseval(docid, buf); + encoded.append(reinterpret_cast(buf), end - buf); } class Corpus { diff --git a/turbopfor-common.h b/turbopfor-common.h new file mode 100644 index 0000000..0d493b5 --- /dev/null +++ b/turbopfor-common.h @@ -0,0 +1,36 @@ +#ifndef _TURBOPFOR_COMMON_H +#define _TURBOPFOR_COMMON_H 1 + +// Common definitions and utilities between turbopfor.h (decode) +// and turbopfor-encode.h (encode). + +#include + +enum BlockType { + FOR = 0, + PFOR_VB = 1, + PFOR_BITMAP = 2, + CONSTANT = 3 +}; + +// Does not properly account for overflow. 
+inline unsigned div_round_up(unsigned val, unsigned div) +{ + return (val + div - 1) / div; +} + +inline unsigned bytes_for_packed_bits(unsigned num, unsigned bit_width) +{ + return div_round_up(num * bit_width, CHAR_BIT); +} + +constexpr uint32_t mask_for_bits(unsigned bit_width) +{ + if (bit_width == 32) { + return 0xFFFFFFFF; + } else { + return (1U << bit_width) - 1; + } +} + +#endif // !defined(_TURBOPFOR_COMMON_H) diff --git a/turbopfor-encode.h b/turbopfor-encode.h new file mode 100644 index 0000000..532aa19 --- /dev/null +++ b/turbopfor-encode.h @@ -0,0 +1,379 @@ +#ifndef _TURBOPFOR_ENCODE_H +#define _TURBOPFOR_ENCODE_H + +// Much like turbopfor.h (and shares all of the same caveats), except this is +// for encoding. It is _much_ slower than the reference implementation, but we +// encode only during build, and most time in build is spent in other things +// than encoding posting lists, so it only costs ~5-10% overall. Does not use +// any special character sets, and generally isn't optimized at all. +// +// It encodes about 0.01% denser than the reference encoder (averaged over +// a real plocate corpus), probably since it has a slower but more precise +// method for estimating the cost of a PFOR + varbyte block. + +#include "turbopfor-common.h" + +#include +#include +#include + +template +void write_le(Docid val, void *out) +{ + if constexpr (sizeof(Docid) == 8) { + val = htole64(val); + } else if constexpr (sizeof(Docid) == 4) { + val = htole32(val); + } else if constexpr (sizeof(Docid) == 2) { + val = htole16(val); + } else if constexpr (sizeof(Docid) == 1) { + val = val; + } else { + assert(false); + } + memcpy(out, &val, sizeof(val)); +} + +// Corresponds to read_baseval. 
+template +unsigned char *write_baseval(Docid in, unsigned char *out) +{ + if (in < 128) { + *out = in; + return out + 1; + } else if (in < 0x4000) { + out[0] = (in >> 8) | 0x80; + out[1] = in & 0xff; + return out + 2; + } else if (in < 0x200000) { + out[0] = (in >> 16) | 0xc0; + out[1] = in & 0xff; + out[2] = (in >> 8) & 0xff; + return out + 3; + } else { + assert(false); // Not implemented. + } +} + +// Writes a varbyte-encoded exception. +template +unsigned char *write_vb(Docid val, unsigned char *out) +{ + if (val <= 176) { + *out++ = val; + return out; + } else if (val <= 16560) { + val -= 177; + *out++ = (val >> 8) + 177; + *out++ = val & 0xff; + return out; + } else if (val <= 540848) { + val -= 16561; + *out = (val >> 16) + 241; + write_le(val & 0xffff, out + 1); + return out + 3; + } else if (val <= 16777215) { + *out = 249; + write_le(val, out + 1); + return out + 4; + } else { + *out = 250; + write_le(val, out + 1); + return out + 5; + } +} + +template +inline unsigned num_bits(Docid x) +{ + if (x == 0) { + return 0; + } else { + return sizeof(Docid) * CHAR_BIT - __builtin_clz(x); + } +} + +struct BitWriter { +public: + BitWriter(unsigned char *out, unsigned bits) + : out(out), bits(bits) {} + void write(uint32_t val) + { + cur_val |= val << bits_used; + write_le(cur_val, out); + + bits_used += bits; + cur_val >>= (bits_used / 8) * 8; + out += bits_used / 8; + bits_used %= 8; + } + +private: + unsigned char *out; + const unsigned bits; + unsigned bits_used = 0; + unsigned cur_val = 0; +}; + +template +struct InterleavedBitWriter { +public: + InterleavedBitWriter(unsigned char *out, unsigned bits) + : out(out), bits(bits) {} + void write(uint32_t val) + { + cur_val |= uint64_t(val) << bits_used; + if (bits_used + bits >= 32) { + write_le(cur_val & 0xffffffff, out); + out += Stride; + cur_val >>= 32; + bits_used -= 32; // Underflow, but will be fixed below. 
+ } + write_le(cur_val, out); + bits_used += bits; + } + +private: + static constexpr unsigned Stride = NumStreams * sizeof(uint32_t); + unsigned char *out; + const unsigned bits; + unsigned bits_used = 0; + uint64_t cur_val = 0; +}; + +// Bitpacks a set of values (making sure the top bits are lopped off). +// If interleaved is set, makes SSE2-compatible interleaving (this is +// only allowed for full blocks). +template +unsigned char *encode_bitmap(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out) +{ + unsigned mask = mask_for_bits(bit_width); + if (interleaved) { + InterleavedBitWriter<4> bs0(out + 0 * sizeof(uint32_t), bit_width); + InterleavedBitWriter<4> bs1(out + 1 * sizeof(uint32_t), bit_width); + InterleavedBitWriter<4> bs2(out + 2 * sizeof(uint32_t), bit_width); + InterleavedBitWriter<4> bs3(out + 3 * sizeof(uint32_t), bit_width); + assert(num % 4 == 0); + for (unsigned i = 0; i < num / 4; ++i) { + bs0.write(in[i * 4 + 0] & mask); + bs1.write(in[i * 4 + 1] & mask); + bs2.write(in[i * 4 + 2] & mask); + bs3.write(in[i * 4 + 3] & mask); + } + } else { + BitWriter bs(out, bit_width); + for (unsigned i = 0; i < num; ++i) { + bs.write(in[i] & mask); + } + } + return out + bytes_for_packed_bits(num, bit_width); +} + +// See decode_for() for the format. +template +unsigned char *encode_for(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out) +{ + return encode_bitmap(in, num, bit_width, interleaved, out); +} + +// See decode_pfor_bitmap() for the format. +template +unsigned char *encode_pfor_bitmap(const Docid *in, unsigned num, unsigned bit_width, unsigned exception_bit_width, bool interleaved, unsigned char *out) +{ + *out++ = exception_bit_width; + + // Bitmap of exceptions. + { + BitWriter bs(out, 1); + for (unsigned i = 0; i < num; ++i) { + bs.write((in[i] >> bit_width) != 0); + } + out += bytes_for_packed_bits(num, 1); + } + + // Exceptions. 
+ { + BitWriter bs(out, exception_bit_width); + unsigned num_exceptions = 0; + for (unsigned i = 0; i < num; ++i) { + if ((in[i] >> bit_width) != 0) { + bs.write(in[i] >> bit_width); + ++num_exceptions; + } + } + out += bytes_for_packed_bits(num_exceptions, exception_bit_width); + } + + // Base values. + out = encode_bitmap(in, num, bit_width, interleaved, out); + + return out; +} + +// See decode_pfor_vb() for the format. +template +unsigned char *encode_pfor_vb(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out) +{ + unsigned num_exceptions = 0; + for (unsigned i = 0; i < num; ++i) { + if ((in[i] >> bit_width) != 0) { + ++num_exceptions; + } + } + *out++ = num_exceptions; + + // Base values. + out = encode_bitmap(in, num, bit_width, interleaved, out); + + // Exceptions. + for (unsigned i = 0; i < num; ++i) { + unsigned val = in[i] >> bit_width; + if (val != 0) { + out = write_vb(val, out); + } + } + + // Exception indexes. + for (unsigned i = 0; i < num; ++i) { + unsigned val = in[i] >> bit_width; + if (val != 0) { + *out++ = i; + } + } + + return out; +} + +// Find out which block type would be the smallest for the given data. +template +BlockType decide_block_type(const Docid *in, unsigned num, unsigned *bit_width, unsigned *exception_bit_width) +{ + // Check if the block is constant. + bool constant = true; + for (unsigned i = 1; i < num; ++i) { + if (in[i] != in[0]) { + constant = false; + break; + } + } + if (constant) { + *bit_width = num_bits(in[0]); + return BlockType::CONSTANT; + } + + // Build up a histogram of bit sizes. + unsigned histogram[sizeof(Docid) * CHAR_BIT + 1] = { 0 }; + unsigned max_bits = 0; + for (unsigned i = 0; i < num; ++i) { + unsigned bits = num_bits(in[i]); + ++histogram[bits]; + max_bits = std::max(max_bits, bits); + } + + // Straight-up FOR. + unsigned best_cost = bytes_for_packed_bits(num, max_bits); + unsigned best_bit_width = max_bits; + + // Try PFOR with bitmap exceptions. 
+ const unsigned bitmap_cost = bytes_for_packed_bits(num, 1); + unsigned num_exceptions = 0; + for (unsigned exception_bit_width = 1; exception_bit_width <= max_bits; ++exception_bit_width) { + unsigned test_bit_width = max_bits - exception_bit_width; + num_exceptions += histogram[test_bit_width + 1]; + + // 1 byte for signaling exception bit width, then the bitmap, + // then the base values, then the exceptions. + unsigned cost = 1 + bitmap_cost + bytes_for_packed_bits(num, test_bit_width) + + bytes_for_packed_bits(num_exceptions, exception_bit_width); + if (cost < best_cost) { + best_cost = cost; + best_bit_width = test_bit_width; + } + } + + // Try PFOR with varbyte exceptions. + bool best_is_varbyte = false; + for (unsigned test_bit_width = 0; test_bit_width < max_bits; ++test_bit_width) { + // 1 byte for signaling number of exceptions, plus the base values, + // and then we count up the varbytes and indexes. (This is precise + // but very slow.) + unsigned cost = 1 + bytes_for_packed_bits(num, test_bit_width); + for (unsigned i = 0; i < num && cost < best_cost; ++i) { + unsigned val = in[i] >> test_bit_width; + if (val == 0) { + // Not stored, and then also no index. + } else if (val <= 176) { + cost += 2; + } else if (val <= 16560) { + cost += 3; + } else if (val <= 540848) { + cost += 4; + } else if (val <= 16777215) { + cost += 5; + } else { + cost += 6; + } + } + if (cost < best_cost) { + best_cost = cost; + best_bit_width = test_bit_width; + best_is_varbyte = true; + } + } + + // TODO: Consider the last-resort option of just raw storage (255). + + if (best_is_varbyte) { + *bit_width = best_bit_width; + return BlockType::PFOR_VB; + } else if (best_bit_width == max_bits) { + *bit_width = max_bits; + return BlockType::FOR; + } else { + *bit_width = best_bit_width; + *exception_bit_width = max_bits - best_bit_width; + return BlockType::PFOR_BITMAP; + } +} + +// The basic entry point. 
Takes one block of integers (which already must +// be delta-minus-1-encoded) and packs it into TurboPFor format. +// interleaved corresponds to the interleaved parameter in decode_pfor_delta1() +// or the "128v" infix in the reference code's function names; such formats +// are much faster to decode, so for full blocks, you probably want it. +// The interleaved flag isn't stored anywhere; it's implicit whether you +// want to use it for full blocks or not. +// +// The first value must already be written using write_baseval() (so the delta +// coding starts from the second value). Returns the end of the string. +// May write 4 bytes past the end. +template +unsigned char *encode_pfor_single_block(const Docid *in, unsigned num, bool interleaved, unsigned char *out) +{ + assert(num > 0); + if (interleaved) { + assert(num == BlockSize); + } + + unsigned bit_width, exception_bit_width; + BlockType block_type = decide_block_type(in, num, &bit_width, &exception_bit_width); + *out++ = (block_type << 6) | bit_width; + + switch (block_type) { + case BlockType::CONSTANT: { + unsigned bit_width = num_bits(in[0]); + write_le(in[0], out); + return out + div_round_up(bit_width, 8); + } + case BlockType::FOR: + return encode_for(in, num, bit_width, interleaved, out); + case BlockType::PFOR_BITMAP: + return encode_pfor_bitmap(in, num, bit_width, exception_bit_width, interleaved, out); + case BlockType::PFOR_VB: + return encode_pfor_vb(in, num, bit_width, interleaved, out); + default: + assert(false); + } +} + +#endif // !defined(_TURBOPFOR_ENCODE_H) diff --git a/turbopfor.h b/turbopfor.h index 1796708..a21727a 100644 --- a/turbopfor.h +++ b/turbopfor.h @@ -25,6 +25,8 @@ #include #endif +#include "turbopfor-common.h" + // Forward declarations to declare to the template code below that they exist. // (These must seemingly be non-templates for function multiversioning to work.) 
__attribute__((target("default"))) @@ -49,15 +51,6 @@ const unsigned char * decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out); #endif -constexpr uint32_t mask_for_bits(unsigned bit_width) -{ - if (bit_width == 32) { - return 0xFFFFFFFF; - } else { - return (1U << bit_width) - 1; - } -} - template Docid read_le(const void *in) { @@ -206,17 +199,6 @@ private: }; #endif -// Does not properly account for overflow. -inline unsigned div_round_up(unsigned val, unsigned div) -{ - return (val + div - 1) / div; -} - -inline unsigned bytes_for_packed_bits(unsigned num, unsigned bit_width) -{ - return div_round_up(num * bit_width, CHAR_BIT); -} - // Constant block. Layout: // // - Bit width (6 bits) | type << 6 @@ -727,13 +709,6 @@ decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out) return in; } -enum BlockType { - FOR = 0, - PFOR_VB = 1, - PFOR_BITMAP = 2, - CONSTANT = 3 -}; - template const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, bool interleaved, Docid *out) {