diff --git a/.gitmodules b/.gitmodules
index 80bc7e5..e69de29 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "TurboPFor-Integer-Compression"]
-	path = TurboPFor-Integer-Compression
-	url = https://github.com/powturbo/TurboPFor-Integer-Compression
diff --git a/Makefile b/Makefile
index 702c530..dc96d75 100644
--- a/Makefile
+++ b/Makefile
@@ -13,18 +13,15 @@ endif
 
 all: plocate plocate-build
 
-plocate: plocate.o io_uring_engine.o TurboPFor-Integer-Compression/libic.a
+plocate: plocate.o io_uring_engine.o
 	$(CXX) -o $@ $^ -lzstd $(URING_LIBS) $(LDFLAGS)
 
-plocate-build: plocate-build.o TurboPFor-Integer-Compression/libic.a
+plocate-build: plocate-build.o
 	$(CXX) -o $@ $^ -lzstd $(LDFLAGS)
 
-TurboPFor-Integer-Compression/libic.a:
-	cd TurboPFor-Integer-Compression/ && $(MAKE)
-
 clean:
 	$(RM) plocate.o plocate-build.o io_uring_engine.o bench.o plocate plocate-build bench
-	cd TurboPFor-Integer-Compression/ && $(MAKE) clean
+	! [ -d TurboPFor-Integer-Compression/ ] || ( cd TurboPFor-Integer-Compression/ && $(MAKE) clean )
 
 install: all
 	$(INSTALL) -m 2755 -g mlocate plocate $(PREFIX)/bin/
@@ -33,6 +30,9 @@ install: all
 
 bench.o: bench.cpp turbopfor.h
 
+TurboPFor-Integer-Compression/libic.a:
+	cd TurboPFor-Integer-Compression/ && $(MAKE)
+
 bench: bench.o io_uring_engine.o TurboPFor-Integer-Compression/libic.a
 	$(CXX) -o $@ $^ $(URING_LIBS) $(LDFLAGS)
 
diff --git a/README b/README
index 8202052..bae2170 100644
--- a/README
+++ b/README
@@ -2,10 +2,13 @@ plocate, a locate based on posting lists, consuming mlocate inputs
 and making a much faster index. Does not support querying by regex,
 case-insensitivity or really any options.
 
-Alpha stage; file format is subject to change. To build:
+Alpha stage; file format is subject to change. To build, run make.
 
-  git submodule init
-  make -j8
+If you wish to run some tests of the TurboPFor implementation against
+the reference implementation, you can run:
+
+  git clone https://github.com/powturbo/TurboPFor-Integer-Compression
+  make -j8 bench
 
 Copyright 2020 Steinar H. Gunderson <steinar+git@gunderson.no>.
 Licensed under the GNU General Public License, either version 2,
diff --git a/TurboPFor-Integer-Compression b/TurboPFor-Integer-Compression
deleted file mode 160000
index 4ab9f5b..0000000
--- a/TurboPFor-Integer-Compression
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4ab9f5b0e023e836c5d7f31aa67440916889570a
diff --git a/bench.cpp b/bench.cpp
index 78fa339..afec897 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -10,6 +10,7 @@
 #include "db.h"
 #include "io_uring_engine.h"
 #include "turbopfor.h"
+#include "turbopfor-encode.h"
 #include "vp4.h"
 
 using namespace std;
@@ -29,7 +30,7 @@ int main(void)
 	unique_ptr<Trigram[]> ht(new Trigram[hdr.hashtable_size + hdr.extra_ht_slots + 1]);
 	complete_pread(fd, ht.get(), (hdr.hashtable_size + hdr.extra_ht_slots + 1) * sizeof(Trigram), hdr.hash_table_offset_bytes);
 
-	size_t posting_list_bytes = 0, total_elements = 0;
+	size_t posting_list_bytes = 0, own_posting_list_bytes = 0, total_elements = 0, most_bytes_pl = 0;
 	uint32_t longest_pl = 0;
 	vector<pair<string, unsigned>> posting_lists;
 	for (unsigned i = 0; i < hdr.hashtable_size + hdr.extra_ht_slots; ++i) {
@@ -42,13 +43,17 @@ int main(void)
 		complete_pread(fd, &str[0], len, ht[i].offset);
 		posting_lists.emplace_back(move(str), ht[i].num_docids);
 		longest_pl = std::max(ht[i].num_docids, longest_pl);
+		most_bytes_pl = std::max(len, most_bytes_pl);
 		posting_list_bytes += len;
 		total_elements += ht[i].num_docids;
 	}
 	ht.reset();
 	fprintf(stderr, "Read %zu posting lists.\n", posting_lists.size());
 
-	size_t num_errors = 0;
+	string encoded_pl;
+	encoded_pl.resize(longest_pl * 2 + 16384);  // Lots of margin.
+
+	size_t num_decode_errors = 0, num_encode_errors = 0;
 	for (auto &[pl, num_docids] : posting_lists) {
 		//fprintf(stderr, "%zu bytes, %u docids\n", pl.size(), num_docids);
 		vector<uint32_t> out1, out2;
@@ -59,7 +64,38 @@ int main(void)
 		decode_pfor_delta1<128>(pldata, num_docids, /*interleaved=*/true, &out2[0]);
 		for (unsigned i = 0; i < num_docids; ++i) {
 			if (out1[i] != out2[i]) {
-				if (++num_errors < 10) {
+				if (++num_decode_errors < 10) {
+					fprintf(stderr, "Decode error:\n");
+					for (unsigned j = 0; j < num_docids; ++j) {
+						fprintf(stderr, "%3u: reference=%u ours=%u  (diff=%d)\n", j, out1[j], out2[j], out1[j] - out2[j]);
+					}
+				}
+				break;
+			}
+		}
+
+		// Test encoding, by encoding with our own implementation
+		// and checking that decoding with the reference gives
+		// the same result. We do not measure performance (we're slow).
+		uint32_t deltas[128];
+		unsigned char *ptr = reinterpret_cast<unsigned char *>(&encoded_pl[0]);
+		ptr = write_baseval(out1[0], ptr);
+		for (unsigned i = 1; i < num_docids; i += 128) {
+			unsigned num_docids_this_block = std::min(num_docids - i, 128u);
+			for (unsigned j = 0; j < num_docids_this_block; ++j) {
+				deltas[j] = out1[i + j] - out1[i + j - 1] - 1;
+			}
+			bool interleaved = (num_docids_this_block == 128);
+			ptr = encode_pfor_single_block<128>(deltas, num_docids_this_block, interleaved, ptr);
+		}
+		own_posting_list_bytes += ptr - reinterpret_cast<unsigned char *>(&encoded_pl[0]);
+
+		pldata = reinterpret_cast<unsigned char *>(&encoded_pl[0]);
+		p4nd1dec128v32(pldata, num_docids, &out2[0]);
+		for (unsigned i = 0; i < num_docids; ++i) {
+			if (out1[i] != out2[i]) {
+				if (++num_encode_errors < 10) {
+					fprintf(stderr, "Encode error:\n");
 					for (unsigned j = 0; j < num_docids; ++j) {
 						fprintf(stderr, "%3u: reference=%u ours=%u  (diff=%d)\n", j, out1[j], out2[j], out1[j] - out2[j]);
 					}
@@ -68,8 +104,10 @@ int main(void)
 			}
 		}
 	}
-	fprintf(stderr, "%zu/%zu posting lists had errors in decoding.\n", num_errors, posting_lists.size());
+	fprintf(stderr, "%zu/%zu posting lists had errors in decoding.\n", num_decode_errors, posting_lists.size());
+	fprintf(stderr, "%zu/%zu posting lists had errors in encoding.\n", num_encode_errors, posting_lists.size());
 
+	// Benchmark.
 	vector<uint32_t> dummy;
 	dummy.resize(longest_pl + 128);
 	steady_clock::time_point start = steady_clock::now();
@@ -89,6 +127,7 @@ int main(void)
 	end = steady_clock::now();
 	double own_sec = duration<double>(end - start).count();
 	fprintf(stderr, "Decoding with own implementation: %.3f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec);
+	fprintf(stderr, "Size with own implementation: %.1f MB (%.2f%% of reference, %+d bytes)\n", own_posting_list_bytes / 1048576.0, 100.0 * own_posting_list_bytes / posting_list_bytes, int(own_posting_list_bytes) - int(posting_list_bytes));
 
 	// Three numbers giving rules of thumb for judging our own implementation:
 	//
diff --git a/plocate-build.cpp b/plocate-build.cpp
index 90141ff..2f8bf31 100644
--- a/plocate-build.cpp
+++ b/plocate-build.cpp
@@ -1,5 +1,4 @@
 #include "db.h"
-#include "vp4.h"
 
 #include <algorithm>
 #include <arpa/inet.h>
@@ -19,6 +18,8 @@
 #include <vector>
 #include <zstd.h>
 
+#include "turbopfor-encode.h"
+
 #define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t))
 #define dprintf(...)
 //#define dprintf(...) fprintf(stderr, __VA_ARGS__);
@@ -122,7 +123,7 @@ void PostingListBuilder::finish()
 
 	// No interleaving for partial blocks.
 	unsigned char buf[P4NENC_BOUND(128)];
-	unsigned char *end = p4enc32(pending_deltas.data(), pending_deltas.size(), buf);
+	unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, buf);
 	encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
 }
 
@@ -130,15 +131,15 @@ void PostingListBuilder::append_block()
 {
 	unsigned char buf[P4NENC_BOUND(128)];
 	assert(pending_deltas.size() == 128);
-	unsigned char *end = p4enc128v32(pending_deltas.data(), 128, buf);
+	unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), 128, /*interleaved=*/true, buf);
 	encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
 }
 
 void PostingListBuilder::write_header(uint32_t docid)
 {
 	unsigned char buf[P4NENC_BOUND(1)];
-	size_t bytes = p4nd1enc128v32(&docid, 1, buf);
-	encoded.append(reinterpret_cast<char *>(buf), bytes);
+	unsigned char *end = write_baseval(docid, buf);
+	encoded.append(reinterpret_cast<char *>(buf), end - buf);
 }
 
 class Corpus {
diff --git a/turbopfor-common.h b/turbopfor-common.h
new file mode 100644
index 0000000..0d493b5
--- /dev/null
+++ b/turbopfor-common.h
@@ -0,0 +1,45 @@
+#ifndef _TURBOPFOR_COMMON_H
+#define _TURBOPFOR_COMMON_H 1
+
+// Common definitions and utilities between turbopfor.h (decode)
+// and turbopfor-encode.h (encode).
+
+#include <limits.h>
+#include <stdint.h>  // uint32_t; keeps this header usable as the first include.
+
+// Block type tag. The encoder stores it in the top two bits of a block's
+// first byte, above the six-bit bit width.
+enum BlockType {
+	FOR = 0,
+	PFOR_VB = 1,
+	PFOR_BITMAP = 2,
+	CONSTANT = 3
+};
+
+// Returns val / div, rounded up.
+// Does not properly account for overflow.
+inline unsigned div_round_up(unsigned val, unsigned div)
+{
+	return (val + div - 1) / div;
+}
+
+// Returns the number of whole bytes needed to hold num values of
+// bit_width bits each, packed back to back.
+inline unsigned bytes_for_packed_bits(unsigned num, unsigned bit_width)
+{
+	return div_round_up(num * bit_width, CHAR_BIT);
+}
+
+// Returns a mask with the low bit_width bits set. 32 is special-cased
+// because 1U << 32 is undefined where unsigned is 32 bits wide;
+// bit_width must not exceed 32.
+constexpr uint32_t mask_for_bits(unsigned bit_width)
+{
+	if (bit_width == 32) {
+		return 0xFFFFFFFF;
+	} else {
+		return (1U << bit_width) - 1;
+	}
+}
+
+#endif  // !defined(_TURBOPFOR_COMMON_H)
diff --git a/turbopfor-encode.h b/turbopfor-encode.h
new file mode 100644
index 0000000..532aa19
--- /dev/null
+++ b/turbopfor-encode.h
@@ -0,0 +1,379 @@
+#ifndef _TURBOPFOR_ENCODE_H
+#define _TURBOPFOR_ENCODE_H
+
+// Much like turbopfor.h (and shares all of the same caveats), except this is
+// for encoding. It is _much_ slower than the reference implementation, but we
+// encode only during build, and most time in build is spent in other things
+// than encoding posting lists, so it only costs ~5-10% overall. Does not use
+// any special instruction sets, and generally isn't optimized at all.
+//
+// It encodes about 0.01% denser than the reference encoder (averaged over
+// a real plocate corpus), probably since it has a slower but more precise
+// method for estimating the cost of a PFOR + varbyte block.
+
+#include "turbopfor-common.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <stdint.h>
+
+template<class Docid>
+void write_le(Docid val, void *out)  // Stores val at out in little-endian byte order.
+{
+	if constexpr (sizeof(Docid) == 8) {
+		val = htole64(val);
+	} else if constexpr (sizeof(Docid) == 4) {
+		val = htole32(val);
+	} else if constexpr (sizeof(Docid) == 2) {
+		val = htole16(val);
+	} else if constexpr (sizeof(Docid) == 1) {
+		val = val;  // A single byte has no byte order to fix.
+	} else {
+		assert(false);  // Only 1-, 2-, 4- and 8-byte types are handled.
+	}
+	memcpy(out, &val, sizeof(val));  // Byte-wise copy of exactly sizeof(Docid) bytes.
+}
+
+// Corresponds to read_baseval. Writes in as a 1-3 byte variable-length value at out and returns the position after it.
+template<class Docid>
+unsigned char *write_baseval(Docid in, unsigned char *out)
+{
+	if (in < 128) {  // One byte: the value itself (top bit clear).
+		*out = in;
+		return out + 1;
+	} else if (in < 0x4000) {  // Two bytes: 0x80 marker plus the high 6 bits, then the low byte.
+		out[0] = (in >> 8) | 0x80;
+		out[1] = in & 0xff;
+		return out + 2;
+	} else if (in < 0x200000) {  // Three bytes: 0xc0 marker plus the top 5 bits, then low byte, then middle byte.
+		out[0] = (in >> 16) | 0xc0;
+		out[1] = in & 0xff;
+		out[2] = (in >> 8) & 0xff;
+		return out + 3;
+	} else {
+		assert(false);  // Not implemented (values >= 0x200000). NOTE(review): with NDEBUG this falls off the end without returning.
+	}
+}
+
+// Writes a varbyte-encoded exception (1-5 bytes) and returns the position after it.
+template<class Docid>
+unsigned char *write_vb(Docid val, unsigned char *out)
+{
+	if (val <= 176) {  // One byte: the value itself.
+		*out++ = val;
+		return out;
+	} else if (val <= 16560) {  // Two bytes: lead byte 177..240, then the low byte of the rebased value.
+		val -= 177;
+		*out++ = (val >> 8) + 177;
+		*out++ = val & 0xff;
+		return out;
+	} else if (val <= 540848) {  // Three bytes: lead byte 241..248, then the low 16 bits, little-endian.
+		val -= 16561;
+		*out = (val >> 16) + 241;
+		write_le<uint16_t>(val & 0xffff, out + 1);
+		return out + 3;
+	} else if (val <= 16777215) {  // Four bytes: lead byte 249, then a 24-bit little-endian value.
+		*out = 249;
+		write_le<uint32_t>(val, out + 1);  // Stores four bytes; the last one (zero here) lies past the returned end.
+		return out + 4;
+	} else {  // Five bytes: lead byte 250, then the full 32-bit little-endian value.
+		*out = 250;
+		write_le<uint32_t>(val, out + 1);
+		return out + 5;
+	}
+}
+
+// Number of bits needed to represent x (0 for x == 0); intended for unsigned types of up to 64 bits.
+template<class Docid>
+inline unsigned num_bits(Docid x)
+{
+	if (x == 0) {
+		return 0;  // __builtin_clzll() is undefined for zero.
+	}
+	return sizeof(unsigned long long) * CHAR_BIT - __builtin_clzll(x);  // Widened, so the count is right for every Docid size.
+}
+
+struct BitWriter {  // Packs values of `bits` bits each, least significant bit first, into consecutive bytes at out.
+public:
+	BitWriter(unsigned char *out, unsigned bits)
+		: out(out), bits(bits) {}
+	void write(uint32_t val)
+	{
+		cur_val |= uint64_t(val) << bits_used;  // Up to 7 pending + 32 new bits, hence the 64-bit accumulator.
+		write_le<uint32_t>(cur_val & 0xffffffff, out);
+		bits_used += bits;
+		if (bits_used > 32) { out[4] = cur_val >> 32; }  // Spill the bits that did not fit in the four bytes above.
+		cur_val >>= (bits_used / 8) * 8;  // Drop the completed bytes; the shift is at most 32, well-defined on 64 bits.
+		out += bits_used / 8;
+		bits_used %= 8;
+	}
+
+private:
+	unsigned char *out;
+	const unsigned bits;
+	unsigned bits_used = 0;  // Pending bits in cur_val; always below 8 between calls.
+	uint64_t cur_val = 0;
+};
+
+template<unsigned NumStreams>
+struct InterleavedBitWriter {  // Packs fixed-width values into 32-bit words that lie Stride bytes apart (one of NumStreams lanes).
+public:
+	InterleavedBitWriter(unsigned char *out, unsigned bits)
+		: out(out), bits(bits) {}
+	void write(uint32_t val)
+	{
+		cur_val |= uint64_t(val) << bits_used;
+		if (bits_used + bits >= 32) {  // The current word is full: emit it and move on to this lane's next word.
+			write_le<uint32_t>(cur_val & 0xffffffff, out);
+			out += Stride;
+			cur_val >>= 32;
+			bits_used -= 32;  // Underflow, but will be fixed below.
+		}
+		write_le<uint32_t>(cur_val, out);  // Also stores the partial word, so this can land past the packed data.
+		bits_used += bits;
+	}
+
+private:
+	static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);  // Byte distance between consecutive words of one lane.
+	unsigned char *out;
+	const unsigned bits;
+	unsigned bits_used = 0;  // Bits already occupied in the current word.
+	uint64_t cur_val = 0;
+};
+
+// Bitpacks a set of values (making sure the top bits are lopped off).
+// If interleaved is set, makes SSE2-compatible interleaving (this is
+// only allowed for full blocks). Returns the end of the packed data.
+template<class Docid>
+unsigned char *encode_bitmap(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
+{
+	unsigned mask = mask_for_bits(bit_width);
+	if (interleaved) {
+		InterleavedBitWriter<4> bs0(out + 0 * sizeof(uint32_t), bit_width);  // Four lanes; lane k starts at 32-bit word k.
+		InterleavedBitWriter<4> bs1(out + 1 * sizeof(uint32_t), bit_width);
+		InterleavedBitWriter<4> bs2(out + 2 * sizeof(uint32_t), bit_width);
+		InterleavedBitWriter<4> bs3(out + 3 * sizeof(uint32_t), bit_width);
+		assert(num % 4 == 0);
+		for (unsigned i = 0; i < num / 4; ++i) {  // Value j goes to lane j % 4.
+			bs0.write(in[i * 4 + 0] & mask);
+			bs1.write(in[i * 4 + 1] & mask);
+			bs2.write(in[i * 4 + 2] & mask);
+			bs3.write(in[i * 4 + 3] & mask);
+		}
+	} else {
+		BitWriter bs(out, bit_width);  // Plain sequential packing.
+		for (unsigned i = 0; i < num; ++i) {
+			bs.write(in[i] & mask);
+		}
+	}
+	return out + bytes_for_packed_bits(num, bit_width);  // The size depends only on num and bit_width.
+}
+
+// See decode_for() for the format. Returns the end of the written data.
+template<class Docid>
+unsigned char *encode_for(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
+{
+	return encode_bitmap(in, num, bit_width, interleaved, out);  // The payload is just the bit-packed values.
+}
+
+// See decode_pfor_bitmap() for the format. Returns the end of the written data.
+template<class Docid>
+unsigned char *encode_pfor_bitmap(const Docid *in, unsigned num, unsigned bit_width, unsigned exception_bit_width, bool interleaved, unsigned char *out)
+{
+	*out++ = exception_bit_width;  // One byte telling how wide each stored exception is.
+
+	// Bitmap of exceptions: one bit per value, set when the value does not fit in bit_width bits.
+	{
+		BitWriter bs(out, 1);
+		for (unsigned i = 0; i < num; ++i) {
+			bs.write((in[i] >> bit_width) != 0);
+		}
+		out += bytes_for_packed_bits(num, 1);
+	}
+
+	// Exceptions: the bits above bit_width of every flagged value, packed sequentially.
+	{
+		BitWriter bs(out, exception_bit_width);
+		unsigned num_exceptions = 0;
+		for (unsigned i = 0; i < num; ++i) {
+			if ((in[i] >> bit_width) != 0) {
+				bs.write(in[i] >> bit_width);
+				++num_exceptions;
+			}
+		}
+		out += bytes_for_packed_bits(num_exceptions, exception_bit_width);
+	}
+
+	// Base values: the low bit_width bits of every value (exceptions included).
+	out = encode_bitmap(in, num, bit_width, interleaved, out);
+
+	return out;
+}
+
+// See decode_pfor_vb() for the format. Returns the end of the written data.
+template<class Docid>
+unsigned char *encode_pfor_vb(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
+{
+	unsigned num_exceptions = 0;
+	for (unsigned i = 0; i < num; ++i) {
+		if ((in[i] >> bit_width) != 0) {
+			++num_exceptions;
+		}
+	}
+	*out++ = num_exceptions;  // Stored in a single byte.
+
+	// Base values: the low bit_width bits of every value.
+	out = encode_bitmap(in, num, bit_width, interleaved, out);
+
+	// Exceptions: the bits above bit_width, varbyte-coded, in input order.
+	for (unsigned i = 0; i < num; ++i) {
+		unsigned val = in[i] >> bit_width;
+		if (val != 0) {
+			out = write_vb(val, out);
+		}
+	}
+
+	// Exception indexes: one byte per exception, in the same order as the values above.
+	for (unsigned i = 0; i < num; ++i) {
+		unsigned val = in[i] >> bit_width;
+		if (val != 0) {
+			*out++ = i;
+		}
+	}
+
+	return out;
+}
+
+// Find out which block type would be the smallest for the given data. Always sets *bit_width; sets *exception_bit_width only for PFOR_BITMAP.
+template<class Docid>
+BlockType decide_block_type(const Docid *in, unsigned num, unsigned *bit_width, unsigned *exception_bit_width)
+{
+	// Check if the block is constant.
+	bool constant = true;
+	for (unsigned i = 1; i < num; ++i) {
+		if (in[i] != in[0]) {
+			constant = false;
+			break;
+		}
+	}
+	if (constant) {
+		*bit_width = num_bits(in[0]);
+		return BlockType::CONSTANT;
+	}
+
+	// Build up a histogram of bit sizes.
+	unsigned histogram[sizeof(Docid) * CHAR_BIT + 1] = { 0 };
+	unsigned max_bits = 0;
+	for (unsigned i = 0; i < num; ++i) {
+		unsigned bits = num_bits(in[i]);
+		++histogram[bits];
+		max_bits = std::max(max_bits, bits);
+	}
+
+	// Straight-up FOR.
+	unsigned best_cost = bytes_for_packed_bits(num, max_bits);
+	unsigned best_bit_width = max_bits;
+
+	// Try PFOR with bitmap exceptions.
+	const unsigned bitmap_cost = bytes_for_packed_bits(num, 1);
+	unsigned num_exceptions = 0;
+	for (unsigned exception_bit_width = 1; exception_bit_width <= max_bits; ++exception_bit_width) {  // NB: shadows the out-parameter.
+		unsigned test_bit_width = max_bits - exception_bit_width;
+		num_exceptions += histogram[test_bit_width + 1];  // Running count of values wider than test_bit_width.
+
+		// 1 byte for signaling exception bit width, then the bitmap,
+		// then the base values, then the exceptions.
+		unsigned cost = 1 + bitmap_cost + bytes_for_packed_bits(num, test_bit_width) +
+			bytes_for_packed_bits(num_exceptions, exception_bit_width);
+		if (cost < best_cost) {
+			best_cost = cost;
+			best_bit_width = test_bit_width;
+		}
+	}
+
+	// Try PFOR with varbyte exceptions.
+	bool best_is_varbyte = false;
+	for (unsigned test_bit_width = 0; test_bit_width < max_bits; ++test_bit_width) {
+		// 1 byte for signaling number of exceptions, plus the base values,
+		// and then we count up the varbytes and indexes. (This is precise
+		// but very slow.)
+		unsigned cost = 1 + bytes_for_packed_bits(num, test_bit_width);
+		for (unsigned i = 0; i < num && cost < best_cost; ++i) {  // Stops early once this width can no longer win.
+			unsigned val = in[i] >> test_bit_width;
+			if (val == 0) {
+				// Not stored, and then also no index.
+			} else if (val <= 176) {
+				cost += 2;  // Varbyte length (same thresholds as write_vb()) plus one index byte.
+			} else if (val <= 16560) {
+				cost += 3;
+			} else if (val <= 540848) {
+				cost += 4;
+			} else if (val <= 16777215) {
+				cost += 5;
+			} else {
+				cost += 6;
+			}
+		}
+		if (cost < best_cost) {
+			best_cost = cost;
+			best_bit_width = test_bit_width;
+			best_is_varbyte = true;
+		}
+	}
+
+	// TODO: Consider the last-resort option of just raw storage (255).
+
+	if (best_is_varbyte) {
+		*bit_width = best_bit_width;
+		return BlockType::PFOR_VB;
+	} else if (best_bit_width == max_bits) {  // Nothing beat plain FOR.
+		*bit_width = max_bits;
+		return BlockType::FOR;
+	} else {
+		*bit_width = best_bit_width;
+		*exception_bit_width = max_bits - best_bit_width;
+		return BlockType::PFOR_BITMAP;
+	}
+}
+
+// The basic entry point. Takes one block of integers (which already must
+// be delta-minus-1-encoded) and packs it into TurboPFor format.
+// interleaved corresponds to the interleaved parameter in decode_pfor_delta1()
+// or the “128v” infix in the reference code's function names; such formats
+// are much faster to decode, so for full blocks, you probably want it.
+// The interleaved flag isn't stored anywhere; it's implicit whether you
+// want to use it for full blocks or not.
+//
+// The first value must already be written using write_baseval() (so the delta
+// coding starts from the second value). Returns the end of the string.
+// May write 4 bytes past the end.
+template<unsigned BlockSize, class Docid>
+unsigned char *encode_pfor_single_block(const Docid *in, unsigned num, bool interleaved, unsigned char *out)
+{
+	assert(num > 0);
+	assert(!interleaved || num == BlockSize);  // Interleaving is only defined for full blocks.
+
+	// Zero-initialized: decide_block_type() sets exception_bit_width only for PFOR_BITMAP.
+	unsigned bit_width = 0, exception_bit_width = 0;
+	BlockType block_type = decide_block_type(in, num, &bit_width, &exception_bit_width);
+	*out++ = (block_type << 6) | bit_width;  // Header byte: type in the top two bits, bit width below.
+
+	switch (block_type) {
+	case BlockType::CONSTANT:
+		// bit_width is num_bits(in[0]) here; store the value and keep only the bytes it needs.
+		write_le<Docid>(in[0], out);
+		return out + div_round_up(bit_width, 8);
+	case BlockType::FOR:
+		return encode_for(in, num, bit_width, interleaved, out);
+	case BlockType::PFOR_BITMAP:
+		return encode_pfor_bitmap(in, num, bit_width, exception_bit_width, interleaved, out);
+	case BlockType::PFOR_VB:
+		return encode_pfor_vb(in, num, bit_width, interleaved, out);
+	}
+
+	// Not reached for a valid BlockType; the explicit return keeps NDEBUG builds well-defined.
+	assert(false);
+	return out;
+}
+
+#endif  // !defined(_TURBOPFOR_ENCODE_H)
diff --git a/turbopfor.h b/turbopfor.h
index 1796708..a21727a 100644
--- a/turbopfor.h
+++ b/turbopfor.h
@@ -25,6 +25,8 @@
 #include <immintrin.h>
 #endif
 
+#include "turbopfor-common.h"
+
 // Forward declarations to declare to the template code below that they exist.
 // (These must seemingly be non-templates for function multiversioning to work.)
 __attribute__((target("default")))
@@ -49,15 +51,6 @@ const unsigned char *
 decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
 #endif
 
-constexpr uint32_t mask_for_bits(unsigned bit_width)
-{
-	if (bit_width == 32) {
-		return 0xFFFFFFFF;
-	} else {
-		return (1U << bit_width) - 1;
-	}
-}
-
 template<class Docid>
 Docid read_le(const void *in)
 {
@@ -206,17 +199,6 @@ private:
 };
 #endif
 
-// Does not properly account for overflow.
-inline unsigned div_round_up(unsigned val, unsigned div)
-{
-	return (val + div - 1) / div;
-}
-
-inline unsigned bytes_for_packed_bits(unsigned num, unsigned bit_width)
-{
-	return div_round_up(num * bit_width, CHAR_BIT);
-}
-
 // Constant block. Layout:
 //
 //  - Bit width (6 bits) | type << 6
@@ -727,13 +709,6 @@ decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
 	return in;
 }
 
-enum BlockType {
-	FOR = 0,
-	PFOR_VB = 1,
-	PFOR_BITMAP = 2,
-	CONSTANT = 3
-};
-
 template<unsigned BlockSize, class Docid>
 const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, bool interleaved, Docid *out)
 {