1
1
mirror of http://git.sesse.net/plocate synced 2025-10-06 02:02:45 +02:00
Files
plocate/database-builder.h
Steinar H. Gunderson 2ec46dcd3e Add missing unistd.h #include.
Adapted from the Void Linux repository, for musl compatibility.
2021-08-23 17:54:57 +02:00

95 lines
2.3 KiB
C++

#ifndef _DATABASE_BUILDER_H
#define _DATABASE_BUILDER_H 1
#include "db.h"
#include <chrono>
#include <fcntl.h>
#include <memory>
#include <random>
#include <stddef.h>
#include <string>
#include <unistd.h>
#include <utility>
#include <vector>
#include <zstd.h>
class PostingListBuilder;
// {0,0} means unknown or so current that it should never match.
// {-1,0} means it's not a directory.
struct dir_time {
int64_t sec;
int32_t nsec;
bool operator<(const dir_time &other) const
{
if (sec != other.sec)
return sec < other.sec;
return nsec < other.nsec;
}
bool operator>=(const dir_time &other) const
{
return !(other < *this);
}
};
constexpr dir_time unknown_dir_time{ 0, 0 };
constexpr dir_time not_a_dir{ -1, 0 };
class DatabaseReceiver {
public:
virtual ~DatabaseReceiver() = default;
virtual void add_file(std::string filename, dir_time dt) = 0;
virtual void flush_block() = 0;
virtual void finish() { flush_block(); }
// EncodingCorpus only.
virtual size_t num_files_seen() const { return -1; }
};
class DictionaryBuilder : public DatabaseReceiver {
public:
DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
: blocks_to_keep(blocks_to_keep), block_size(block_size) {}
void add_file(std::string filename, dir_time dt) override;
void flush_block() override;
std::string train(size_t buf_size);
private:
const size_t blocks_to_keep, block_size;
std::string current_block;
uint64_t block_num = 0;
size_t num_files_in_block = 0;
std::mt19937 reservoir_rand{ 1234 }; // Fixed seed for reproducibility.
bool keep_current_block = true;
int64_t slot_for_current_block = -1;
std::vector<std::string> sampled_blocks;
std::vector<size_t> lengths;
};
class EncodingCorpus;
class DatabaseBuilder {
public:
DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary, bool check_visibility);
DatabaseReceiver *start_corpus(bool store_dir_times);
void set_next_dictionary(std::string next_dictionary);
void set_conf_block(std::string conf_block);
void finish_corpus();
private:
FILE *outfp;
std::string outfile;
std::string temp_filename;
Header hdr;
const int block_size;
std::chrono::steady_clock::time_point corpus_start;
EncodingCorpus *corpus = nullptr;
ZSTD_CDict *cdict = nullptr;
std::string next_dictionary, conf_block;
};
#endif // !defined(_DATABASE_BUILDER_H)