1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00
Files
MarginaliaSearch/code/common/model/java/nu/marginalia/model/idx/DocumentMetadata.java
Viktor Lofgren 0929d77247 (chore) Remove vestigial Serializable annotation from a few core models
Java serialization was briefly considered a long while ago, but it's a silly and ancient API and not something we want to use.
2025-09-24 10:42:10 +02:00

167 lines
5.9 KiB
Java

package nu.marginalia.model.idx;
import nu.marginalia.model.crawl.PubDate;
import java.util.EnumSet;
import java.util.Set;
import static java.lang.Math.max;
import static java.lang.Math.min;
/** Document level metadata designed to fit in a single 64 bit long.
 * <p>
 * Bit layout of the encoded long, from the least significant bit:
 * <pre>
 * bits  0- 7 : flags
 * bits  8-11 : quality                  (12-15 unused)
 * bits 16-19 : sets                     (20-23 unused)
 * bits 24-31 : year
 * bits 32-39 : topology
 * bits 40-47 : encoded domain size
 * bits 48-55 : rank
 * bits 56-57 : average sentence length  (58-63 unused)
 * </pre>
 *
 * @param avgSentLength average sentence length
 * @param rank domain ranking
 * @param encDomainSize encoded number of documents in the domain
 * @param topology a measure of how important the document is
 * @param year encoded publishing year
 * @param sets bit mask for search sets
 * @param quality quality of the document (0-15); 0 is best, 15 is worst
 * @param flags flags (see {@link DocumentFlags})
 */
public record DocumentMetadata(int avgSentLength,
                               int rank,
                               int encDomainSize,
                               int topology,
                               int year,
                               int sets,
                               int quality,
                               byte flags)
{
    // The masks are declared as longs so that the min()/max() clamping in encode()
    // resolves to the long overloads and the shifted results are computed in
    // 64-bit arithmetic rather than truncated to int.
    public static final long ASL_MASK = 0x03L;
    public static final int ASL_SHIFT = 56;

    public static final long RANK_MASK = 0xFFL;
    public static final int RANK_SHIFT = 48;

    public static final long ENC_DOMAIN_SIZE_MASK = 0xFFL;
    public static final int ENC_DOMAIN_SIZE_SHIFT = 40;
    /** The stored domain size is the actual size divided by this factor. */
    public static final int ENC_DOMAIN_SIZE_MULTIPLIER = 5;

    public static final long TOPOLOGY_MASK = 0xFFL;
    public static final int TOPOLOGY_SHIFT = 32;

    public static final long YEAR_MASK = 0xFFL;
    public static final int YEAR_SHIFT = 24;

    public static final long SETS_MASK = 0xFL;
    public static final int SETS_SHIFT = 16;

    public static final long QUALITY_MASK = 0xFL;
    public static final int QUALITY_SHIFT = 8;

    /** The encoded form of an all-zero (empty) metadata value. */
    public static long defaultValue() {
        return 0L;
    }

    /** Creates an empty metadata record, equivalent to decoding {@link #defaultValue()}. */
    public DocumentMetadata() {
        this(defaultValue());
    }

    /** Convenience constructor for the fields known at document processing time;
     * the domain-level fields (rank, size, topology, sets) are left at zero. */
    public DocumentMetadata(int avgSentLength, int year, int quality, EnumSet<DocumentFlags> flags) {
        this(avgSentLength, 0, 0, 0, year, 0, quality, encodeFlags(flags));
    }

    /** Decodes a previously encoded metadata long into its component fields.
     * Inverse of {@link #encode()}. */
    public DocumentMetadata(long value) {
        this(
                (int) ((value >>> ASL_SHIFT) & ASL_MASK),
                (int) ((value >>> RANK_SHIFT) & RANK_MASK),
                (int) ((value >>> ENC_DOMAIN_SIZE_SHIFT) & ENC_DOMAIN_SIZE_MASK),
                (int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK),
                (int) ((value >>> YEAR_SHIFT) & YEAR_MASK),
                (int) ((value >>> SETS_SHIFT) & SETS_MASK),
                (int) ((value >>> QUALITY_SHIFT) & QUALITY_MASK),
                (byte) (value & 0xFF)
        );
    }

    /** Returns a copy of this record with the domain size and topology fields replaced.
     *
     * @param size actual domain size; stored divided by {@link #ENC_DOMAIN_SIZE_MULTIPLIER},
     *             clamped to the range [1, {@link #ENC_DOMAIN_SIZE_MASK}]
     * @param topology the new topology value
     */
    public DocumentMetadata withSizeAndTopology(int size, int topology) {
        // Uses the statically imported min/max for consistency with encode()
        final int encSize = (int) min(ENC_DOMAIN_SIZE_MASK, max(1, size / ENC_DOMAIN_SIZE_MULTIPLIER));
        return new DocumentMetadata(avgSentLength, rank, encSize, topology, year, sets, quality, flags);
    }

    /** Folds a set of flags into a single byte-sized bit mask. */
    private static byte encodeFlags(Set<DocumentFlags> flags) {
        byte ret = 0;
        for (var flag : flags) { ret |= flag.asBit(); }
        return ret;
    }

    /** Returns true if the given flag is set on this document. */
    public boolean hasFlag(DocumentFlags flag) {
        return (flags & flag.asBit()) != 0;
    }

    /** Tests whether all flag bits in {@code metadataBitMask} are set in the encoded
     * value, without decoding the full record.  Only the low flags byte is consulted. */
    public static boolean hasFlags(long encoded, long metadataBitMask) {
        return ((encoded & 0xFF) & metadataBitMask) == metadataBitMask;
    }

    /** Packs the record into a single long.  Each field is clamped to the range its
     * bit field can represent, so out-of-range values saturate rather than corrupt
     * neighboring fields. */
    public long encode() {
        long ret = 0;
        ret |= Byte.toUnsignedLong(flags);
        ret |= min(QUALITY_MASK, max(0, quality)) << QUALITY_SHIFT;
        ret |= min(SETS_MASK, max(0, sets)) << SETS_SHIFT;
        ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT;
        ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT;
        ret |= min(ENC_DOMAIN_SIZE_MASK, max(0, encDomainSize)) << ENC_DOMAIN_SIZE_SHIFT;
        ret |= min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
        ret |= min(ASL_MASK, max(0, avgSentLength)) << ASL_SHIFT;
        return ret;
    }

    /** True if every field is zero, i.e. the record carries no information. */
    public boolean isEmpty() {
        return avgSentLength == 0 && encDomainSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0 && rank == 0;
    }

    public static int decodeQuality(long encoded) {
        return (int) ((encoded >>> QUALITY_SHIFT) & QUALITY_MASK);
    }

    public static int decodeTopology(long encoded) {
        return (int) ((encoded >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK);
    }

    public static int decodeAvgSentenceLength(long encoded) {
        return (int) ((encoded >>> ASL_SHIFT) & ASL_MASK);
    }

    /** Extracts the year field and converts it from year-byte form via {@link PubDate}. */
    public static int decodeYear(long encoded) {
        return PubDate.fromYearByte((int) ((encoded >>> YEAR_SHIFT) & YEAR_MASK));
    }

    /** The approximate domain size, reversing the division applied at encoding time. */
    public int size() {
        return ENC_DOMAIN_SIZE_MULTIPLIER * encDomainSize;
    }

    /** Extracts the approximate domain size directly from an encoded value. */
    public static int decodeSize(long encoded) {
        return ENC_DOMAIN_SIZE_MULTIPLIER * (int) ((encoded >>> ENC_DOMAIN_SIZE_SHIFT) & ENC_DOMAIN_SIZE_MASK);
    }

    public static int decodeRank(long encoded) {
        return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
    }

    /** ORs a clamped rank into an encoded value.
     * NOTE(review): this assumes the rank bits of {@code encoded} are currently
     * zero; pre-existing rank bits are not cleared first — confirm with callers. */
    public static long encodeRank(long encoded, int rank) {
        return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
    }

    /** The flags byte expanded into an {@link EnumSet} of {@link DocumentFlags}. */
    public EnumSet<DocumentFlags> flagSet() {
        return DocumentFlags.decode(flags);
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder(getClass().getSimpleName());
        sb.append('[')
                .append("avgSentL=").append(avgSentLength).append(", ")
                .append("rank=").append(rank).append(", ")
                .append("domainSize=").append(size()).append(", ")
                .append("topology=").append(topology).append(", ")
                .append("year=").append(PubDate.fromYearByte(year)).append(", ")
                .append("sets=").append(sets).append(", ")
                .append("quality=").append(quality).append(", ")
                .append("flags=").append(flagSet()).append("]");
        return sb.toString();
    }
}