1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(big-string) Remove the unused bigstring library

This commit is contained in:
Viktor Lofgren
2024-05-18 13:40:03 +02:00
parent 19163fa883
commit b867eadbef
16 changed files with 0 additions and 238 deletions

View File

@@ -13,7 +13,6 @@ java {
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:big-string')
implementation project(':code:libraries:braille-block-punch-cards')
implementation libs.bundles.slf4j

View File

@@ -2,8 +2,6 @@ package nu.marginalia.model.gson;
import com.google.gson.*;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.bigstring.CompressedBigString;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@@ -23,9 +21,6 @@ public class GsonFactory {
}
})
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
.registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
.registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
.registerTypeAdapter(CompressedBigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
.serializeSpecialFloatingPointValues()
.create();
}

View File

@@ -1,26 +0,0 @@
// Gradle build script for a Java module.
plugins {
id 'java'
}
// Compile and run with the JVM version configured once on the root project
// (the 'jvmVersion' extra property).
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
// Pull in the source-set configuration script kept in the root project directory.
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
// Dependencies of the main sources
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.lz4
// Dependencies of the test sources only
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
// Run the tests on the JUnit Platform.
test {
useJUnitPlatform()
}

View File

@@ -1,18 +0,0 @@
package nu.marginalia.bigstring;
/**
 * A string that may be held in a compressed representation.
 * <p>
 * Instances are created with {@link #encode(String)} and turned back into
 * an ordinary string with {@link #decode()}.
 */
public interface BigString {

    /**
     * Value of the boolean system property {@code bigstring.disabled}.
     * When {@code true}, {@link #encode(String)} never compresses.
     */
    boolean disableBigString = Boolean.getBoolean("bigstring.disabled");

    /**
     * Wraps a string, picking the representation based on its length.
     *
     * @param stringValue the text to wrap
     * @return a {@link PlainBigString} when compression is disabled or the
     *         text is at most 64 characters long; a
     *         {@link CompressedBigString} otherwise
     */
    static BigString encode(String stringValue) {
        if (disableBigString || stringValue.length() <= 64) {
            return new PlainBigString(stringValue);
        }
        return new CompressedBigString(stringValue);
    }

    /** @return the original string */
    String decode();

    /** @return the length of the original string, as given by {@link String#length()} */
    int length();
}

View File

@@ -1,27 +0,0 @@
package nu.marginalia.bigstring;
import java.nio.ByteBuffer;
/**
 * A {@link BigString} that keeps its text in compressed form and
 * decompresses it again on every call to {@link #decode()}.
 */
public class CompressedBigString implements BigString {
    /**
     * Number of valid bytes in {@link #encoded}. Despite the name this is the
     * size of the compressed data: it is passed to
     * {@code CompressionBuffer.decompress}, which uses it as the limit of the
     * compressed input.
     */
    private final int originalSize;
    /** Length of the original string, as given by {@link String#length()}. */
    private final int length;
    /** The compressed bytes. */
    private final ByteBuffer encoded;

    private static final CompressionBufferPool bufferPool = new CompressionBufferPool();

    /**
     * Compresses the given string using a buffer from the shared pool.
     *
     * @param stringValue the text to compress
     */
    public CompressedBigString(String stringValue) {
        encoded = bufferPool.bufferForThread().compress(stringValue);
        // compress() fills the returned buffer with a relative put and does not
        // flip it, so its position equals the number of compressed bytes.
        originalSize = encoded.position();
        length = stringValue.length();
    }

    /**
     * Decompresses the stored data.
     *
     * @return the original string
     */
    @Override
    public String decode() {
        // decompress() overwrites the position and limit of the buffer it is
        // given. The pool hands out a randomly chosen CompressionBuffer, so two
        // threads decoding this same object may hold different locks; handing
        // over a duplicate (same content, independent position/limit) keeps
        // them from trampling each other's view of the data.
        return bufferPool.bufferForThread().decompress(encoded.duplicate(), length, originalSize);
    }

    /** @return the length of the original string, as given by {@link String#length()} */
    @Override
    public int length() {
        return length;
    }
}

View File

@@ -1,67 +0,0 @@
package nu.marginalia.bigstring;
import net.jpountz.lz4.LZ4Compressor;
import net.jpountz.lz4.LZ4Factory;
import net.jpountz.lz4.LZ4FastDecompressor;
import java.nio.ByteBuffer;
/** Scratch buffer for LZ4 compression and decompression of strings.
* <p>
* Each instance owns a single direct {@link ByteBuffer} of {@code BUFFER_SIZE}
* bytes that both operations use as working space. {@link #compress} and
* {@link #decompress} are {@code synchronized}, so only one of them runs on a
* given instance at any time.
* <p>
* @see CompressionBufferPool CompressionBufferPool */
public class CompressionBuffer {
// Capacity of the scratch buffer, in bytes. During compress() it has to hold
// both the raw characters (two bytes each) and the compressed output.
private static final int BUFFER_SIZE = 8_000_000;
private final ByteBuffer buffer;
// The LZ4 factory, compressor and decompressor are static, i.e. shared by
// every CompressionBuffer instance.
private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();
private static final LZ4Compressor compressor = lz4Factory.fastCompressor();
private static final LZ4FastDecompressor decompressor = lz4Factory.fastDecompressor();
/** Allocates the direct scratch buffer. */
public CompressionBuffer() {
this.buffer = ByteBuffer.allocateDirect(BUFFER_SIZE);
}
/**
* Compresses a string.
* <p>
* The scratch buffer is split in two: the first {@code 2 * length()} bytes
* receive the string's characters, the remainder receives the compressed
* output, which is then copied into a fresh heap buffer.
* <p>
* NOTE(review): there is no explicit size check here; a string whose raw and
* compressed forms do not fit in {@code BUFFER_SIZE} bytes is not handled
* by this method.
*
* @param stringValue the string to compress
* @return a compressed version of the string in a newly allocated ByteBuffer
*         whose capacity equals the compressed size. The buffer is NOT
*         flipped: its position is left just past the last compressed byte.
*/
public synchronized ByteBuffer compress(String stringValue) {
// Number of bytes taken by the raw characters (two bytes per char)
final int splitPoint = stringValue.length() * 2;
buffer.clear();
// [0, splitPoint) holds the raw characters, the rest is room for the compressed output
var rawBuffer = buffer.slice(0, splitPoint);
var compressedBuffer = buffer.slice(splitPoint, BUFFER_SIZE - splitPoint);
rawBuffer.clear();
// Write the characters through a char view of the raw region
rawBuffer.asCharBuffer().append(stringValue);
// can't flip here because position and limit is in the CharBuffer representation
rawBuffer.position(0);
rawBuffer.limit(stringValue.length() * 2);
compressedBuffer.clear();
compressor.compress(rawBuffer, compressedBuffer);
// After the flip, limit() is the number of compressed bytes written
compressedBuffer.flip();
// Copy the result out of the shared scratch space so it survives later calls
ByteBuffer retBuffer = ByteBuffer.allocate(compressedBuffer.limit());
retBuffer.put(compressedBuffer);
return retBuffer;
}
/**
* Decompresses data produced by {@link #compress}.
*
* @param encoded the compressed data; its position and limit are overwritten
*        by this call
* @param length the length of the original string in chars; the output area
*        is limited to {@code length * 2} bytes
* @param originalSize the limit applied to {@code encoded} before
*        decompressing, i.e. the number of compressed bytes to read (despite
*        the name)
* @return the decompressed string
*/
public synchronized String decompress(ByteBuffer encoded, int length, int originalSize) {
// Output window: exactly as many bytes as the original characters occupy
buffer.position(0);
buffer.limit(length * 2);
// Input window: the compressed bytes, read from the start
encoded.position(0);
encoded.limit(originalSize);
decompressor.decompress(encoded, buffer);
// Make the freshly written bytes readable, then view them as chars
buffer.flip();
return buffer.asCharBuffer().toString();
}
}

View File

@@ -1,26 +0,0 @@
package nu.marginalia.bigstring;
import java.util.Arrays;
import java.util.concurrent.ThreadLocalRandom;
/**
 * A fixed-size pool of {@link CompressionBuffer} instances.
 * <p>
 * Spreading callers across several buffers reduces contention on any single
 * one, while creating them all up front keeps allocation churn low. A buffer
 * is picked at random every time one is requested.
 *
 * @see CompressionBuffer CompressionBuffer
 */
public class CompressionBufferPool {
    private static final int BUFFER_COUNT = 16;

    private final CompressionBuffer[] destBuffer;

    /** Creates the pool and eagerly allocates every buffer in it. */
    public CompressionBufferPool() {
        destBuffer = new CompressionBuffer[BUFFER_COUNT];
        for (int slot = 0; slot < destBuffer.length; slot++) {
            destBuffer[slot] = new CompressionBuffer();
        }
    }

    /**
     * Returns one of the pooled buffers.
     * <p>
     * Despite the name, the buffer is not bound to the calling thread: the
     * index is drawn at random on every call.
     *
     * @return a randomly selected buffer from the pool
     */
    public CompressionBuffer bufferForThread() {
        final ThreadLocalRandom random = ThreadLocalRandom.current();
        return destBuffer[random.nextInt(0, BUFFER_COUNT)];
    }
}

View File

@@ -1,19 +0,0 @@
package nu.marginalia.bigstring;
/**
 * A {@link BigString} that stores its text as an ordinary {@link String},
 * without any compression.
 */
public class PlainBigString implements BigString {
    private final String value;

    /**
     * @param value the text to hold; it is stored as-is
     */
    public PlainBigString(String value) {
        this.value = value;
    }

    /** @return the length of the stored string */
    @Override
    public int length() {
        return this.value.length();
    }

    /** @return the stored string itself */
    @Override
    public String decode() {
        return this.value;
    }
}

View File

@@ -1,27 +0,0 @@
# Big String
Microlibrary that offers string compression. This is useful when having to load tens of thousands
of HTML documents in memory during conversion. XML has been described as the opposite of a compression scheme,
and as a result, HTML compresses ridiculously well.
## Configuration
If the Java property 'bigstring.disabled' is set to true, the BigString class will not compress strings.
## Demo
```java
List<BigString> manyBigStrings = new ArrayList<>();
for (var file : files) {
// BigString.encode may or may not compress the string
// depending on its size
manyBigStrings.add(BigString.encode(readFile(file)));
}
for (var bs : manyBigStrings) {
String decompressedString = bs.decode();
byte[] bytes = bs.getBytes();
int len = bs.length();
}
```

View File

@@ -1,16 +0,0 @@
package nu.marginalia.bigstring;
import nu.marginalia.bigstring.CompressedBigString;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
/** Round-trip test for {@link CompressedBigString}. */
class CompressedBigStringTest {

    /** A string compressed by the constructor must decode back to the same text. */
    @Test
    public void testCompressDecompress() {
        final String expected = "This is a test string that is longer than 64 characters. It should be compressed.";

        final String actual = new CompressedBigString(expected).decode();

        assertEquals(expected, actual);
    }
}

View File

@@ -16,7 +16,6 @@ bad support for. It's designed to be able to easily replaced when *Java's Foreig
* [easy-lsh](easy-lsh/) is a simple locality-sensitive hash for document deduplication
* [guarded-regex](guarded-regex/) makes predicated regular expressions clearer
* [big-string](big-string/) offers seamless string compression
* [random-write-funnel](random-write-funnel/) is a tool for reducing write amplification when constructing
large files out of order.
* [next-prime](next-prime/) naive brute force prime sieve.

View File

@@ -19,7 +19,6 @@ dependencies {
implementation project(':code:common:db')
implementation project(':code:common:config')
implementation project(':code:common:process')
implementation project(':code:libraries:big-string')
implementation project(':code:index:api')
implementation project(':code:features-crawl:content-type')
implementation project(':code:libraries:language-processing')

View File

@@ -39,7 +39,6 @@ dependencies {
implementation project(':code:libraries:guarded-regex')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:big-string')
implementation project(':code:libraries:language-processing')
implementation project(':code:process-models:processed-data')

View File

@@ -27,7 +27,6 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:libraries:big-string')
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:index:api')
implementation project(':code:process-mqapi')

View File

@@ -32,7 +32,6 @@ dependencies {
implementation project(':code:common:service')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:big-string')
implementation project(':code:processes:converting-process')
implementation project(':code:process-models:crawling-model')

View File

@@ -41,7 +41,6 @@ include 'code:libraries:geo-ip'
include 'code:libraries:btree'
include 'code:libraries:easy-lsh'
include 'code:libraries:guarded-regex'
include 'code:libraries:big-string'
include 'code:libraries:random-write-funnel'
include 'code:libraries:next-prime'
include 'code:libraries:blocking-thread-pool'