mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(big-string) Remove the unused bigstring library
This commit is contained in:
@@ -13,7 +13,6 @@ java {
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:libraries:braille-block-punch-cards')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
@@ -2,8 +2,6 @@ package nu.marginalia.model.gson;
|
||||
|
||||
import com.google.gson.*;
|
||||
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
||||
import nu.marginalia.bigstring.BigString;
|
||||
import nu.marginalia.bigstring.CompressedBigString;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
@@ -23,9 +21,6 @@ public class GsonFactory {
|
||||
}
|
||||
})
|
||||
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
|
||||
.registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
|
||||
.registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
|
||||
.registerTypeAdapter(CompressedBigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
|
||||
.serializeSpecialFloatingPointValues()
|
||||
.create();
|
||||
}
|
||||
|
@@ -1,26 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.notnull
|
||||
implementation libs.lz4
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
@@ -1,18 +0,0 @@
|
||||
package nu.marginalia.bigstring;
|
||||
|
||||
public interface BigString {
|
||||
|
||||
boolean disableBigString = Boolean.getBoolean("bigstring.disabled");
|
||||
|
||||
static BigString encode(String stringValue) {
|
||||
if (!disableBigString && stringValue.length() > 64) {
|
||||
return new CompressedBigString(stringValue);
|
||||
}
|
||||
else {
|
||||
return new PlainBigString(stringValue);
|
||||
}
|
||||
}
|
||||
String decode();
|
||||
|
||||
int length();
|
||||
}
|
@@ -1,27 +0,0 @@
|
||||
package nu.marginalia.bigstring;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
public class CompressedBigString implements BigString {
|
||||
private final int originalSize;
|
||||
private final int length;
|
||||
private final ByteBuffer encoded;
|
||||
|
||||
private final static CompressionBufferPool bufferPool = new CompressionBufferPool();
|
||||
|
||||
public CompressedBigString(String stringValue) {
|
||||
encoded = bufferPool.bufferForThread().compress(stringValue);
|
||||
originalSize = encoded.position();
|
||||
length = stringValue.length();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String decode() {
|
||||
return bufferPool.bufferForThread().decompress(encoded, length, originalSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
return length;
|
||||
}
|
||||
}
|
@@ -1,67 +0,0 @@
|
||||
package nu.marginalia.bigstring;
|
||||
|
||||
import net.jpountz.lz4.LZ4Compressor;
|
||||
import net.jpountz.lz4.LZ4Factory;
|
||||
import net.jpountz.lz4.LZ4FastDecompressor;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
/** Buffers for compression and decompression of strings.
|
||||
* Operations are synchronized on the buffers.
|
||||
* <p>
|
||||
* @see CompressionBufferPool CompressionBufferPool */
|
||||
public class CompressionBuffer {
|
||||
private static final int BUFFER_SIZE = 8_000_000;
|
||||
private final ByteBuffer buffer;
|
||||
|
||||
private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();
|
||||
private static final LZ4Compressor compressor = lz4Factory.fastCompressor();
|
||||
private static final LZ4FastDecompressor decompressor = lz4Factory.fastDecompressor();
|
||||
|
||||
|
||||
public CompressionBuffer() {
|
||||
this.buffer = ByteBuffer.allocateDirect(BUFFER_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param stringValue the string to compress
|
||||
* @return a compressed version of the string in a newly allocated ByteBuffer
|
||||
*/
|
||||
public synchronized ByteBuffer compress(String stringValue) {
|
||||
final int splitPoint = stringValue.length() * 2;
|
||||
|
||||
buffer.clear();
|
||||
|
||||
var rawBuffer = buffer.slice(0, splitPoint);
|
||||
var compressedBuffer = buffer.slice(splitPoint, BUFFER_SIZE - splitPoint);
|
||||
|
||||
rawBuffer.clear();
|
||||
rawBuffer.asCharBuffer().append(stringValue);
|
||||
|
||||
// can't flip here because position and limit is in the CharBuffer representation
|
||||
rawBuffer.position(0);
|
||||
rawBuffer.limit(stringValue.length() * 2);
|
||||
|
||||
compressedBuffer.clear();
|
||||
compressor.compress(rawBuffer, compressedBuffer);
|
||||
compressedBuffer.flip();
|
||||
|
||||
ByteBuffer retBuffer = ByteBuffer.allocate(compressedBuffer.limit());
|
||||
retBuffer.put(compressedBuffer);
|
||||
return retBuffer;
|
||||
}
|
||||
|
||||
public synchronized String decompress(ByteBuffer encoded, int length, int originalSize) {
|
||||
buffer.position(0);
|
||||
buffer.limit(length * 2);
|
||||
|
||||
encoded.position(0);
|
||||
encoded.limit(originalSize);
|
||||
|
||||
decompressor.decompress(encoded, buffer);
|
||||
|
||||
buffer.flip();
|
||||
|
||||
return buffer.asCharBuffer().toString();
|
||||
}
|
||||
}
|
@@ -1,26 +0,0 @@
|
||||
package nu.marginalia.bigstring;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
/** To avoid contention on the compression buffer, while keeping allocation churn low,
|
||||
* we use a pool of buffers, randomly selected allocated upon invocation
|
||||
* <p>
|
||||
* @see CompressionBuffer CompressionBuffer
|
||||
* */
|
||||
public class CompressionBufferPool {
|
||||
private static final int BUFFER_COUNT = 16;
|
||||
private final CompressionBuffer[] destBuffer;
|
||||
|
||||
public CompressionBufferPool() {
|
||||
destBuffer = new CompressionBuffer[BUFFER_COUNT];
|
||||
Arrays.setAll(destBuffer, i -> new CompressionBuffer());
|
||||
}
|
||||
|
||||
/** Get the buffer for the current thread */
|
||||
public CompressionBuffer bufferForThread() {
|
||||
int idx = ThreadLocalRandom.current().nextInt(0, BUFFER_COUNT);
|
||||
|
||||
return destBuffer[idx];
|
||||
}
|
||||
}
|
@@ -1,19 +0,0 @@
|
||||
package nu.marginalia.bigstring;
|
||||
|
||||
public class PlainBigString implements BigString {
|
||||
private final String value;
|
||||
|
||||
public PlainBigString(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String decode() {
|
||||
return value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
return value.length();
|
||||
}
|
||||
}
|
@@ -1,27 +0,0 @@
|
||||
# Big String
|
||||
|
||||
Microlibrary that offers string compression. This is useful when having to load tens of thousands
|
||||
of HTML documents in memory during conversion. XML has been described as the opposite of a compression scheme,
|
||||
and as a result, HTML compresses ridiculously well.
|
||||
|
||||
## Configuration
|
||||
|
||||
If the Java property 'bigstring.disabled' is set to true, the BigString class will not compress strings.
|
||||
|
||||
## Demo
|
||||
|
||||
```java
|
||||
List<BigString> manyBigStrings = new ArrayList<>();
|
||||
|
||||
for (var file : files) {
|
||||
// BigString.encode may or may not compress the string
|
||||
// depending on its size
|
||||
manyBigStrings.add(BigString.encode(readFile(file)));
|
||||
}
|
||||
|
||||
for (var bs : manyBigStrings) {
|
||||
String decompressedString = bs.decode();
|
||||
byte[] bytes = bs.decode().getBytes(StandardCharsets.UTF_8);
|
||||
int len = bs.length();
|
||||
}
|
||||
```
|
@@ -1,16 +0,0 @@
|
||||
package nu.marginalia.bigstring;
|
||||
|
||||
import nu.marginalia.bigstring.CompressedBigString;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class CompressedBigStringTest {
|
||||
|
||||
@Test
|
||||
public void testCompressDecompress() {
|
||||
String testString = "This is a test string that is longer than 64 characters. It should be compressed.";
|
||||
var bigString = new CompressedBigString(testString);
|
||||
assertEquals(testString, bigString.decode());
|
||||
}
|
||||
}
|
@@ -16,7 +16,6 @@ bad support for. It's designed to be able to easily replaced when *Java's Foreig
|
||||
|
||||
* [easy-lsh](easy-lsh/) is a simple locality-sensitive hash for document deduplication
|
||||
* [guarded-regex](guarded-regex/) makes predicated regular expressions clearer
|
||||
* [big-string](big-string/) offers seamless string compression
|
||||
* [random-write-funnel](random-write-funnel/) is a tool for reducing write amplification when constructing
|
||||
large files out of order.
|
||||
* [next-prime](next-prime/) naive brute force prime sieve.
|
||||
|
@@ -19,7 +19,6 @@ dependencies {
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:process')
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:index:api')
|
||||
implementation project(':code:features-crawl:content-type')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
|
@@ -39,7 +39,6 @@ dependencies {
|
||||
implementation project(':code:libraries:guarded-regex')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
|
||||
implementation project(':code:process-models:processed-data')
|
||||
|
@@ -27,7 +27,6 @@ dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
implementation project(':code:index:api')
|
||||
implementation project(':code:process-mqapi')
|
||||
|
@@ -32,7 +32,6 @@ dependencies {
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:libraries:big-string')
|
||||
implementation project(':code:processes:converting-process')
|
||||
implementation project(':code:process-models:crawling-model')
|
||||
|
||||
|
@@ -41,7 +41,6 @@ include 'code:libraries:geo-ip'
|
||||
include 'code:libraries:btree'
|
||||
include 'code:libraries:easy-lsh'
|
||||
include 'code:libraries:guarded-regex'
|
||||
include 'code:libraries:big-string'
|
||||
include 'code:libraries:random-write-funnel'
|
||||
include 'code:libraries:next-prime'
|
||||
include 'code:libraries:blocking-thread-pool'
|
||||
|
Reference in New Issue
Block a user