1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

4 Commits

Author SHA1 Message Date
Viktor Lofgren
567e4e1237 (crawler) Fast detection and bail-out for crawler traps
Improve logging and exclude robots.txt from this logic.
2025-01-18 15:28:54 +01:00
Viktor Lofgren
4342e42722 (crawler) Fast detection and bail-out for crawler traps
Nephentes has been doing the rounds in social media, adding an easy detection and mitigation mechanism for this type of trap, as sadly not all webmasters set up their robots.txt correctly.  Out of the box crawl limits will also deal with this type of attack, but this fix is faster.
2025-01-17 13:02:57 +01:00
Viktor Lofgren
bc818056e6 (run) Fix templates for mariadb
Apparently the docker image contract changed at some point, and now we should spawn mariadbd and not mysqld; mariadb-admin and not mysqladmin.
2025-01-16 15:27:02 +01:00
Viktor Lofgren
de2feac238 (chore) Upgrade jib from 3.4.3 to 3.4.4 2025-01-16 15:10:45 +01:00
6 changed files with 30 additions and 9 deletions

View File

@@ -47,7 +47,7 @@ ext {
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
dockerImageTag='latest'
dockerImageRegistry='marginalia'
jibVersion = '3.4.3'
jibVersion = '3.4.4'
}

View File

@@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.time.Instant;
import java.util.*;
@@ -89,6 +90,7 @@ public class WarcRecorder implements AutoCloseable {
var call = client.newCall(request);
cookieInformation.update(client, request.url());
try (var response = call.execute();
@@ -167,6 +169,25 @@ public class WarcRecorder implements AutoCloseable {
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(warcRequest);
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
&& inputBuffer.size() < 2048
&& !request.url().encodedPath().endsWith("robots.txt")) // don't bail on robots.txt
{
// Fast detection and mitigation of crawler traps that respond with slow
// small responses, with a high branching factor
// Note we bail *after* writing the warc records, this will effectively only
// prevent link extraction from the document.
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
requestUri,
Duration.between(date, Instant.now()).getSeconds(),
inputBuffer.size()
);
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
}
return new HttpFetchResult.ResultOk(responseUri,
response.code(),
inputBuffer.headers(),

View File

@@ -72,11 +72,11 @@ services:
image: "mariadb:lts"
container_name: "mariadb"
env_file: "${INSTALL_DIR}/env/mariadb.env"
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
ports:
- "127.0.0.1:3306:3306/tcp"
healthcheck:
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
start_period: 5s
interval: 5s
timeout: 5s

View File

@@ -103,11 +103,11 @@ services:
image: "mariadb:lts"
container_name: "mariadb"
env_file: "${INSTALL_DIR}/env/mariadb.env"
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
ports:
- "127.0.0.1:3306:3306/tcp"
healthcheck:
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
start_period: 5s
interval: 5s
timeout: 5s

View File

@@ -129,11 +129,11 @@ services:
image: "mariadb:lts"
container_name: "mariadb"
env_file: "${INSTALL_DIR}/env/mariadb.env"
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
ports:
- "127.0.0.1:3306:3306/tcp"
healthcheck:
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
start_period: 5s
interval: 5s
timeout: 5s

View File

@@ -3,11 +3,11 @@ services:
image: "mariadb:lts"
container_name: "mariadb"
env_file: "${INSTALL_DIR}/env/mariadb.env"
command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
ports:
- "127.0.0.1:3306:3306/tcp"
healthcheck:
test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
start_period: 5s
interval: 5s
timeout: 5s