mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Tests for crawler specialization + testdata

commit e7af77e151 (parent ec940e36d0)
Author: Viktor Lofgren, 2023-06-26 14:14:39 +02:00 (committed by Viktor)
14 changed files with 276 additions and 4 deletions


@@ -53,6 +53,8 @@ dependencies {
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:processes:test-data')
}
test {

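With `:code:processes:test-data` on the test classpath, the HTML fixtures added by this commit can be read as ordinary classpath resources. Below is a minimal sketch of such a loader; the helper class and method names are made up for illustration, only the fixture path comes from this commit:

```java
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

class TestDataLoader {
    // Hypothetical helper: reads an HTML fixture from the test-data module,
    // which sits on the test classpath via the dependency added above.
    static String loadTestData(String path) throws IOException {
        try (var is = TestDataLoader.class.getClassLoader().getResourceAsStream(path)) {
            if (is == null) throw new FileNotFoundException("Not on classpath: " + path);
            return new String(is.readAllBytes(), StandardCharsets.UTF_8);
        }
    }
}

// Usage, with a fixture path referenced later in this commit:
// String html = TestDataLoader.loadTestData("mock-crawl-data/lemmy/index.html");
```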

@@ -4,21 +4,23 @@ import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.FetchResult;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
@@ -32,6 +34,7 @@ public class CrawlerMockFetcherTest {
Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();
HttpFetcher fetcherMock = new MockFetcher();
SitemapRetriever sitemapRetriever = new SitemapRetriever();
@AfterEach
public void tearDown() {
@@ -74,6 +77,7 @@ public class CrawlerMockFetcherTest {
registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");
new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add)
.withNoDelay()

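Pulling the fragments of the hunk above together: the test registers canned lemmy pages for three startrek.website URLs and then drives the crawler against the mock fetcher. A consolidated sketch of that flow follows; the test method name is illustrative, fetcherMock and registerUrlClasspathData are the test's own members, and the withNoDelay()/fetch() chaining mirrors the calls visible elsewhere in this commit:

```java
@Test
public void crawlsMockLemmySite() throws URISyntaxException {
    List<SerializableCrawlData> out = new ArrayList<>();

    // Canned HTML from :code:processes:test-data, keyed by URL so the mock fetcher can serve it
    registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
    registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
    registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");

    // Crawl the mocked domain; no politeness delay is needed for canned data
    new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add)
            .withNoDelay()
            .fetch();

    // Print what was "crawled", as the sibling test below does
    out.stream()
            .filter(CrawledDocument.class::isInstance)
            .map(CrawledDocument.class::cast)
            .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus));
}
```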

@@ -1,12 +1,12 @@
package nu.marginalia.crawling.retreival;
import nu.marginalia.WmsaHome;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
@@ -19,24 +19,30 @@ class CrawlerRetreiverTest {
@Test
public void testEmptySet() throws IOException {
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
// Tests the case when there are no URLs provided in the crawl set and the
// crawler needs to guess the protocol
var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>());
HttpFetcher fetcher = new HttpFetcherImpl("test.marginalia.nu");
HttpFetcher fetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
List<SerializableCrawlData> data = new ArrayList<>();
new CrawlerRetreiver(fetcher, specs, data::add).fetch();
data.stream().filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
.forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));
/*
Assertions.assertTrue(
data.stream().filter(CrawledDocument.class::isInstance)
.map(CrawledDocument.class::cast)
.filter(doc -> "OK".equals(doc.crawlerStatus))
.count() > 1
);
*/
}
}

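Since this test crawls the live www.marginalia.nu site from an empty URL set, forcing the crawler to guess the protocol, the hard document-count assertion is left commented out. A sketch of a less brittle check that could be run instead (not part of this commit):

```java
// Network-tolerant check: only assert that the crawl emitted records at all,
// and report how many documents came back with an OK status.
Assertions.assertFalse(data.isEmpty(), "expected the crawl to emit at least one record");

long okDocs = data.stream()
        .filter(CrawledDocument.class::isInstance)
        .map(CrawledDocument.class::cast)
        .filter(doc -> "OK".equals(doc.crawlerStatus))
        .count();

System.out.println("Documents fetched OK: " + okDocs);
```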

@@ -0,0 +1,14 @@
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
}


@@ -0,0 +1,3 @@
## test-data
This package contains test data (captured pages from various internet software) for use in tests elsewhere in the project.

File diff suppressed because one or more lines are too long


@@ -31,6 +31,7 @@ dependencies {
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:big-string')
implementation project(':code:processes:converting-process')
implementation project(':code:process-models:converting-model')
implementation project(':code:process-models:crawling-model')
implementation project(':code:features-convert:adblock')


@@ -56,6 +56,7 @@ include 'code:common:process'
include 'code:processes:converting-process'
include 'code:processes:crawling-process'
include 'code:processes:loading-process'
include 'code:processes:test-data'
include 'code:process-models:converting-model'
include 'code:process-models:crawling-model'