Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-05 21:22:39 +02:00)

Tests for crawler specialization + testdata
@@ -53,6 +53,8 @@ dependencies {
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
+
+   testImplementation project(':code:processes:test-data')
}

test {
@@ -4,21 +4,23 @@ import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.FetchResult;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
@@ -32,6 +34,7 @@ public class CrawlerMockFetcherTest {

    Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();
    HttpFetcher fetcherMock = new MockFetcher();
    SitemapRetriever sitemapRetriever = new SitemapRetriever();

    @AfterEach
    public void tearDown() {
@@ -74,6 +77,7 @@ public class CrawlerMockFetcherTest {

    registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
    registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
    registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");

    new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add)
            .withNoDelay()
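The test above wires canned HTML from the test-data module into a mock fetcher, then runs CrawlerRetreiver against it without touching the network. The diff doesn't show the helper's body; a minimal sketch of what registerUrlClasspathData could look like (plus a java.util.Objects import), assuming CrawledDocument exposes a Lombok builder with roughly these field names and that MockFetcher serves whatever is registered in mockData:

    @SneakyThrows
    private void registerUrlClasspathData(EdgeUrl url, String path) {
        // Load the canned HTML shipped in code:processes:test-data
        try (var in = Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream(path))) {
            var body = new String(in.readAllBytes(), StandardCharsets.UTF_8);

            // Hypothetical builder fields; the real CrawledDocument may differ
            mockData.put(url, CrawledDocument.builder()
                    .url(url.toString())
                    .contentType("text/html")
                    .httpStatus(200)
                    .crawlerStatus(CrawlerDocumentStatus.OK.name())
                    .documentBody(BigString.encode(body))
                    .build());
        }
    }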
@@ -1,12 +1,12 @@
|
||||
package nu.marginalia.crawling.retreival;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
|
||||
import nu.marginalia.crawling.model.SerializableCrawlData;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@@ -19,24 +19,30 @@ class CrawlerRetreiverTest {

    @Test
    public void testEmptySet() throws IOException {
        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
        // Tests the case when there are no URLs provided in the crawl set and the
        // crawler needs to guess the protocol

        var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>());

-       HttpFetcher fetcher = new HttpFetcherImpl("test.marginalia.nu");
+       HttpFetcher fetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");

        List<SerializableCrawlData> data = new ArrayList<>();

        new CrawlerRetreiver(fetcher, specs, data::add).fetch();

        data.stream().filter(CrawledDocument.class::isInstance)
                .map(CrawledDocument.class::cast)
                .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));

        /*
        Assertions.assertTrue(
                data.stream().filter(CrawledDocument.class::isInstance)
                        .map(CrawledDocument.class::cast)
                        .filter(doc -> "OK".equals(doc.crawlerStatus))
                        .count() > 1
        );
        */
    }

}
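The Tag import suggests these tests hit the live network and are meant to be excluded from routine builds. A hedged sketch of how the module's build.gradle could filter them via JUnit 5's tag support, assuming the tag is named "slow":

    test {
        useJUnitPlatform {
            excludeTags 'slow'  // assumed tag name; match whatever @Tag the test class declares
        }
    }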
14  code/processes/test-data/build.gradle  Normal file
@@ -0,0 +1,14 @@
+plugins {
+    id 'java'
+    id "io.freefair.lombok" version "5.3.3.3"
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(17))
+    }
+}
+
+
+dependencies {
+}
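The module intentionally declares no dependencies of its own; it exists purely to put test resources on consumers' classpaths. Usage mirrors the crawling-process build change earlier in this commit:

    dependencies {
        // makes the test-data resources visible on this module's test classpath
        testImplementation project(':code:processes:test-data')
    }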
3   code/processes/test-data/readme.md  Normal file
@@ -0,0 +1,3 @@
+## test-data
+
+This package contains test data for various internet software.
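Consumers read the files as ordinary classpath resources. A minimal sketch, assuming the mock-crawl-data/lemmy layout referenced by CrawlerMockFetcherTest above sits under this module's resources directory:

    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.util.Objects;

    class TestDataExample {
        // Loads one of the Lemmy pages added in this commit from the classpath
        static String loadLemmyIndex() throws IOException {
            try (var in = Objects.requireNonNull(
                    TestDataExample.class.getClassLoader()
                            .getResourceAsStream("mock-crawl-data/lemmy/index.html"))) {
                return new String(in.readAllBytes(), StandardCharsets.UTF_8);
            }
        }
    }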
File diff suppressed because one or more lines are too long
@@ -31,6 +31,7 @@ dependencies {
    implementation project(':code:libraries:term-frequency-dict')
    implementation project(':code:libraries:big-string')
    implementation project(':code:processes:converting-process')
    implementation project(':code:process-models:converting-model')
    implementation project(':code:process-models:crawling-model')

    implementation project(':code:features-convert:adblock')
@@ -56,6 +56,7 @@ include 'code:common:process'
include 'code:processes:converting-process'
include 'code:processes:crawling-process'
include 'code:processes:loading-process'
+include 'code:processes:test-data'

include 'code:process-models:converting-model'
include 'code:process-models:crawling-model'