mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(index) Fix broken test case in the "slow" collection
This commit is contained in:
@@ -4,21 +4,24 @@ package nu.marginalia.converting;
|
|||||||
import com.google.inject.Guice;
|
import com.google.inject.Guice;
|
||||||
import com.google.inject.Injector;
|
import com.google.inject.Injector;
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
import nu.marginalia.converting.processor.DomainProcessor;
|
import nu.marginalia.converting.processor.DomainProcessor;
|
||||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||||
import nu.marginalia.model.DocumentFormat;
|
import nu.marginalia.model.DocumentFormat;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||||
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
import nu.marginalia.model.crawldata.CrawledDomain;
|
import nu.marginalia.model.crawldata.CrawledDomain;
|
||||||
import nu.marginalia.model.crawldata.SerializableCrawlData;
|
import nu.marginalia.model.crawldata.SerializableCrawlData;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.time.LocalTime;
|
import java.time.LocalTime;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
@@ -28,10 +31,10 @@ import static org.junit.jupiter.api.Assertions.*;
|
|||||||
@Tag("slow")
|
@Tag("slow")
|
||||||
public class ConvertingIntegrationTest {
|
public class ConvertingIntegrationTest {
|
||||||
|
|
||||||
private DomainProcessor domainProcessor;
|
private static DomainProcessor domainProcessor;
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeAll
|
||||||
public void setUp() {
|
public static void setUp() {
|
||||||
Injector injector = Guice.createInjector(
|
Injector injector = Guice.createInjector(
|
||||||
new ConvertingIntegrationTestModule()
|
new ConvertingIntegrationTestModule()
|
||||||
);
|
);
|
||||||
@@ -51,6 +54,25 @@ public class ConvertingIntegrationTest {
|
|||||||
assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
|
assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
|
||||||
assertTrue(ret.documents.isEmpty());
|
assertTrue(ret.documents.isEmpty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBuggyCase() throws IOException {
|
||||||
|
|
||||||
|
// Test used to inspect processing of crawl data, change path below to use
|
||||||
|
Path problemCase = Path.of("/home/vlofgren/TestEnv/index-1/storage/crawl-data__25-09-15T16_33_57.245/46/64/4664ef43-blog.fermi.chat.slop.zip");
|
||||||
|
if (!Files.exists(problemCase))
|
||||||
|
return;
|
||||||
|
|
||||||
|
ProcessedDomain result = domainProcessor.fullProcessing(SerializableCrawlDataStream.openDataStream(problemCase));
|
||||||
|
for (ProcessedDocument doc : result.documents) {
|
||||||
|
System.out.println(doc.url);
|
||||||
|
if (doc.details == null) continue;
|
||||||
|
|
||||||
|
System.out.println(doc.details.features);
|
||||||
|
System.out.println(HtmlFeature.encode(doc.details.features) & HtmlFeature.AFFILIATE_LINK.getFeatureBit());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
|
public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
|
||||||
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
|
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
|
||||||
|
@@ -4,6 +4,7 @@ import com.google.inject.AbstractModule;
|
|||||||
import com.google.inject.name.Names;
|
import com.google.inject.name.Names;
|
||||||
import nu.marginalia.LanguageModels;
|
import nu.marginalia.LanguageModels;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.api.domsample.DomSampleClient;
|
||||||
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
||||||
import nu.marginalia.process.ProcessConfiguration;
|
import nu.marginalia.process.ProcessConfiguration;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
@@ -23,5 +24,8 @@ public class ConvertingIntegrationTestModule extends AbstractModule {
|
|||||||
));
|
));
|
||||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||||
bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class));
|
bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class));
|
||||||
|
|
||||||
|
DomSampleClient domSampleClientMock = Mockito.mock(DomSampleClient.class);
|
||||||
|
bind(DomSampleClient.class).toInstance(domSampleClientMock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user