1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

40 Commits

Author SHA1 Message Date
Viktor Lofgren
4a8a028118 (deploy) Deploy assistant and browserless 2025-05-28 15:50:26 +02:00
Viktor
a25bc647be Merge pull request #201 from MarginaliaSearch/website-capture
Capture website snapshots
2025-05-28 15:49:03 +02:00
Viktor Lofgren
a720dba3a2 (deploy) Add browserless to deploy script 2025-05-28 15:48:32 +02:00
Viktor Lofgren
284f382867 (dom-sample) Fix initialization to work the same as screenshot capture 2025-05-28 15:40:09 +02:00
Viktor Lofgren
a80717f138 (dom-sample) Cleanup 2025-05-28 15:32:54 +02:00
Viktor Lofgren
d6da715fa4 (dom-sample) Add basic retrieval logic
First iteration is single threaded for simplicity
2025-05-28 15:18:15 +02:00
Viktor Lofgren
c1ec7aa491 (dom-sample) Add a boolean to the sample db when we've accepted a cookie dialogue 2025-05-28 14:45:19 +02:00
Viktor Lofgren
3daf37e283 (dom-sample) Improve storage of DOM sample data 2025-05-28 14:34:34 +02:00
Viktor Lofgren
44a774d3a8 (browserless) Add --pull option to Docker build command
This ensures we fetch the latest base image when we build.
2025-05-28 14:09:32 +02:00
Viktor Lofgren
597aeaf496 (website-capture) Correct manifest
run_at is set at the content_script level, not the root object.
2025-05-28 14:05:16 +02:00
Viktor Lofgren
06df7892c2 (website-capture) Clean up code 2025-05-27 15:56:59 +02:00
Viktor Lofgren
dc26854268 (website-capture) Add a marker to the network log when we've accepted a cookie dialog 2025-05-27 15:21:02 +02:00
Viktor Lofgren
9f16326cba (website-capture) Add logic that automatically identifies and agrees to cookie consent popovers
Oftentimes, ads don't load until after you've agreed to the popover.
2025-05-27 15:11:47 +02:00
Viktor Lofgren
ed66d0b3a7 (website-capture) Amend the extension to also capture web request information 2025-05-26 14:00:43 +02:00
Viktor Lofgren
c3afc82dad (website-capture) Rename scripts to be more consistent with extension terminology 2025-05-26 13:13:11 +02:00
Viktor Lofgren
08e25e539e (website-capture) Minor cleanups 2025-05-21 14:55:03 +02:00
Viktor Lofgren
4946044dd0 (website-capture) Update BrowserlessClient to use the new image 2025-05-21 14:14:18 +02:00
Viktor Lofgren
edf382e1c5 (website-capture) Add a custom docker image with a new custom extension for DOM capture
The original approach of injecting javascript into the page directly didn't work with pages that reloaded themselves.  To work around this, a chrome extension is used instead that does the same work, but subscribes to reload events and re-installs the change listener.
2025-05-21 14:13:54 +02:00
Viktor Lofgren
644cba32e4 (website-capture) Remove dead imports 2025-05-20 16:08:48 +02:00
Viktor Lofgren
34b76390b2 (website-capture) Add storage object for DOM samples 2025-05-20 16:05:54 +02:00
Viktor Lofgren
43cd507971 (crawler) Add a migration workaround so we can still open old slop crawl data with the new column added 2025-05-19 14:47:38 +02:00
Viktor Lofgren
cc40e99fdc (crawler) Add a migration workaround so we can still open old slop crawl data with the new column added 2025-05-19 14:37:59 +02:00
Viktor Lofgren
8a944cf4c6 (crawler) Add request time to crawl data
This is an interesting indicator of website quality.
2025-05-19 14:07:41 +02:00
Viktor Lofgren
1c128e6d82 (crawler) Add request time to crawl data
This is an interesting indicator of website quality.
2025-05-19 14:02:03 +02:00
Viktor Lofgren
be039d1a8c (live-capture) Add a new function for capturing the DOM of a website after rendering
The new code injects a javascript that attempts to trigger popovers, and then alters the DOM to add attributes containing CSS elements with position and visibility.
2025-05-19 13:26:07 +02:00
Viktor Lofgren
4edc0d3267 (converter) Increase work buffer for converter
Conversion on index node  7 in production is crashing ostensibly because this buffer is too small.
2025-05-18 13:22:44 +02:00
Viktor Lofgren
890f521d0d (pdf) Fix crash for some bold lines 2025-05-18 13:05:05 +02:00
Viktor Lofgren
b1814a30f7 (deploy) Redeploy all services. 2025-05-17 13:11:51 +02:00
Viktor Lofgren
f59a9eb025 (legacy-search) Soften domain limit constraints in URL deduplication 2025-05-17 00:04:27 +02:00
Viktor Lofgren
599534806b (search) Soften domain limit constraints in URL deduplication 2025-05-17 00:00:42 +02:00
Viktor Lofgren
7e8253dac7 (search) Clean up debug logging 2025-05-17 00:00:28 +02:00
Viktor Lofgren
97a6780ea3 (search) Add debug logging for specific query 2025-05-16 23:41:35 +02:00
Viktor Lofgren
eb634beec8 (search) Add debug logging for specific query 2025-05-16 23:34:03 +02:00
Viktor Lofgren
269ebd1654 Revert "(query) Add debug logging for specific query"
This reverts commit 39ce40bfeb.
2025-05-16 23:29:06 +02:00
Viktor Lofgren
39ce40bfeb (query) Add debug logging for specific query 2025-05-16 23:23:53 +02:00
Viktor Lofgren
c187b2e1c1 (search) Re-enable clustering 2025-05-16 23:20:16 +02:00
Viktor Lofgren
42eaa4588b (search) Disable clustering for a moment 2025-05-16 23:17:01 +02:00
Viktor Lofgren
4f40a5fbeb (search) Reduce log spam 2025-05-16 23:15:07 +02:00
Viktor Lofgren
3f3d42bc01 (search) Re-enable deduplication 2025-05-16 23:14:54 +02:00
Viktor Lofgren
61c8d53e1b (search) Disable deduplication for a moment 2025-05-16 23:10:32 +02:00
32 changed files with 1342 additions and 44 deletions

View File

@@ -112,14 +112,6 @@ public class EdgeDomain implements Serializable {
return topDomain;
}
public String getDomainKey() {
int cutPoint = topDomain.indexOf('.');
if (cutPoint < 0) {
return topDomain;
}
return topDomain.substring(0, cutPoint).toLowerCase();
}
/** If possible, try to provide an alias domain,
* i.e. a domain name that is very likely to link to this one
* */

View File

@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
class EdgeDomainTest {
@Test
public void testSkepdic() throws URISyntaxException {
var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
assertEquals("skepdic", domain.getDomain().getDomainKey());
var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
assertEquals("skepdic", domain2.getDomain().getDomainKey());
}
@Test
public void testHkDomain() throws URISyntaxException {
var domain = new EdgeUrl("http://l7072i3.l7c.net");

View File

@@ -25,9 +25,9 @@ dependencies {
implementation project(':code:execution:api')
implementation project(':code:processes:crawling-process:ft-content-type')
implementation project(':third-party:rssreader')
implementation libs.jsoup
implementation project(':third-party:rssreader')
implementation libs.opencsv
implementation libs.slop
implementation libs.sqlite
@@ -57,8 +57,6 @@ dependencies {
implementation libs.bundles.gson
implementation libs.bundles.mariadb
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito

View File

@@ -0,0 +1,119 @@
package nu.marginalia.domsample;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import jakarta.inject.Named;
import nu.marginalia.domsample.db.DomSampleDb;
import nu.marginalia.livecapture.BrowserlessClient;
import nu.marginalia.service.module.ServiceConfiguration;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;
/**
 * Background service that periodically captures rendered DOM samples of crawled
 * domains via a browserless instance, persisting them through {@link DomSampleDb}.
 * <p>
 * The service only runs when a browserless address is configured and this is
 * service node 1; otherwise construction logs a warning and does nothing.
 */
public class DomSampleService {
    private final DomSampleDb db;
    private final HikariDataSource mariadbDataSource;
    private final URI browserlessURI;

    private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);

    /**
     * @param db                   sqlite-backed sample/schedule storage
     * @param mariadbDataSource    source of the authoritative domain list (EC_DOMAIN)
     * @param browserlessAddress   base URI of the browserless instance; empty disables the service
     * @param serviceConfiguration used to restrict the service to node 1
     * @throws URISyntaxException  if browserlessAddress is not a valid URI
     */
    @Inject
    public DomSampleService(DomSampleDb db,
                            HikariDataSource mariadbDataSource,
                            @Named("browserless-uri") String browserlessAddress,
                            ServiceConfiguration serviceConfiguration)
            throws URISyntaxException
    {
        this.db = db;
        this.mariadbDataSource = mariadbDataSource;

        if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
            logger.warn("Live capture service will not run");
            browserlessURI = null; // satisfy final
        }
        else {
            browserlessURI = new URI(browserlessAddress);
            // NOTE(review): starting the worker thread from the constructor lets
            // `this` escape before construction completes -- consider a separate
            // start() method.
            Thread.ofPlatform().daemon().start(this::run);
        }
    }

    /**
     * Pull the set of eligible domains (NODE_AFFINITY &gt; 0) from mariadb and
     * reconcile the sqlite schedule table against it.
     *
     * @throws RuntimeException wrapping any database failure
     */
    public void syncDomains() {
        Set<String> dbDomains = new HashSet<>();

        logger.info("Fetching domains from database...");

        try (var conn = mariadbDataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT DOMAIN_NAME
                     FROM EC_DOMAIN
                     WHERE NODE_AFFINITY>0
                     """)
        ) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                dbDomains.add(rs.getString("DOMAIN_NAME"));
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to sync domains", e);
        }

        logger.info("Found {} domains in database", dbDomains.size());

        db.syncDomains(dbDomains);

        logger.info("Synced domains to sqlite");
    }

    /**
     * Main worker loop: repeatedly sync the domain list, then fetch each
     * scheduled domain in turn.  Exits when the thread is interrupted;
     * other exceptions are logged and the loop continues.
     */
    public void run() {
        try (var client = new BrowserlessClient(browserlessURI)) {
            while (!Thread.currentThread().isInterrupted()) {
                try {
                    // Grace sleep in case we're operating on an empty domain list
                    TimeUnit.SECONDS.sleep(15);

                    syncDomains();
                    var domains = db.getScheduledDomains();

                    for (var domain : domains) {
                        updateDomain(client, domain);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    logger.info("DomSampleService interrupted, stopping...");
                    return;
                } catch (Exception e) {
                    logger.error("Error in DomSampleService run loop", e);
                }
            }
        }
    }

    /**
     * Fetch and store a single DOM sample for the root page of {@code domain}.
     * The domain is always flagged as fetched afterwards -- even on failure --
     * so a failing domain is rescheduled rather than retried immediately.
     */
    private void updateDomain(BrowserlessClient client, String domain) {
        var rootUrl = "https://" + domain + "/";
        try {
            var content = client.annotatedContent(rootUrl,
                    BrowserlessClient.GotoOptions.defaultValues());

            if (content.isPresent()) {
                db.saveSample(domain, rootUrl, content.get());
            }
        } catch (Exception e) {
            logger.error("Failed to process domain: " + domain, e);
        }
        finally {
            db.flagDomainAsFetched(domain);
        }
    }
}

View File

@@ -0,0 +1,174 @@
package nu.marginalia.domsample.db;
import nu.marginalia.WmsaHome;
import org.jsoup.Jsoup;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.*;
/**
 * Sqlite-backed storage for rendered DOM samples and the schedule of domains
 * to fetch.  Owns a single JDBC connection for its lifetime; {@link #close()}
 * releases it.
 * <p>
 * NOTE(review): access is not synchronized -- presumably used from a single
 * thread (the retrieval loop is single threaded); confirm before sharing.
 */
public class DomSampleDb implements AutoCloseable {
    private static final String dbFileName = "dom-sample.db";

    private final Connection connection;

    /** Open (or create) the database at the default WmsaHome data path. */
    public DomSampleDb() throws SQLException {
        this(WmsaHome.getDataPath().resolve(dbFileName));
    }

    /**
     * Open (or create) the database at the given path and ensure the schema
     * exists.  Schema creation is idempotent, so reopening an existing file
     * is safe.
     */
    public DomSampleDb(Path dbPath) throws SQLException {
        String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();
        connection = DriverManager.getConnection(dbUrl);

        try (var stmt = connection.createStatement()) {
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
            stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
        }
    }

    /**
     * Reconcile the schedule table with {@code domains}: rows for domains no
     * longer present are deleted, new domains are inserted, and surviving rows
     * keep their last_fetch timestamps.
     *
     * @throws RuntimeException wrapping any SQLException
     */
    public void syncDomains(Set<String> domains) {
        Set<String> currentDomains = new HashSet<>();
        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule")) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                currentDomains.add(rs.getString("domain"));
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to sync domains", e);
        }

        // Compute the symmetric difference between the stored and desired sets
        Set<String> toRemove = new HashSet<>(currentDomains);
        Set<String> toAdd = new HashSet<>(domains);
        toRemove.removeAll(domains);
        toAdd.removeAll(currentDomains);

        try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
             var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")
        ) {
            // Batch the statements instead of one round trip per domain
            for (String domain : toRemove) {
                removeStmt.setString(1, domain);
                removeStmt.addBatch();
            }
            removeStmt.executeBatch();

            for (String domain : toAdd) {
                addStmt.setString(1, domain);
                addStmt.addBatch();
            }
            addStmt.executeBatch();
        } catch (SQLException e) {
            // (was "Failed to remove domains", which misleadingly omitted the inserts)
            throw new RuntimeException("Failed to update domain schedule", e);
        }
    }

    /**
     * All scheduled domains, never-fetched domains first, then stalest first.
     */
    public List<String> getScheduledDomains() {
        List<String> domains = new ArrayList<>();
        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC")) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                domains.add(rs.getString("domain"));
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to get scheduled domains", e);
        }
        return domains;
    }

    /**
     * Record that {@code domain} was fetched now, moving it to the back of the
     * schedule.  Upserts, so it also works for domains not yet scheduled.
     */
    public void flagDomainAsFetched(String domain) {
        try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
            stmt.setString(1, domain);
            stmt.executeUpdate();
        } catch (SQLException e) {
            throw new RuntimeException("Failed to flag domain as fetched", e);
        }
    }

    /** A stored DOM sample; {@code requests} is a TSV of (method, timestamp, url). */
    public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}

    /** All samples stored for {@code domain} (possibly empty, never null). */
    public List<Sample> getSamples(String domain) throws SQLException {
        List<Sample> samples = new ArrayList<>();

        try (var stmt = connection.prepareStatement("""
                SELECT url, sample, requests, accepted_popover
                FROM samples
                WHERE domain = ?
                """))
        {
            stmt.setString(1, domain);
            var rs = stmt.executeQuery();
            while (rs.next()) {
                samples.add(
                        new Sample(
                                rs.getString("url"),
                                domain,
                                rs.getString("sample"),
                                rs.getString("requests"),
                                rs.getBoolean("accepted_popover")
                        )
                );
            }
        }
        return samples;
    }

    /**
     * Parse a raw annotated HTML capture and store it.
     * <p>
     * The capture extension injects a {@code #marginalia-network-requests}
     * element holding one {@code .network-request} child per observed request;
     * these are flattened to a TSV (method, timestamp, url) and the marker
     * element is stripped before the sample is stored.
     */
    public void saveSample(String domain, String url, String rawContent) throws SQLException {
        var doc = Jsoup.parse(rawContent);

        var networkRequests = doc.getElementById("marginalia-network-requests");

        boolean acceptedPopover = false;
        StringBuilder requestTsv = new StringBuilder();

        if (networkRequests != null) {
            // The extension adds this class when it clicked through a cookie consent dialog
            acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();

            for (var request : networkRequests.getElementsByClass("network-request")) {
                String method = request.attr("data-method");
                String urlAttr = request.attr("data-url");
                String timestamp = request.attr("data-timestamp");
                requestTsv
                        .append(method)
                        .append('\t')
                        .append(timestamp)
                        .append('\t')
                        .append(urlAttr.replace('\n', ' ')) // keep one record per line
                        .append("\n");
            }

            networkRequests.remove();
        }

        // Strip the id the extension set on <body> -- presumably #marginaliahack;
        // TODO(review): confirm against the extension source
        doc.body().removeAttr("id");

        String sample = doc.html();

        saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);
    }

    // NOTE(review): unused within this class; kept for source compatibility
    record Request(String url, String method, String timestamp, boolean acceptedPopover) {}

    /**
     * Store a sample verbatim.  Upserts on url, so re-fetching a page
     * replaces the previous sample.
     */
    public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE
                INTO samples (domain, url, sample, requests, accepted_popover)
                VALUES (?, ?, ?, ?, ?)
                """)) {
            stmt.setString(1, domain);
            stmt.setString(2, url);
            stmt.setString(3, sample);
            stmt.setString(4, requests);
            stmt.setBoolean(5, acceptedPopover);
            stmt.executeUpdate();
        }
    }

    public void close() throws SQLException {
        connection.close();
    }
}

View File

@@ -8,10 +8,13 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -60,6 +63,42 @@ public class BrowserlessClient implements AutoCloseable {
return Optional.of(rsp.body());
}
/** Fetches content with a marginalia hack extension loaded that decorates the DOM with attributes for
 * certain CSS attributes, to be able to easier identify popovers and other nuisance elements.
 *
 * @param url         the page to render
 * @param gotoOptions navigation options forwarded to browserless
 * @return the rendered page HTML, or empty when browserless responds with status &gt;= 300
 */
public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
    Map<String, Object> requestData = Map.of(
            "url", url,
            "userAgent", userAgent,
            "gotoOptions", gotoOptions,
            // Block until the extension has injected its sentinel element, at most 15s
            "waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
    );

    // Launch parameters for the browserless instance to load the extension
    Map<String, Object> launchParameters = Map.of(
            "args", List.of("--load-extension=/dom-export")
    );

    // The launch parameters are passed as a URL-encoded JSON query parameter
    String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);

    var request = HttpRequest.newBuilder()
            .uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
            .method("POST", HttpRequest.BodyPublishers.ofString(
                    gson.toJson(requestData)
            ))
            .header("Content-type", "application/json")
            .build();

    var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());

    if (rsp.statusCode() >= 300) {
        logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
        return Optional.empty();
    }

    return Optional.of(rsp.body());
}
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
throws IOException, InterruptedException {
@@ -102,7 +141,7 @@ public class BrowserlessClient implements AutoCloseable {
public record GotoOptions(String waitUntil, long timeout) {
public static GotoOptions defaultValues() {
return new GotoOptions("networkidle2", Duration.ofSeconds(10).toMillis());
return new GotoOptions("load", Duration.ofSeconds(10).toMillis());
}
}

View File

@@ -0,0 +1,113 @@
package nu.marginalia.domsample.db;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Unit tests for {@link DomSampleDb} against a throwaway sqlite file.
 * <p>
 * Tests declare {@code throws Exception} instead of wrapping their bodies in
 * {@code try/catch + fail(e.getMessage())} -- the old pattern discarded the
 * stack trace, making failures hard to diagnose.
 */
class DomSampleDbTest {
    Path tempDir;

    @BeforeEach
    void setUp() throws Exception {
        tempDir = Files.createTempDirectory("test");
    }

    @AfterEach
    void tearDown() throws IOException {
        FileUtils.deleteDirectory(tempDir.toFile());
    }

    /** Opening a fresh db file should create the schema without error. */
    @Test
    public void testSetUp() throws Exception {
        var dbPath = tempDir.resolve("test.db");
        // Construction + close alone exercises schema creation
        try (var db = new DomSampleDb(dbPath)) {
        }
    }

    /** syncDomains should add missing domains and drop ones no longer present. */
    @Test
    public void testSyncDomains() throws Exception {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
            assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
            db.syncDomains(Set.of("example.com", "test.com"));
            assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
            db.syncDomains(Set.of("foobar.com", "test.com"));
            assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
        }
    }

    /** Flagging a domain as fetched should push it to the back of the schedule. */
    @Test
    public void testFetchDomains() throws Exception {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));

            db.flagDomainAsFetched("example.com");
            db.flagDomainAsFetched("test.com");
            db.flagDomainAsFetched("foobar.com");
            assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
            // NOTE(review): ordering relies on CURRENT_TIMESTAMP resolution;
            // rapid successive flags with equal timestamps could be flaky
            db.flagDomainAsFetched("test.com");
            assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
        }
    }

    /** A saved sample should round-trip all of its fields. */
    @Test
    public void saveLoadSingle() throws Exception {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
            var samples = db.getSamples("example.com");
            assertEquals(1, samples.size());
            var sample = samples.getFirst();
            assertEquals("example.com", sample.domain());
            assertEquals("http://example.com/sample", sample.url());
            assertEquals("sample data", sample.sample());
            assertEquals("requests data", sample.requests());
            assertTrue(sample.acceptedPopover());
        }
    }

    /** Multiple samples for the same domain should all be retrievable. */
    @Test
    public void saveLoadTwo() throws Exception {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
            db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
            var samples = db.getSamples("example.com");
            assertEquals(2, samples.size());

            Map<String, String> samplesByUrl = new HashMap<>();
            for (var sample : samples) {
                samplesByUrl.put(sample.url(), sample.sample());
            }
            assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
            assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
        }
    }
}

View File

@@ -3,17 +3,21 @@ package nu.marginalia.livecapture;
import com.github.tomakehurst.wiremock.WireMockServer;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.domsample.db.DomSampleDb;
import nu.marginalia.service.module.ServiceConfigurationModule;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.images.PullPolicy;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import static com.github.tomakehurst.wiremock.client.WireMock.*;
@@ -22,9 +26,14 @@ import static com.github.tomakehurst.wiremock.client.WireMock.*;
@Testcontainers
@Tag("slow")
public class BrowserlessClientTest {
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
// Run gradle docker if this image is not available
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
.withImagePullPolicy(PullPolicy.defaultPolicy())
.withNetworkMode("bridge")
.withLogConsumer(frame -> {
System.out.print(frame.getUtf8String());
})
.withExposedPorts(3000);
static WireMockServer wireMockServer =
@@ -34,6 +43,7 @@ public class BrowserlessClientTest {
static String localIp;
static URI browserlessURI;
static URI browserlessWssURI;
@BeforeAll
public static void setup() throws IOException {
@@ -44,6 +54,12 @@ public class BrowserlessClientTest {
container.getMappedPort(3000))
);
browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
container.getHost(),
container.getMappedPort(3000))
);
wireMockServer.start();
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
@@ -85,6 +101,30 @@ public class BrowserlessClientTest {
}
}
/**
 * End-to-end check that annotated content can be fetched through the
 * browserless container and persisted/read back via DomSampleDb.
 * <p>
 * NOTE(review): uses a fixed /tmp path, so concurrent runs on one host
 * could collide; the file is deleted afterwards.
 */
@Test
public void testAnnotatedContent() throws Exception {
    try (var client = new BrowserlessClient(browserlessURI);
         DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
    ) {
        var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();

        // Store the sample under the URL that was actually fetched
        // (previously a copy-pasted, unrelated URL was recorded here)
        dbop.saveSample("marginalia.nu", "https://marginalia.nu/", content);
        System.out.println(content);
        Assertions.assertFalse(content.isBlank(), "Content should not be empty");

        dbop.getSamples("marginalia.nu").forEach(sample -> {
            System.out.println("Sample URL: " + sample.url());
            System.out.println("Sample Content: " + sample.sample());
            System.out.println("Sample Requests: " + sample.requests());
            System.out.println("Accepted Popover: " + sample.acceptedPopover());
        });
    }
    finally {
        Files.deleteIfExists(Path.of("/tmp/dom-sample.db"));
    }
}
@Test
public void testScreenshot() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {

View File

@@ -84,7 +84,7 @@ public class ForwardIndexConverter {
LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
ByteBuffer workArea = ByteBuffer.allocate(65536);
ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
for (var instance : journal.pages()) {
try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
{

View File

@@ -53,6 +53,7 @@ public class SideloaderProcessing {
"",
body.getBytes(StandardCharsets.UTF_8),
false,
-1,
null,
null
);

View File

@@ -2002,12 +2002,11 @@ public class HeadingAwarePDFTextStripper extends LegacyPDFStreamEngine
float minFontWeight = Integer.MAX_VALUE;
for (var word : line)
{
int i = 0;
for (var textPosition : word.getTextPositions())
{
if (word.text.charAt(i++) == ' ') {
continue;
}
// Skip empty text positions as they may have a different font
if (word.text.isBlank()) continue;
var font = textPosition.getFont();
if (font == null) continue;
var descriptor = font.getFontDescriptor();

View File

@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
"",
readClassPathFile(p.toString()).getBytes(),
false,
-1,
null,
null
);

View File

@@ -50,7 +50,7 @@ class PdfDocumentProcessorPluginTest {
));
}
public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null);
var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
}

View File

@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.*;
import java.util.stream.Collectors;
@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
static String getResponseHeader(ClassicHttpResponse response, long size) {
String headerString = getHeadersAsString(response.getHeaders(), size);
static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);
return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
}
@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {
static private String getHeadersAsString(Header[] headers, long responseSize) {
static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
StringJoiner joiner = new StringJoiner("\r\n");
for (var header : headers) {
@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
if (headerCapitalized.equals("Content-Encoding"))
continue;
// Since we're transparently decoding gzip, we need to update the Content-Length header
// to reflect the actual size of the response body. We'll do this at the end.
if (headerCapitalized.equals("Content-Length"))
@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
joiner.add(headerCapitalized + ": " + header.getValue());
}
joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
joiner.add("Content-Length: " + responseSize);
return joiner.toString();

View File

@@ -93,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
Instant date = Instant.now();
Instant requestDate = Instant.now();
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
@@ -108,6 +108,8 @@ public class WarcRecorder implements AutoCloseable {
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
InputStream inputStream = inputBuffer.read()) {
Instant responseDate = Instant.now();
cookies.updateCookieStore(response);
// Build and write the request
@@ -126,7 +128,7 @@ public class WarcRecorder implements AutoCloseable {
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
.blockDigest(requestDigestBuilder.build())
.date(date)
.date(requestDate)
.body(MediaType.HTTP_REQUEST, httpRequestString)
.build();
@@ -138,7 +140,9 @@ public class WarcRecorder implements AutoCloseable {
response.addHeader("X-Has-Cookies", 1);
}
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
Duration.between(requestDate, responseDate),
inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
@@ -169,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
.blockDigest(responseDigestBuilder.build())
.date(date)
.date(responseDate)
.concurrentTo(warcRequest.id())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
@@ -184,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(warcResponse);
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
&& inputBuffer.size() < 2048
&& !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
{
@@ -196,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
requestUri,
Duration.between(date, Instant.now()).getSeconds(),
Duration.between(requestDate, Instant.now()).getSeconds(),
inputBuffer.size()
);

View File

@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
nextRecord.body,
// this field isn't actually used, maybe we can skip calculating it?
nextRecord.cookies,
-1,
lastModified,
etag));
}

View File

@@ -166,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
nextRecord.body(),
// this field isn't actually used, maybe we can skip calculating it?
nextRecord.cookies(),
nextRecord.requestTimeMs(),
null,
null));
}

View File

@@ -23,6 +23,7 @@ public final class CrawledDocument implements SerializableCrawlData {
public String crawlerStatus;
public String crawlerStatusDesc;
public int requestTimeMs;
@Nullable
public String headers;
@@ -82,7 +83,7 @@ public final class CrawledDocument implements SerializableCrawlData {
public String lastModifiedMaybe;
public String etagMaybe;
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, int requestTimeMs, String lastModifiedMaybe, String etagMaybe) {
this.crawlId = crawlId;
this.url = url;
this.contentType = contentType;
@@ -94,6 +95,7 @@ public final class CrawledDocument implements SerializableCrawlData {
this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {});
this.hasCookies = hasCookies;
this.lastModifiedMaybe = lastModifiedMaybe;
this.requestTimeMs = requestTimeMs;
this.etagMaybe = etagMaybe;
}
@@ -173,6 +175,7 @@ public final class CrawledDocument implements SerializableCrawlData {
private byte[] documentBodyBytes = new byte[0];
private String recrawlState;
private Boolean hasCookies;
private int requestTimeMs;
private String lastModifiedMaybe;
private String etagMaybe;
@@ -248,8 +251,13 @@ public final class CrawledDocument implements SerializableCrawlData {
return this;
}
public CrawledDocumentBuilder requestTimeMs(int requestTimeMs) {
this.requestTimeMs = requestTimeMs;
return this;
}
public CrawledDocument build() {
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.requestTimeMs, this.lastModifiedMaybe, this.etagMaybe);
}
public String toString() {

View File

@@ -9,6 +9,7 @@ import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.primitive.ByteColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.primitive.ShortColumn;
import nu.marginalia.slop.column.string.EnumColumn;
@@ -39,6 +40,7 @@ public record SlopCrawlDataRecord(String domain,
long timestamp,
String contentType,
byte[] body,
int requestTimeMs,
String headers)
{
private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD);
@@ -49,6 +51,7 @@ public record SlopCrawlDataRecord(String domain,
private static final LongColumn timestampColumn = new LongColumn("timestamp");
private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8);
private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD);
private static final ShortColumn requestTimeColumn = new ShortColumn("requestTimeMs");
private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD);
public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) {
@@ -60,6 +63,7 @@ public record SlopCrawlDataRecord(String domain,
parquetRecord.timestamp.toEpochMilli(),
parquetRecord.contentType,
parquetRecord.body,
-1,
parquetRecord.headers
);
}
@@ -74,6 +78,7 @@ public record SlopCrawlDataRecord(String domain,
date.toEpochMilli(),
"x-marginalia/advisory;state=redirect",
new byte[0],
-1,
""
);
}
@@ -87,6 +92,7 @@ public record SlopCrawlDataRecord(String domain,
date.toEpochMilli(),
"x-marginalia/advisory;state=error",
errorStatus.getBytes(),
-1,
""
);
}
@@ -100,6 +106,7 @@ public record SlopCrawlDataRecord(String domain,
date.toEpochMilli(),
errorStatus,
new byte[0],
-1,
""
);
}
@@ -321,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Writer timestampColumnWriter;
private final EnumColumn.Writer contentTypeColumnWriter;
private final ByteArrayColumn.Writer bodyColumnWriter;
private final ShortColumn.Writer requestTimeColumnWriter;
private final StringColumn.Writer headerColumnWriter;
public Writer(Path path) throws IOException {
@@ -334,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnWriter = timestampColumn.create(this);
contentTypeColumnWriter = contentTypeColumn.create(this);
bodyColumnWriter = bodyColumn.create(this);
requestTimeColumnWriter = requestTimeColumn.create(this);
headerColumnWriter = headerColumn.create(this);
}
@@ -346,6 +355,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnWriter.put(record.timestamp);
contentTypeColumnWriter.put(record.contentType);
bodyColumnWriter.put(record.body);
requestTimeColumnWriter.put((short) record.requestTimeMs);
headerColumnWriter.put(record.headers);
}
@@ -391,10 +401,20 @@ public record SlopCrawlDataRecord(String domain,
String headersStr;
StringJoiner headersStrBuilder = new StringJoiner("\n");
int requestTimeMs = -1;
for (var header : headers) {
if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
hasCookies = true;
}
if (header.getName().equals("X-Marginalia-Response-Time")) {
try {
requestTimeMs = Integer.parseInt(header.getValue());
}
catch (NumberFormatException ex) {
logger.warn("Failed to parse X-Marginalia-Response-Time header: {}", header.getValue());
}
continue;
}
headersStrBuilder.add(header.getName() + ": " + header.getValue());
}
headersStr = headersStrBuilder.toString();
@@ -409,6 +429,7 @@ public record SlopCrawlDataRecord(String domain,
response.date().toEpochMilli(),
contentType,
bodyBytes,
requestTimeMs,
headersStr
)
);
@@ -461,6 +482,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Reader timestampColumnReader;
private final EnumColumn.Reader contentTypeColumnReader;
private final ByteArrayColumn.Reader bodyColumnReader;
private final ShortColumn.Reader requestTimeColumnReader;
private final StringColumn.Reader headerColumnReader;
public Reader(Path path) throws IOException {
@@ -475,6 +497,17 @@ public record SlopCrawlDataRecord(String domain,
contentTypeColumnReader = contentTypeColumn.open(this);
bodyColumnReader = bodyColumn.open(this);
headerColumnReader = headerColumn.open(this);
// FIXME: After 2025-06-XX, we can remove this migration workaround
ShortColumn.Reader timeColumnReader;
try {
timeColumnReader = requestTimeColumn.open(this);
}
catch (Exception ex) {
// Migration workaround
timeColumnReader = null;
}
requestTimeColumnReader = timeColumnReader;
}
public SlopCrawlDataRecord get() throws IOException {
@@ -487,6 +520,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnReader.get(),
contentTypeColumnReader.get(),
bodyColumnReader.get(),
requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1,
headerColumnReader.get()
);
}
@@ -506,6 +540,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Reader timestampColumnReader;
private final EnumColumn.Reader contentTypeColumnReader;
private final ByteArrayColumn.Reader bodyColumnReader;
private final ShortColumn.Reader requestTimeColumnReader;
private final StringColumn.Reader headerColumnReader;
private SlopCrawlDataRecord next = null;
@@ -522,6 +557,17 @@ public record SlopCrawlDataRecord(String domain,
contentTypeColumnReader = contentTypeColumn.open(this);
bodyColumnReader = bodyColumn.open(this);
headerColumnReader = headerColumn.open(this);
// FIXME: After 2025-06-XX, we can remove this migration workaround
ShortColumn.Reader timeColumnReader;
try {
timeColumnReader = requestTimeColumn.open(this);
}
catch (Exception ex) {
// Migration workaround
timeColumnReader = null;
}
requestTimeColumnReader = timeColumnReader;
}
public abstract boolean filter(String url, int status, String contentType);
@@ -548,6 +594,7 @@ public record SlopCrawlDataRecord(String domain,
boolean cookies = cookiesColumnReader.get() == 1;
int status = statusColumnReader.get();
long timestamp = timestampColumnReader.get();
int requestTimeMs = requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1;
String contentType = contentTypeColumnReader.get();
LargeItem<byte[]> body = bodyColumnReader.getLarge();
@@ -555,7 +602,7 @@ public record SlopCrawlDataRecord(String domain,
if (filter(url, status, contentType)) {
next = new SlopCrawlDataRecord(
domain, url, ip, cookies, status, timestamp, contentType, body.get(), headers.get()
domain, url, ip, cookies, status, timestamp, contentType, body.get(), requestTimeMs, headers.get()
);
return true;
}

View File

@@ -195,6 +195,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
headers,
body,
false,
-1,
"",
""
));

View File

@@ -61,7 +61,7 @@ public class UrlDeduplicator {
private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
final var domain = details.getUrl().getDomain();
final String key = domain.getDomainKey();
final String key = domain.toString();
return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
}

View File

@@ -23,7 +23,7 @@ public class SearchResultClusterer {
}
/** No clustering, just return the results as is */
private static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
public static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
if (results.isEmpty())
return List.of();

View File

@@ -85,7 +85,6 @@ public class SearchService extends JoobyService {
String emptySvg = "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>";
jooby.get("/site/{domain}/favicon", ctx -> {
String domain = ctx.path("domain").value();
logger.info("Finding icon for domain {}", domain);
try {
DbDomainQueries.DomainIdWithNode domainIdWithNode = domainQueries.getDomainIdWithNode(new EdgeDomain(domain));
var faviconMaybe = faviconClient.getFavicon(domain, domainIdWithNode.nodeAffinity());

View File

@@ -25,6 +25,7 @@ public class UrlDeduplicator {
}
public boolean shouldRemove(DecoratedSearchResultItem details) {
if (!deduplicateOnSuperficialHash(details))
return true;
if (!deduplicateOnLSH(details))
@@ -61,7 +62,7 @@ public class UrlDeduplicator {
private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
final var domain = details.getUrl().getDomain();
final String key = domain.getDomainKey();
final String key = domain.toString();
return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
}

View File

@@ -0,0 +1,3 @@
# Base image: browserless Chromium (headless browser-as-a-service).
FROM ghcr.io/browserless/chromium:latest
# Bundle the Marginalia DOM-export browser extension into the image.
# NOTE(review): presumably the capture service loads the unpacked extension
# from /dom-export inside the container -- confirm against the consumer.
COPY extension/ /dom-export

View File

@@ -0,0 +1,45 @@
plugins {
    id 'base'
}

def imageName = 'marginalia-browserless'
def imageTag = project.hasProperty('imageTag') ? project.getProperty('imageTag') : 'latest'

tasks.register('docker', Exec) {
    group = 'Docker'
    description = 'Builds a Docker image using the Dockerfile in project root'

    workingDir = projectDir

    // Assemble the docker invocation as a list so optional flags can be
    // inserted BEFORE the build context.  The previous version appended
    // '--no-cache' after the trailing '.', i.e. after the context argument,
    // which is not a valid `docker build` invocation.
    def dockerArgs = ['docker', 'build',
                      '-t', "${imageName}:${imageTag}",
                      '-f', 'Dockerfile',
                      '--pull',
                      '--build-arg', "BASE_DIR=."]

    // Add optional parameters if specified
    if (project.hasProperty('noCache') && project.getProperty('noCache').toBoolean()) {
        dockerArgs += '--no-cache'
    }

    // The build context must come last
    dockerArgs += '.'

    commandLine dockerArgs

    doFirst {
        println "Building Docker image '${imageName}:${imageTag}'..."
    }

    doLast {
        println "Docker image '${imageName}:${imageTag}' has been built successfully."
    }
}

// Add task to ensure the extension folder is included in the Docker context.
// NOTE(review): the docker build context is projectDir, so this copy into
// buildDir looks redundant -- confirm whether it can be removed.
tasks.register('prepareExtension', Copy) {
    from 'extension'
    into "${buildDir}/docker/extension"
}

// Make the docker task depend on prepareExtension
tasks.named('docker').configure {
    dependsOn 'prepareExtension'
}

View File

@@ -0,0 +1,32 @@
// Listen to web requests and forward them to the content script, which
// records them as DOM nodes so they survive into the exported document.
chrome.webRequest.onBeforeRequest.addListener(
    (details) => {
        // Requests not associated with a tab (extension/service-worker
        // traffic) carry tabId === -1; chrome.tabs.sendMessage would throw
        // for those, so skip them.
        if (details.tabId < 0) {
            return;
        }

        const requestData = {
            url: details.url,
            method: details.method,
            timestamp: Date.now()
        };
        console.log(requestData);

        chrome.tabs.sendMessage(details.tabId, {
            type: 'URL_INTERCEPTED',
            ...requestData
        });
    },
    { urls: ["<all_urls>"] }
);

// Listen to web navigation events and re-register content scripts when a page
// is reloaded or navigated to a new subframe
chrome.webNavigation.onCommitted.addListener(function(details) {
    if (details.transitionType === 'reload' || details.transitionType === 'auto_subframe') {
        chrome.scripting.registerContentScripts([{
            id: "content-script",
            matches : [ "<all_urls>" ],
            js : [ "content.js" ]
        }]).catch(() => {
            // Registering the same script id twice rejects with a
            // duplicate-ID error; the script is already in place, so this
            // rejection is expected and safe to ignore.
        });
    }
});

View File

@@ -0,0 +1,646 @@
// This script runs in the context of web pages loaded by the browser extension.

// Container that accumulates one child element per intercepted network
// request; it is attached to the document body when the page is finalized.
var networkRequests = document.createElement('div')
networkRequests.setAttribute('id', 'marginalia-network-requests');

// Record request metadata forwarded by the background script as DOM nodes.
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    if (message.type !== 'URL_INTERCEPTED') {
        return;
    }

    const entry = document.createElement('div');
    entry.setAttribute('class', 'network-request');
    entry.setAttribute('data-url', message.url);
    entry.setAttribute('data-method', message.method);
    entry.setAttribute('data-timestamp', message.timestamp);

    networkRequests.appendChild(entry)
});
// Copy selected CSS properties from same-origin stylesheets onto matching
// elements as data attributes, so the exported DOM keeps layout hints
// (e.g. which elements are position:fixed popovers).
function addStylesAsDataAttributes(propertyToAttrMap = {
    'display': 'data-display',
    'position': 'data-position',
    'visibility': 'data-visibility',
}) {
    const targetedProperties = new Set(Object.keys(propertyToAttrMap).map(prop => prop.toLowerCase()));

    try {
        for (const styleSheet of Array.from(document.styleSheets)) {
            try {
                // Cross-origin stylesheets cannot be inspected; skip them.
                if (styleSheet.href && new URL(styleSheet.href).origin !== window.location.origin) {
                    console.warn(`Skipping cross-origin stylesheet: ${styleSheet.href}`);
                    continue;
                }

                const cssRules = styleSheet.cssRules || styleSheet.rules;
                if (!cssRules) continue;

                for (let i = 0; i < cssRules.length; i++) {
                    const rule = cssRules[i];
                    if (rule.type !== 1) continue; // only CSSStyleRule

                    try {
                        // Does this rule set any property we care about?
                        let containsTargetedProperty = false;
                        for (let j = 0; j < rule.style.length; j++) {
                            if (targetedProperties.has(rule.style[j].toLowerCase())) {
                                containsTargetedProperty = true;
                                break;
                            }
                        }
                        if (!containsTargetedProperty) continue;

                        for (const element of document.querySelectorAll(rule.selectorText)) {
                            for (let j = 0; j < rule.style.length; j++) {
                                const property = rule.style[j].toLowerCase();
                                if (!targetedProperties.has(property)) continue;

                                element.setAttribute(
                                    propertyToAttrMap[property],
                                    rule.style.getPropertyValue(property));
                            }
                        }
                    } catch (selectorError) {
                        console.error(`Error processing selector "${rule.selectorText}": ${selectorError.message}`);
                    }
                }
            } catch (sheetError) {
                console.error(`Error processing stylesheet: ${sheetError.message}`);
            }
        }
    } catch (error) {
        console.error(`Error adding data attributes: ${error.message}`);
    }
}
/**
 * Heuristic detector for cookie-consent popovers.
 *
 * Scores a candidate popover element's text against cookie- and
 * newsletter-related keyword lists; when the element looks like a cookie
 * consent dialog, it tries to click the "accept" button so that ad/tracking
 * requests fire and can be captured by the request logger.
 */
class CookieConsentHandler {
    constructor() {
        // Keywords that strongly indicate cookie consent
        this.cookieKeywords = [
            'cookie', 'cookies', 'consent', 'gdpr', 'privacy policy', 'privacy notice',
            'data protection', 'tracking', 'analytics', 'personalization', 'advertising',
            'essential cookies', 'functional cookies', 'performance cookies'
        ];

        // Keywords that indicate newsletter/subscription popups
        this.newsletterKeywords = [
            'newsletter', 'subscribe', 'email', 'signup', 'sign up', 'updates',
            'notifications', 'discount', 'offer', 'deal', 'promo', 'exclusive'
        ];

        // Common button text for accepting cookies
        this.acceptButtonTexts = [
            'accept', 'accept all', 'allow all', 'agree', 'ok', 'got it',
            'i agree', 'continue', 'yes', 'enable', 'allow cookies',
            'accept cookies', 'accept all cookies', 'i understand'
        ];

        // Common button text for rejecting (to avoid clicking these)
        this.rejectButtonTexts = [
            'reject', 'decline', 'deny', 'refuse', 'no thanks', 'no',
            'reject all', 'decline all', 'manage preferences', 'customize',
            'settings', 'options', 'learn more'
        ];

        // CSS class-name patterns that typically mark the "accept" button
        this.acceptButtonStyles = [
            /primary/,
        ];
    }

    /**
     * Entry point.  Categorizes the element and, when it is classified as
     * cookie consent, attempts to accept it.
     *
     * Returns {category, action, reason, element, [buttonClicked]}, where
     * category is 'cookie_consent' | 'newsletter' | 'other' | 'unknown'.
     */
    analyzePopover(element) {
        if (!element || !element.textContent) {
            return { category: 'unknown', action: 'none', reason: 'Invalid element' };
        }

        const textContent = element.textContent.toLowerCase();
        const category = this.categorizePopover(textContent, element);

        let result = {
            category: category,
            action: 'none',
            reason: '',
            element: element
        };

        if (category === 'cookie_consent') {
            const acceptResult = this.tryAcceptCookies(element);
            result.action = acceptResult.action;
            result.reason = acceptResult.reason;
            result.buttonClicked = acceptResult.buttonClicked;
        }

        return result;
    }

    /**
     * Scores the popover text as cookie-consent vs newsletter.  A category
     * wins only if its score beats the other AND reaches at least 2;
     * otherwise 'other' is returned.
     */
    categorizePopover(textContent, element) {
        let cookieScore = 0;
        let newsletterScore = 0;

        // Score based on keyword presence; the core terms weigh 3, others 1
        this.cookieKeywords.forEach(keyword => {
            if (textContent.includes(keyword)) {
                cookieScore += keyword === 'cookie' || keyword === 'cookies' ? 3 : 1;
            }
        });

        this.newsletterKeywords.forEach(keyword => {
            if (textContent.includes(keyword)) {
                newsletterScore += keyword === 'newsletter' || keyword === 'subscribe' ? 3 : 1;
            }
        });

        // Additional heuristics based on the popover's structure
        if (this.hasPrivacyPolicyLink(element)) cookieScore += 2;
        if (this.hasManagePreferencesButton(element)) cookieScore += 2;
        if (this.hasEmailInput(element)) newsletterScore += 3;
        if (this.hasDiscountMention(textContent)) newsletterScore += 2;

        // Special patterns that strongly indicate cookie consent
        const strongCookiePatterns = [
            /we use cookies/,
            /this website uses cookies/,
            /by continuing to use/,
            /essential.*cookies/,
            /improve.*experience/,
            /gdpr/,
            /data protection/
        ];

        if (strongCookiePatterns.some(pattern => pattern.test(textContent))) {
            cookieScore += 5;
        }

        // Determine category
        if (cookieScore > newsletterScore && cookieScore >= 2) {
            return 'cookie_consent';
        } else if (newsletterScore > cookieScore && newsletterScore >= 2) {
            return 'newsletter';
        } else {
            return 'other';
        }
    }

    /**
     * Tries to click an accept button inside the popover: first an explicit
     * accept button, then the "most likely" candidate.  Returns an object
     * describing the action taken and why.
     */
    tryAcceptCookies(element) {
        const buttons = this.findButtons(element);

        if (buttons.length === 0) {
            return { action: 'no_buttons_found', reason: 'No clickable buttons found' };
        }

        // First, try to find explicit accept buttons
        const acceptButton = this.findAcceptButton(buttons);

        if (acceptButton) {
            try {
                acceptButton.click();
                return {
                    action: 'clicked_accept',
                    reason: 'Found and clicked accept button',
                    buttonClicked: acceptButton.textContent.trim()
                };
            } catch (error) {
                return {
                    action: 'click_failed',
                    reason: `Failed to click button: ${error.message}`,
                    buttonClicked: acceptButton.textContent.trim()
                };
            }
        }

        // If no explicit accept button, try to find the most likely candidate
        const likelyButton = this.findMostLikelyAcceptButton(buttons);
        if (likelyButton) {
            try {
                likelyButton.click();
                return {
                    action: 'clicked_likely',
                    reason: 'Clicked most likely accept button',
                    buttonClicked: likelyButton.textContent.trim()
                };
            } catch (error) {
                return {
                    action: 'click_failed',
                    reason: `Failed to click button: ${error.message}`,
                    buttonClicked: likelyButton.textContent.trim()
                };
            }
        }

        return {
            action: 'no_accept_button',
            reason: 'Could not identify accept button',
            availableButtons: buttons.map(btn => btn.textContent.trim())
        };
    }

    /**
     * Collects all visible button-like descendants of the element
     * (buttons, submit inputs, role=button, common button classes).
     */
    findButtons(element) {
        const selectors = [
            'button',
            'input[type="button"]',
            'input[type="submit"]',
            '[role="button"]',
            'a[href="#"]',
            '.button',
            '.btn',
            '.btn-primary'
        ];

        const buttons = [];
        selectors.forEach(selector => {
            const found = element.querySelectorAll(selector);
            buttons.push(...Array.from(found));
        });

        // Remove duplicates and filter visible buttons
        return [...new Set(buttons)].filter(btn =>
            btn.offsetWidth > 0 && btn.offsetHeight > 0
        );
    }

    /**
     * Finds an explicit accept button: first by CSS class pattern
     * (acceptButtonStyles), then by accept-text match that is not also a
     * reject-text match.  Returns undefined when none qualifies.
     */
    findAcceptButton(buttons) {
        var byClass = buttons.find(button => {
            var classes = button.className.toLowerCase();
            if (this.acceptButtonStyles.some(pattern => pattern.test(classes))) {
                return true;
            }
        });

        if (byClass != null) {
            return byClass;
        }

        return buttons.find(button => {
            const text = button.textContent.toLowerCase().trim();
            return this.acceptButtonTexts.some(acceptText =>
                text === acceptText || text.includes(acceptText)
            ) && !this.rejectButtonTexts.some(rejectText =>
                text.includes(rejectText)
            );
        });
    }

    /**
     * Fallback: a single non-reject button is assumed to be accept;
     * otherwise look for positively-styled buttons (primary/accept/green).
     */
    findMostLikelyAcceptButton(buttons) {
        if (buttons.length === 1) {
            const text = buttons[0].textContent.toLowerCase();
            // If there's only one button and it's not explicitly a reject button, assume it's accept
            if (!this.rejectButtonTexts.some(rejectText => text.includes(rejectText))) {
                return buttons[0];
            }
        }

        // Look for buttons with positive styling (often green, primary, etc.)
        const positiveButton = buttons.find(button => {
            const classes = button.className.toLowerCase();
            const styles = window.getComputedStyle(button);
            const bgColor = styles.backgroundColor;

            return classes.includes('primary') ||
                classes.includes('accept') ||
                classes.includes('green') ||
                bgColor.includes('rgb(0, 128, 0)') || // green variations
                bgColor.includes('rgb(40, 167, 69)'); // bootstrap success
        });

        return positiveButton || null;
    }

    // True if any descendant link mentions "privacy" in its text or href.
    hasPrivacyPolicyLink(element) {
        const links = element.querySelectorAll('a');
        return Array.from(links).some(link =>
            link.textContent.toLowerCase().includes('privacy') ||
            link.href.toLowerCase().includes('privacy')
        );
    }

    // True if any button suggests a preferences/settings dialog.
    hasManagePreferencesButton(element) {
        const buttons = this.findButtons(element);
        return buttons.some(button => {
            const text = button.textContent.toLowerCase();
            return text.includes('manage') || text.includes('preferences') ||
                text.includes('settings') || text.includes('customize');
        });
    }

    // True if the popover contains an e-mail input (newsletter signal).
    hasEmailInput(element) {
        const inputs = element.querySelectorAll('input[type="email"], input[placeholder*="email" i]');
        return inputs.length > 0;
    }

    // True if the text mentions discounts/offers (newsletter signal).
    hasDiscountMention(textContent) {
        const discountTerms = ['discount', 'off', '%', 'save', 'deal', 'offer'];
        return discountTerms.some(term => textContent.includes(term));
    }
}
// Set to true once a cookie-consent popover has been found and accepted.
var agreedToPopover = false;

// Analyze one suspected popover element and, when it is a cookie consent
// dialog, attempt to accept it.  Returns the analysis result object.
function handlePopover(popoverElement) {
    const handler = new CookieConsentHandler();
    const result = handler.analyzePopover(popoverElement);

    console.log('Popover analysis result:', result);

    if (result.category === 'cookie_consent') {
        console.log('Detected cookie consent popover');

        if (result.action === 'clicked_accept') {
            console.log('Successfully accepted cookies');
            agreedToPopover = true;
        }
        else {
            console.log('Could not accept cookies:', result.reason);
        }
    }
    else if (result.category === 'newsletter') {
        console.log('Detected newsletter popover - no action taken');
    }
    else {
        console.log('Unknown popover type - no action taken');
    }

    return result;
}
// Final step of the capture: tag elements with style data attributes, try to
// agree to any cookie-consent popover, then attach the network-request log
// and mark the document as processed.
function finalizeMarginaliaHack() {
    addStylesAsDataAttributes();

    // Attempt to agree to cookie consent popups among the likely popover
    // elements (anything tagged position:fixed by addStylesAsDataAttributes)
    document.querySelectorAll('[data-position="fixed"]')
        .forEach(element => handlePopover(element));

    // If we found a popover and agreed to it, add a notice
    if (agreedToPopover) {
        const notice = document.createElement('div');
        notice.setAttribute('class', 'marginalia-agreed-cookies');
        networkRequests.appendChild(notice);
    }

    const finalize = () => {
        // Add a container for network requests
        document.body.appendChild(networkRequests);
        document.body.setAttribute('id', 'marginaliahack');
    };

    // If we have a popover and agreed to it, wait a bit before finalizing
    // to let the ad networks load so we can capture their requests
    if (agreedToPopover) {
        setTimeout(finalize, 2500);
    }
    else {
        finalize();
    }
}
/**
 * Dispatches synthetic scroll and mouse events so the page behaves as if a
 * human were browsing it (some pages defer loading content until the user
 * interacts).  All synthetic events are tagged with `.simulated = true`.
 */
class EventSimulator {
    constructor() {}

    /**
     * Smoothly scrolls the window down over `duration` ms using an ease-out
     * curve, dispatching tagged 'scroll' events along the way.
     *
     * @param duration animation length in ms
     * @param distance pixels to scroll; defaults to min(3 viewports, distance
     *                 remaining to the bottom of the document)
     * @returns a Promise resolved when the animation finishes
     */
    simulateScrollDown(duration = 2000, distance = null) {
        return new Promise((resolve) => {
            const startTime = Date.now();
            const startScrollY = window.scrollY;
            const maxScroll = document.documentElement.scrollHeight - window.innerHeight;
            const targetDistance = distance || Math.min(window.innerHeight * 3, maxScroll - startScrollY);

            // Nothing to scroll (already at the bottom)
            if (targetDistance <= 0) {
                resolve();
                return;
            }

            const animate = () => {
                const elapsed = Date.now() - startTime;
                const progress = Math.min(elapsed / duration, 1);

                // Ease-out function for smooth scrolling
                const easeOut = 1 - Math.pow(1 - progress, 3);
                const currentDistance = targetDistance * easeOut;
                const newScrollY = startScrollY + currentDistance;

                // Dispatch scroll events as we go
                window.scrollTo(0, newScrollY);

                // Fire custom scroll event
                const scrollEvent = new Event('scroll', {
                    bubbles: true,
                    cancelable: true
                });

                // Add custom properties to track simulation
                scrollEvent.simulated = true;
                scrollEvent.scrollY = newScrollY;
                scrollEvent.progress = progress;

                window.dispatchEvent(scrollEvent);
                document.dispatchEvent(scrollEvent);

                if (progress < 1) {
                    requestAnimationFrame(animate);
                } else {
                    resolve();
                }
            };

            requestAnimationFrame(animate);
        });
    }

    /**
     * Moves the (synthetic) mouse from the viewport center toward where the
     * browser URL bar would be (above the viewport), firing mousemove and
     * mouseenter/mouseleave events on the elements crossed on the way.
     *
     * @param duration animation length in ms
     * @returns a Promise resolved when the animation finishes
     */
    simulateMouseToURLBar(duration = 1500) {
        return new Promise((resolve) => {
            const startTime = Date.now();

            // Get current mouse position (or start from center of viewport)
            const startX = window.innerWidth / 2;
            const startY = window.innerHeight / 2;

            // URL bar is typically at the top center of the browser
            // Since we can't access actual browser chrome, we'll simulate movement
            // toward the top of the viewport where the URL bar would be
            const targetX = window.innerWidth / 2; // Center horizontally
            const targetY = -50; // Above the viewport (simulating URL bar position)

            const deltaX = targetX - startX;
            const deltaY = targetY - startY;

            let lastMouseEvent = null;

            const animate = () => {
                const elapsed = Date.now() - startTime;
                const progress = Math.min(elapsed / duration, 1);

                // Ease-in-out function for natural mouse movement
                const easeInOut = progress < 0.5
                    ? 2 * progress * progress
                    : 1 - Math.pow(-2 * progress + 2, 3) / 2;

                const currentX = startX + (deltaX * easeInOut);
                const currentY = startY + (deltaY * easeInOut);

                // Create mouse move event
                const mouseMoveEvent = new MouseEvent('mousemove', {
                    bubbles: true,
                    cancelable: true,
                    clientX: currentX,
                    clientY: currentY,
                    screenX: currentX,
                    screenY: currentY,
                    movementX: lastMouseEvent ? currentX - lastMouseEvent.clientX : 0,
                    movementY: lastMouseEvent ? currentY - lastMouseEvent.clientY : 0,
                    buttons: 0,
                    button: -1
                });

                // Add custom properties to track simulation
                mouseMoveEvent.simulated = true;
                mouseMoveEvent.progress = progress;
                mouseMoveEvent.targetType = 'urlbar';

                // Find element under mouse and dispatch event
                const elementUnderMouse = document.elementFromPoint(currentX, currentY);
                if (elementUnderMouse) {
                    elementUnderMouse.dispatchEvent(mouseMoveEvent);

                    // Also fire mouseenter/mouseleave events if element changed
                    if (lastMouseEvent) {
                        const lastElement = document.elementFromPoint(
                            lastMouseEvent.clientX,
                            lastMouseEvent.clientY
                        );

                        if (lastElement && lastElement !== elementUnderMouse) {
                            // Mouse left previous element
                            const mouseLeaveEvent = new MouseEvent('mouseleave', {
                                bubbles: false, // mouseleave doesn't bubble
                                cancelable: true,
                                clientX: currentX,
                                clientY: currentY,
                                relatedTarget: elementUnderMouse
                            });
                            mouseLeaveEvent.simulated = true;
                            lastElement.dispatchEvent(mouseLeaveEvent);

                            // Mouse entered new element
                            const mouseEnterEvent = new MouseEvent('mouseenter', {
                                bubbles: false, // mouseenter doesn't bubble
                                cancelable: true,
                                clientX: currentX,
                                clientY: currentY,
                                relatedTarget: lastElement
                            });
                            mouseEnterEvent.simulated = true;
                            elementUnderMouse.dispatchEvent(mouseEnterEvent);
                        }
                    }
                }

                // Also dispatch on document and window
                document.dispatchEvent(mouseMoveEvent);
                window.dispatchEvent(mouseMoveEvent);

                lastMouseEvent = mouseMoveEvent;

                if (progress < 1) {
                    requestAnimationFrame(animate);
                } else {
                    resolve();
                }
            };

            requestAnimationFrame(animate);
        });
    }

    /**
     * Moves the mouse from the viewport center to (targetX, targetY) along a
     * smoothstep curve with a little random jitter that decays toward the
     * target, dispatching tagged mousemove events on the document.
     *
     * @returns a Promise resolved when the animation finishes
     */
    simulateNaturalMouseMovement(targetX, targetY, duration = 1000) {
        return new Promise((resolve) => {
            const startTime = Date.now();
            const startX = window.innerWidth / 2;
            const startY = window.innerHeight / 2;

            const basePathX = targetX - startX;
            const basePathY = targetY - startY;

            const animate = () => {
                const elapsed = Date.now() - startTime;
                const progress = Math.min(elapsed / duration, 1);

                // Add some randomness to make movement more natural
                const randomOffsetX = (Math.random() - 0.5) * 10 * (1 - progress);
                const randomOffsetY = (Math.random() - 0.5) * 10 * (1 - progress);

                // Bezier curve for more natural movement
                const t = progress;
                const bezierProgress = t * t * (3.0 - 2.0 * t);

                const currentX = startX + (basePathX * bezierProgress) + randomOffsetX;
                const currentY = startY + (basePathY * bezierProgress) + randomOffsetY;

                const mouseMoveEvent = new MouseEvent('mousemove', {
                    bubbles: true,
                    cancelable: true,
                    clientX: currentX,
                    clientY: currentY,
                    screenX: currentX,
                    screenY: currentY
                });

                mouseMoveEvent.simulated = true;
                mouseMoveEvent.natural = true;

                document.dispatchEvent(mouseMoveEvent);

                if (progress < 1) {
                    requestAnimationFrame(animate);
                } else {
                    resolve();
                }
            };

            requestAnimationFrame(animate);
        });
    }

    /**
     * Combined simulation: scroll down while moving the mouse toward the URL
     * bar, pause briefly, then perform one more natural mouse movement.
     */
    async simulateBrowsingBehavior() {
        // Start both animations simultaneously
        const scrollPromise = this.simulateScrollDown(300);
        const mousePromise = this.simulateMouseToURLBar(200);

        // Wait for both to complete
        await Promise.all([scrollPromise, mousePromise]);

        // Add a small pause
        await new Promise(resolve => setTimeout(resolve, 100));

        // Simulate some additional natural mouse movement
        await this.simulateNaturalMouseMovement(
            window.innerWidth * 0.3,
            window.innerHeight * 0.1,
            100
        );

        console.log('Browsing behavior simulation completed');
    }
}
// Usage examples:
const simulator = new EventSimulator();

// Runs the scripted browsing behavior; fired once the page has loaded.
function simulateUserBehavior() {
    simulator.simulateBrowsingBehavior().then(() => {
        console.log('User behavior simulation finished');
    });
}

// On load: simulate user interaction immediately, then finalize the capture
// (style tagging, cookie-consent handling, request-log attachment) 2s later.
window.addEventListener("load", (e) => simulateUserBehavior());
window.addEventListener("load", (e) => setTimeout(finalizeMarginaliaHack, 2000));

View File

@@ -0,0 +1,29 @@
{
"manifest_version": 3,
"name": "Marginalia DOM Interceptor",
"version": "1.0",
"description": "Makes DOM export better",
"permissions": [
"activeTab",
"scripting",
"webNavigation",
"webRequest"
],
"host_permissions": [
"<all_urls>"
],
"background": {
"service_worker": "background.js",
"type": "module"
},
"content_scripts": [
{
"js": ["content.js"],
"run_at": "document_start",
"matches": [
"<all_urls>"
]
}
]
}

View File

@@ -7,4 +7,6 @@
2025-05-04: Deploy qs, search and api-services.
2025-05-05: Deploy executor partition 4.
2025-05-05: Deploy control.
2025-05-08: Deploy assistant.
2025-05-08: Deploy assistant.
2025-05-17: Redeploy all.
2025-05-28: Deploy assistant and browserless.

View File

@@ -93,6 +93,7 @@ include 'code:tools:experiment-runner'
include 'code:tools:screenshot-capture-tool'
include 'code:tools:load-test'
include 'code:tools:integration-test'
include 'code:tools:browserless'
include 'third-party:porterstemmer'
include 'third-party:symspell'

View File

@@ -272,6 +272,13 @@ if __name__ == '__main__':
deploy_tier=1,
groups={"all", "core"}
),
'browserless': ServiceConfig(
gradle_target=':code:tools:browserless:docker',
docker_name='browserless',
instances=None,
deploy_tier=2,
groups={"all", "core"}
),
'assistant': ServiceConfig(
gradle_target=':code:services-core:assistant-service:docker',
docker_name='assistant-service',