mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 17:32:39 +02:00
Compare commits
26 Commits
deploy-019
...
deploy-020
Author | SHA1 | Date | |
---|---|---|---|
|
7330bc489d | ||
|
ea23f33738 | ||
|
4a8a028118 | ||
|
a25bc647be | ||
|
a720dba3a2 | ||
|
284f382867 | ||
|
a80717f138 | ||
|
d6da715fa4 | ||
|
c1ec7aa491 | ||
|
3daf37e283 | ||
|
44a774d3a8 | ||
|
597aeaf496 | ||
|
06df7892c2 | ||
|
dc26854268 | ||
|
9f16326cba | ||
|
ed66d0b3a7 | ||
|
c3afc82dad | ||
|
08e25e539e | ||
|
4946044dd0 | ||
|
edf382e1c5 | ||
|
644cba32e4 | ||
|
34b76390b2 | ||
|
43cd507971 | ||
|
cc40e99fdc | ||
|
8a944cf4c6 | ||
|
be039d1a8c |
@@ -25,9 +25,9 @@ dependencies {
|
||||
|
||||
implementation project(':code:execution:api')
|
||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||
implementation project(':third-party:rssreader')
|
||||
|
||||
implementation libs.jsoup
|
||||
implementation project(':third-party:rssreader')
|
||||
implementation libs.opencsv
|
||||
implementation libs.slop
|
||||
implementation libs.sqlite
|
||||
@@ -57,8 +57,6 @@ dependencies {
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.bundles.mariadb
|
||||
|
||||
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
@@ -0,0 +1,119 @@
|
||||
package nu.marginalia.domsample;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import jakarta.inject.Named;
|
||||
import nu.marginalia.domsample.db.DomSampleDb;
|
||||
import nu.marginalia.livecapture.BrowserlessClient;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class DomSampleService {
|
||||
private final DomSampleDb db;
|
||||
private final HikariDataSource mariadbDataSource;
|
||||
private final URI browserlessURI;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);
|
||||
|
||||
@Inject
|
||||
public DomSampleService(DomSampleDb db,
|
||||
HikariDataSource mariadbDataSource,
|
||||
@Named("browserless-uri") String browserlessAddress,
|
||||
ServiceConfiguration serviceConfiguration)
|
||||
throws URISyntaxException
|
||||
{
|
||||
this.db = db;
|
||||
this.mariadbDataSource = mariadbDataSource;
|
||||
|
||||
if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
|
||||
logger.warn("Live capture service will not run");
|
||||
browserlessURI = null; // satisfy final
|
||||
}
|
||||
else {
|
||||
browserlessURI = new URI(browserlessAddress);
|
||||
|
||||
Thread.ofPlatform().daemon().start(this::run);
|
||||
}
|
||||
}
|
||||
|
||||
public void syncDomains() {
|
||||
Set<String> dbDomains = new HashSet<>();
|
||||
|
||||
logger.info("Fetching domains from database...");
|
||||
|
||||
try (var conn = mariadbDataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_NAME
|
||||
FROM EC_DOMAIN
|
||||
WHERE NODE_AFFINITY>0
|
||||
""")
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
dbDomains.add(rs.getString("DOMAIN_NAME"));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to sync domains", e);
|
||||
}
|
||||
|
||||
logger.info("Found {} domains in database", dbDomains.size());
|
||||
|
||||
db.syncDomains(dbDomains);
|
||||
|
||||
logger.info("Synced domains to sqlite");
|
||||
}
|
||||
|
||||
public void run() {
|
||||
|
||||
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||
|
||||
while (!Thread.currentThread().isInterrupted()) {
|
||||
|
||||
try {
|
||||
// Grace sleep in case we're operating on an empty domain list
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
|
||||
syncDomains();
|
||||
var domains = db.getScheduledDomains();
|
||||
|
||||
for (var domain : domains) {
|
||||
updateDomain(client, domain);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
logger.info("DomSampleService interrupted, stopping...");
|
||||
return;
|
||||
} catch (Exception e) {
|
||||
logger.error("Error in DomSampleService run loop", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private void updateDomain(BrowserlessClient client, String domain) {
|
||||
var rootUrl = "https://" + domain + "/";
|
||||
try {
|
||||
var content = client.annotatedContent(rootUrl,
|
||||
BrowserlessClient.GotoOptions.defaultValues());
|
||||
|
||||
if (content.isPresent()) {
|
||||
db.saveSample(domain, rootUrl, content.get());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed to process domain: " + domain, e);
|
||||
}
|
||||
finally {
|
||||
db.flagDomainAsFetched(domain);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,174 @@
|
||||
package nu.marginalia.domsample.db;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
public class DomSampleDb implements AutoCloseable {
|
||||
private static final String dbFileName = "dom-sample.db";
|
||||
private final Connection connection;
|
||||
|
||||
public DomSampleDb() throws SQLException{
|
||||
this(WmsaHome.getDataPath().resolve(dbFileName));
|
||||
}
|
||||
|
||||
public DomSampleDb(Path dbPath) throws SQLException {
|
||||
String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();
|
||||
|
||||
connection = DriverManager.getConnection(dbUrl);
|
||||
|
||||
try (var stmt = connection.createStatement()) {
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
|
||||
stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
|
||||
}
|
||||
}
|
||||
|
||||
public void syncDomains(Set<String> domains) {
|
||||
Set<String> currentDomains = new HashSet<>();
|
||||
try (var stmt = connection.prepareStatement("SELECT domain FROM schedule")) {
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
currentDomains.add(rs.getString("domain"));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
throw new RuntimeException("Failed to sync domains", e);
|
||||
}
|
||||
|
||||
Set<String> toRemove = new HashSet<>(currentDomains);
|
||||
Set<String> toAdd = new HashSet<>(domains);
|
||||
|
||||
toRemove.removeAll(domains);
|
||||
toAdd.removeAll(currentDomains);
|
||||
|
||||
try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
|
||||
var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")
|
||||
) {
|
||||
for (String domain : toRemove) {
|
||||
removeStmt.setString(1, domain);
|
||||
removeStmt.executeUpdate();
|
||||
}
|
||||
|
||||
for (String domain : toAdd) {
|
||||
addStmt.setString(1, domain);
|
||||
addStmt.executeUpdate();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
throw new RuntimeException("Failed to remove domains", e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> getScheduledDomains() {
|
||||
List<String> domains = new ArrayList<>();
|
||||
try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC")) {
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
domains.add(rs.getString("domain"));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
throw new RuntimeException("Failed to get scheduled domains", e);
|
||||
}
|
||||
return domains;
|
||||
}
|
||||
|
||||
public void flagDomainAsFetched(String domain) {
|
||||
try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
|
||||
stmt.setString(1, domain);
|
||||
stmt.executeUpdate();
|
||||
} catch (SQLException e) {
|
||||
throw new RuntimeException("Failed to flag domain as fetched", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
|
||||
|
||||
public List<Sample> getSamples(String domain) throws SQLException {
|
||||
List<Sample> samples = new ArrayList<>();
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
SELECT url, sample, requests, accepted_popover
|
||||
FROM samples
|
||||
WHERE domain = ?
|
||||
"""))
|
||||
{
|
||||
stmt.setString(1, domain);
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
samples.add(
|
||||
new Sample(
|
||||
rs.getString("url"),
|
||||
domain,
|
||||
rs.getString("sample"),
|
||||
rs.getString("requests"),
|
||||
rs.getBoolean("accepted_popover")
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
return samples;
|
||||
}
|
||||
|
||||
public void saveSample(String domain, String url, String rawContent) throws SQLException {
|
||||
var doc = Jsoup.parse(rawContent);
|
||||
|
||||
var networkRequests = doc.getElementById("marginalia-network-requests");
|
||||
|
||||
boolean acceptedPopover = false;
|
||||
|
||||
StringBuilder requestTsv = new StringBuilder();
|
||||
if (networkRequests != null) {
|
||||
|
||||
acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();
|
||||
|
||||
for (var request : networkRequests.getElementsByClass("network-request")) {
|
||||
String method = request.attr("data-method");
|
||||
String urlAttr = request.attr("data-url");
|
||||
String timestamp = request.attr("data-timestamp");
|
||||
|
||||
requestTsv
|
||||
.append(method)
|
||||
.append('\t')
|
||||
.append(timestamp)
|
||||
.append('\t')
|
||||
.append(urlAttr.replace('\n', ' '))
|
||||
.append("\n");
|
||||
}
|
||||
|
||||
networkRequests.remove();
|
||||
}
|
||||
|
||||
doc.body().removeAttr("id");
|
||||
|
||||
String sample = doc.html();
|
||||
|
||||
saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);
|
||||
|
||||
}
|
||||
|
||||
record Request(String url, String method, String timestamp, boolean acceptedPopover) {}
|
||||
|
||||
public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR REPLACE
|
||||
INTO samples (domain, url, sample, requests, accepted_popover)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
""")) {
|
||||
stmt.setString(1, domain);
|
||||
stmt.setString(2, url);
|
||||
stmt.setString(3, sample);
|
||||
stmt.setString(4, requests);
|
||||
stmt.setBoolean(5, acceptedPopover);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public void close() throws SQLException {
|
||||
connection.close();
|
||||
}
|
||||
}
|
@@ -8,10 +8,13 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URLEncoder;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
@@ -60,6 +63,42 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
return Optional.of(rsp.body());
|
||||
}
|
||||
|
||||
/** Fetches content with a marginalia hack extension loaded that decorates the DOM with attributes for
|
||||
* certain CSS attributes, to be able to easier identify popovers and other nuisance elements.
|
||||
*/
|
||||
public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"userAgent", userAgent,
|
||||
"gotoOptions", gotoOptions,
|
||||
"waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
|
||||
);
|
||||
|
||||
// Launch parameters for the browserless instance to load the extension
|
||||
Map<String, Object> launchParameters = Map.of(
|
||||
"args", List.of("--load-extension=/dom-export")
|
||||
);
|
||||
|
||||
String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
|
||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||
gson.toJson(requestData)
|
||||
))
|
||||
.header("Content-type", "application/json")
|
||||
.build();
|
||||
|
||||
var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
|
||||
|
||||
if (rsp.statusCode() >= 300) {
|
||||
logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(rsp.body());
|
||||
}
|
||||
|
||||
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
|
||||
throws IOException, InterruptedException {
|
||||
|
||||
@@ -102,7 +141,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
|
||||
public record GotoOptions(String waitUntil, long timeout) {
|
||||
public static GotoOptions defaultValues() {
|
||||
return new GotoOptions("networkidle2", Duration.ofSeconds(10).toMillis());
|
||||
return new GotoOptions("load", Duration.ofSeconds(10).toMillis());
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -0,0 +1,113 @@
|
||||
package nu.marginalia.domsample.db;
|
||||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class DomSampleDbTest {
|
||||
Path tempDir;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
tempDir = Files.createTempDirectory("test");
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
void tearDown() throws IOException {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetUp() {
|
||||
var dbPath = tempDir.resolve("test.db");
|
||||
try (var db = new DomSampleDb(dbPath)) {
|
||||
}
|
||||
catch (Exception e) {
|
||||
fail("Failed to set up database: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSyncDomains() {
|
||||
var dbPath = tempDir.resolve("test.db");
|
||||
try (var db = new DomSampleDb(dbPath)) {
|
||||
|
||||
db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
|
||||
assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
|
||||
db.syncDomains(Set.of("example.com", "test.com"));
|
||||
assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
|
||||
db.syncDomains(Set.of("foobar.com", "test.com"));
|
||||
assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
|
||||
}
|
||||
catch (Exception e) {
|
||||
fail("Failed to sync domains: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFetchDomains() {
|
||||
var dbPath = tempDir.resolve("test.db");
|
||||
try (var db = new DomSampleDb(dbPath)) {
|
||||
|
||||
db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
|
||||
db.flagDomainAsFetched("example.com");
|
||||
db.flagDomainAsFetched("test.com");
|
||||
db.flagDomainAsFetched("foobar.com");
|
||||
assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
|
||||
db.flagDomainAsFetched("test.com");
|
||||
assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
|
||||
}
|
||||
catch (Exception e) {
|
||||
fail("Failed to sync domains: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void saveLoadSingle() {
|
||||
var dbPath = tempDir.resolve("test.db");
|
||||
try (var db = new DomSampleDb(dbPath)) {
|
||||
db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
|
||||
var samples = db.getSamples("example.com");
|
||||
assertEquals(1, samples.size());
|
||||
var sample = samples.getFirst();
|
||||
assertEquals("example.com", sample.domain());
|
||||
assertEquals("http://example.com/sample", sample.url());
|
||||
assertEquals("sample data", sample.sample());
|
||||
assertEquals("requests data", sample.requests());
|
||||
assertTrue(sample.acceptedPopover());
|
||||
}
|
||||
catch (Exception e) {
|
||||
fail("Failed to save/load sample: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void saveLoadTwo() {
|
||||
var dbPath = tempDir.resolve("test.db");
|
||||
try (var db = new DomSampleDb(dbPath)) {
|
||||
db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
|
||||
db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
|
||||
var samples = db.getSamples("example.com");
|
||||
assertEquals(2, samples.size());
|
||||
|
||||
Map<String, String> samplesByUrl = new HashMap<>();
|
||||
for (var sample : samples) {
|
||||
samplesByUrl.put(sample.url(), sample.sample());
|
||||
}
|
||||
|
||||
assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
|
||||
assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
|
||||
}
|
||||
catch (Exception e) {
|
||||
fail("Failed to save/load sample: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
@@ -3,17 +3,21 @@ package nu.marginalia.livecapture;
|
||||
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.domsample.db.DomSampleDb;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.containers.GenericContainer;
|
||||
import org.testcontainers.images.PullPolicy;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
import org.testcontainers.utility.DockerImageName;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
|
||||
import static com.github.tomakehurst.wiremock.client.WireMock.*;
|
||||
@@ -22,9 +26,14 @@ import static com.github.tomakehurst.wiremock.client.WireMock.*;
|
||||
@Testcontainers
|
||||
@Tag("slow")
|
||||
public class BrowserlessClientTest {
|
||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
|
||||
// Run gradle docker if this image is not available
|
||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
|
||||
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
|
||||
.withImagePullPolicy(PullPolicy.defaultPolicy())
|
||||
.withNetworkMode("bridge")
|
||||
.withLogConsumer(frame -> {
|
||||
System.out.print(frame.getUtf8String());
|
||||
})
|
||||
.withExposedPorts(3000);
|
||||
|
||||
static WireMockServer wireMockServer =
|
||||
@@ -34,6 +43,7 @@ public class BrowserlessClientTest {
|
||||
static String localIp;
|
||||
|
||||
static URI browserlessURI;
|
||||
static URI browserlessWssURI;
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() throws IOException {
|
||||
@@ -44,6 +54,12 @@ public class BrowserlessClientTest {
|
||||
container.getMappedPort(3000))
|
||||
);
|
||||
|
||||
browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
|
||||
container.getHost(),
|
||||
container.getMappedPort(3000))
|
||||
);
|
||||
|
||||
|
||||
wireMockServer.start();
|
||||
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
|
||||
|
||||
@@ -85,6 +101,30 @@ public class BrowserlessClientTest {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAnnotatedContent() throws Exception {
|
||||
|
||||
try (var client = new BrowserlessClient(browserlessURI);
|
||||
DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
|
||||
) {
|
||||
var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
|
||||
dbop.saveSample("marginalia.nu", "https://www.thesodacanstove.com/alcohol-stove/how-to-build/", content);
|
||||
System.out.println(content);
|
||||
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
||||
|
||||
dbop.getSamples("marginalia.nu").forEach(sample -> {
|
||||
System.out.println("Sample URL: " + sample.url());
|
||||
System.out.println("Sample Content: " + sample.sample());
|
||||
System.out.println("Sample Requests: " + sample.requests());
|
||||
System.out.println("Accepted Popover: " + sample.acceptedPopover());
|
||||
});
|
||||
}
|
||||
finally {
|
||||
Files.deleteIfExists(Path.of("/tmp/dom-sample.db"));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testScreenshot() throws Exception {
|
||||
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||
|
@@ -328,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
private final LongColumn.Writer timestampColumnWriter;
|
||||
private final EnumColumn.Writer contentTypeColumnWriter;
|
||||
private final ByteArrayColumn.Writer bodyColumnWriter;
|
||||
private final ShortColumn.Writer requestTimeColumnWriter;
|
||||
private final StringColumn.Writer headerColumnWriter;
|
||||
|
||||
public Writer(Path path) throws IOException {
|
||||
@@ -341,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
timestampColumnWriter = timestampColumn.create(this);
|
||||
contentTypeColumnWriter = contentTypeColumn.create(this);
|
||||
bodyColumnWriter = bodyColumn.create(this);
|
||||
requestTimeColumnWriter = requestTimeColumn.create(this);
|
||||
headerColumnWriter = headerColumn.create(this);
|
||||
}
|
||||
|
||||
@@ -353,6 +355,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
timestampColumnWriter.put(record.timestamp);
|
||||
contentTypeColumnWriter.put(record.contentType);
|
||||
bodyColumnWriter.put(record.body);
|
||||
requestTimeColumnWriter.put((short) record.requestTimeMs);
|
||||
headerColumnWriter.put(record.headers);
|
||||
}
|
||||
|
||||
@@ -493,8 +496,18 @@ public record SlopCrawlDataRecord(String domain,
|
||||
timestampColumnReader = timestampColumn.open(this);
|
||||
contentTypeColumnReader = contentTypeColumn.open(this);
|
||||
bodyColumnReader = bodyColumn.open(this);
|
||||
requestTimeColumnReader = requestTimeColumn.open(this);
|
||||
headerColumnReader = headerColumn.open(this);
|
||||
|
||||
// FIXME: After 2025-06-XX, we can remove this migration workaround
|
||||
ShortColumn.Reader timeColumnReader;
|
||||
try {
|
||||
timeColumnReader = requestTimeColumn.open(this);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
// Migration workaround
|
||||
timeColumnReader = null;
|
||||
}
|
||||
requestTimeColumnReader = timeColumnReader;
|
||||
}
|
||||
|
||||
public SlopCrawlDataRecord get() throws IOException {
|
||||
@@ -507,7 +520,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
timestampColumnReader.get(),
|
||||
contentTypeColumnReader.get(),
|
||||
bodyColumnReader.get(),
|
||||
requestTimeColumnReader.get(),
|
||||
requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1,
|
||||
headerColumnReader.get()
|
||||
);
|
||||
}
|
||||
@@ -543,8 +556,18 @@ public record SlopCrawlDataRecord(String domain,
|
||||
timestampColumnReader = timestampColumn.open(this);
|
||||
contentTypeColumnReader = contentTypeColumn.open(this);
|
||||
bodyColumnReader = bodyColumn.open(this);
|
||||
requestTimeColumnReader = requestTimeColumn.open(this);
|
||||
headerColumnReader = headerColumn.open(this);
|
||||
|
||||
// FIXME: After 2025-06-XX, we can remove this migration workaround
|
||||
ShortColumn.Reader timeColumnReader;
|
||||
try {
|
||||
timeColumnReader = requestTimeColumn.open(this);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
// Migration workaround
|
||||
timeColumnReader = null;
|
||||
}
|
||||
requestTimeColumnReader = timeColumnReader;
|
||||
}
|
||||
|
||||
public abstract boolean filter(String url, int status, String contentType);
|
||||
@@ -571,7 +594,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
boolean cookies = cookiesColumnReader.get() == 1;
|
||||
int status = statusColumnReader.get();
|
||||
long timestamp = timestampColumnReader.get();
|
||||
int requestTimeMs = requestTimeColumnReader.get();
|
||||
int requestTimeMs = requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1;
|
||||
String contentType = contentTypeColumnReader.get();
|
||||
|
||||
LargeItem<byte[]> body = bodyColumnReader.getLarge();
|
||||
|
3
code/tools/browserless/Dockerfile
Normal file
3
code/tools/browserless/Dockerfile
Normal file
@@ -0,0 +1,3 @@
|
||||
FROM ghcr.io/browserless/chromium:latest
|
||||
|
||||
COPY extension/ /dom-export
|
45
code/tools/browserless/build.gradle
Normal file
45
code/tools/browserless/build.gradle
Normal file
@@ -0,0 +1,45 @@
|
||||
plugins {
    id 'base'
}

def imageName = 'marginalia-browserless'
def imageTag = project.hasProperty('imageTag') ? project.getProperty('imageTag') : 'latest'

// Builds the browserless image from the Dockerfile in this project directory.
// Optional project properties: -PimageTag=<tag> (default 'latest'), -PnoCache=true
tasks.register('docker', Exec) {
    group = 'Docker'
    description = 'Builds a Docker image using the Dockerfile in project root'

    workingDir = projectDir

    // Build the Docker command; options are collected first so that optional
    // flags end up before the build context path, which goes last
    def dockerCommand = ['docker', 'build',
                         '-t', "${imageName}:${imageTag}",
                         '-f', 'Dockerfile',
                         '--pull',
                         '--build-arg', "BASE_DIR=."]

    // Add optional parameters if specified
    if (project.hasProperty('noCache') && project.getProperty('noCache').toBoolean()) {
        dockerCommand << '--no-cache'
    }

    dockerCommand << '.'

    commandLine dockerCommand

    doFirst {
        println "Building Docker image '${imageName}:${imageTag}'..."
    }

    doLast {
        println "Docker image '${imageName}:${imageTag}' has been built successfully."
    }
}

// Copies the extension folder into the build directory.
// NOTE(review): the docker task above builds with the project directory as context,
// so this copy does not appear to be what ends up in the image -- confirm it is still needed.
tasks.register('prepareExtension', Copy) {
    from 'extension'
    // layout.buildDirectory replaces the deprecated buildDir property
    into layout.buildDirectory.dir('docker/extension')
}

// Make the docker task depend on prepareExtension
tasks.named('docker').configure {
    dependsOn 'prepareExtension'
}
|
32
code/tools/browserless/extension/background.js
Normal file
32
code/tools/browserless/extension/background.js
Normal file
@@ -0,0 +1,32 @@
|
||||
|
||||
|
||||
// Listen to web requests and forward each one to the content script of the tab
// that issued it.  Nothing is buffered here: a message sent before the tab's
// content script has installed its listener is dropped.

chrome.webRequest.onBeforeRequest.addListener(
    (details) => {
        const requestData = {
            url: details.url,
            method: details.method,
            timestamp: Date.now()
        };
        console.log(requestData);

        // Requests that do not belong to a tab carry a negative tabId and have
        // no content script to deliver to
        if (details.tabId < 0) {
            return;
        }

        chrome.tabs.sendMessage(details.tabId, {
            type: 'URL_INTERCEPTED',
            ...requestData
        }, () => {
            // Reading lastError marks a "no receiver" failure as handled instead
            // of leaving it as an unchecked error
            void chrome.runtime.lastError;
        });
    },
    { urls: ["<all_urls>"] }
);

// Listen to web navigation events and re-register content scripts when a page is reloaded or navigated to a new subframe

chrome.webNavigation.onCommitted.addListener(function(details) {
    if (details.transitionType === 'reload' || details.transitionType === 'auto_subframe') {
        chrome.scripting.registerContentScripts([{
            id: "content-script",
            matches : [ "<all_urls>" ],
            js : [ "content.js" ]
        }], () => {
            // The id is fixed, so every registration after the first one fails as
            // a duplicate; that is expected and only worth a debug line
            if (chrome.runtime.lastError) {
                console.debug(`Content script registration skipped: ${chrome.runtime.lastError.message}`);
            }
        });
    }
});
|
646
code/tools/browserless/extension/content.js
Normal file
646
code/tools/browserless/extension/content.js
Normal file
@@ -0,0 +1,646 @@
|
||||
// This script runs in the context of web pages loaded by the browser extension
|
||||
|
||||
// Container element collecting one child per network request reported by the
// background script.  Declared with `var` on purpose so that evaluating this
// script more than once in the same context does not fail on redeclaration.
var networkRequests = document.createElement('div');
networkRequests.id = 'marginalia-network-requests';

// Each URL_INTERCEPTED message becomes a <div class="network-request"> carrying
// the request's url, method and timestamp as data-* attributes
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    if (message.type !== 'URL_INTERCEPTED') {
        return;
    }

    const entry = document.createElement('div');
    entry.className = 'network-request';
    entry.dataset.url = message.url;
    entry.dataset.method = message.method;
    entry.dataset.timestamp = message.timestamp;

    networkRequests.appendChild(entry);
});
|
||||
|
||||
// Copies selected CSS declarations onto the elements they apply to, as data
// attributes.
//
// For every same-origin stylesheet, each top-level style rule that declares at
// least one of the properties named in `propertyToAttrMap` is matched against
// the document, and every matching element receives the declared value under
// the mapped attribute name.  Rules are processed in stylesheet order, so a
// later rule overwrites the attribute written by an earlier one.  Cross-origin
// stylesheets are skipped with a warning; failures are logged per selector,
// per stylesheet, or for the whole pass.

function addStylesAsDataAttributes(propertyToAttrMap = {
    'display': 'data-display',
    'position': 'data-position',
    'visibility': 'data-visibility',
}) {
    const wanted = new Set(Object.keys(propertyToAttrMap).map(name => name.toLowerCase()));
    const sheets = Array.from(document.styleSheets);

    // Lower-cased names of the properties declared by `style` that are in `wanted`
    const declaredTargets = (style) => {
        const names = [];
        for (let k = 0; k < style.length; k++) {
            const name = style[k].toLowerCase();
            if (wanted.has(name)) {
                names.push(name);
            }
        }
        return names;
    };

    try {
        for (const sheet of sheets) {
            try {
                if (sheet.href && new URL(sheet.href).origin !== window.location.origin) {
                    console.warn(`Skipping cross-origin stylesheet: ${sheet.href}`);
                    continue;
                }

                const rules = sheet.cssRules || sheet.rules;
                if (!rules) continue;

                for (const rule of Array.from(rules)) {
                    // 1 is the numeric type of a plain style rule
                    if (rule.type !== 1) continue;

                    try {
                        const names = declaredTargets(rule.style);
                        if (names.length === 0) continue;

                        for (const element of document.querySelectorAll(rule.selectorText)) {
                            for (const name of names) {
                                element.setAttribute(propertyToAttrMap[name], rule.style.getPropertyValue(name));
                            }
                        }
                    } catch (selectorError) {
                        console.error(`Error processing selector "${rule.selectorText}": ${selectorError.message}`);
                    }
                }
            } catch (sheetError) {
                console.error(`Error processing stylesheet: ${sheetError.message}`);
            }
        }
    } catch (error) {
        console.error(`Error adding data attributes: ${error.message}`);
    }
}
|
||||
|
||||
|
||||
/**
 * Heuristic classifier for popover elements that also tries to dismiss
 * cookie-consent popovers by clicking their "accept" button.
 *
 * A popover is scored against cookie-related and newsletter-related keyword
 * lists plus a few structural hints (privacy links, e-mail inputs, ...) and is
 * categorized as 'cookie_consent', 'newsletter' or 'other'.  Only popovers
 * categorized as 'cookie_consent' are acted upon.
 */
class CookieConsentHandler {
    constructor() {
        // Keywords that strongly indicate cookie consent
        this.cookieKeywords = [
            'cookie', 'cookies', 'consent', 'gdpr', 'privacy policy', 'privacy notice',
            'data protection', 'tracking', 'analytics', 'personalization', 'advertising',
            'essential cookies', 'functional cookies', 'performance cookies'
        ];

        // Keywords that indicate newsletter/subscription popups
        this.newsletterKeywords = [
            'newsletter', 'subscribe', 'email', 'signup', 'sign up', 'updates',
            'notifications', 'discount', 'offer', 'deal', 'promo', 'exclusive'
        ];

        // Common button text for accepting cookies
        this.acceptButtonTexts = [
            'accept', 'accept all', 'allow all', 'agree', 'ok', 'got it',
            'i agree', 'continue', 'yes', 'enable', 'allow cookies',
            'accept cookies', 'accept all cookies', 'i understand'
        ];

        // Common button text for rejecting (to avoid clicking these)
        this.rejectButtonTexts = [
            'reject', 'decline', 'deny', 'refuse', 'no thanks', 'no',
            'reject all', 'decline all', 'manage preferences', 'customize',
            'settings', 'options', 'learn more'
        ];

        // Class-name patterns that mark a button as the preferred (accept) action
        this.acceptButtonStyles = [
            /primary/,
        ];
    }

    /**
     * Categorizes a popover and, if it is a cookie-consent popover, tries to
     * accept it.
     *
     * @param {Element} element popover root element
     * @returns {{category: string, action: string, reason: string,
     *            element?: Element, buttonClicked?: string}}
     *          `category` is 'cookie_consent', 'newsletter', 'other' or
     *          'unknown' (missing element / empty text); `action` is 'none'
     *          unless a click was attempted.
     */
    analyzePopover(element) {
        if (!element || !element.textContent) {
            return { category: 'unknown', action: 'none', reason: 'Invalid element' };
        }

        const textContent = element.textContent.toLowerCase();
        const category = this.categorizePopover(textContent, element);

        let result = {
            category: category,
            action: 'none',
            reason: '',
            element: element
        };

        if (category === 'cookie_consent') {
            const acceptResult = this.tryAcceptCookies(element);
            result.action = acceptResult.action;
            result.reason = acceptResult.reason;
            result.buttonClicked = acceptResult.buttonClicked;
        }

        return result;
    }

    /**
     * Scores the popover text and structure and picks a category.
     *
     * @param {string} textContent lower-cased text of the popover
     * @param {Element} element popover root element
     * @returns {string} 'cookie_consent', 'newsletter' or 'other'
     */
    categorizePopover(textContent, element) {
        let cookieScore = 0;
        let newsletterScore = 0;

        // Score based on keyword presence; the core keywords weigh more
        this.cookieKeywords.forEach(keyword => {
            if (textContent.includes(keyword)) {
                cookieScore += keyword === 'cookie' || keyword === 'cookies' ? 3 : 1;
            }
        });

        this.newsletterKeywords.forEach(keyword => {
            if (textContent.includes(keyword)) {
                newsletterScore += keyword === 'newsletter' || keyword === 'subscribe' ? 3 : 1;
            }
        });

        // Additional structural heuristics
        if (this.hasPrivacyPolicyLink(element)) cookieScore += 2;
        if (this.hasManagePreferencesButton(element)) cookieScore += 2;
        if (this.hasEmailInput(element)) newsletterScore += 3;
        if (this.hasDiscountMention(textContent)) newsletterScore += 2;

        // Special patterns that strongly indicate cookie consent
        const strongCookiePatterns = [
            /we use cookies/,
            /this website uses cookies/,
            /by continuing to use/,
            /essential.*cookies/,
            /improve.*experience/,
            /gdpr/,
            /data protection/
        ];

        if (strongCookiePatterns.some(pattern => pattern.test(textContent))) {
            cookieScore += 5;
        }

        // A category wins only with a strictly higher score of at least 2
        if (cookieScore > newsletterScore && cookieScore >= 2) {
            return 'cookie_consent';
        } else if (newsletterScore > cookieScore && newsletterScore >= 2) {
            return 'newsletter';
        } else {
            return 'other';
        }
    }

    /**
     * Finds and clicks the accept button of a cookie-consent popover.
     *
     * @param {Element} element popover root element
     * @returns {{action: string, reason: string, buttonClicked?: string,
     *            availableButtons?: string[]}}
     *          `action` is one of 'no_buttons_found', 'clicked_accept',
     *          'clicked_likely', 'click_failed' or 'no_accept_button'.
     */
    tryAcceptCookies(element) {
        const buttons = this.findButtons(element);

        if (buttons.length === 0) {
            return { action: 'no_buttons_found', reason: 'No clickable buttons found' };
        }

        // First, try to find explicit accept buttons
        const acceptButton = this.findAcceptButton(buttons);
        if (acceptButton) {
            return this.clickButton(acceptButton, 'clicked_accept', 'Found and clicked accept button');
        }

        // If no explicit accept button, try to find the most likely candidate
        const likelyButton = this.findMostLikelyAcceptButton(buttons);
        if (likelyButton) {
            return this.clickButton(likelyButton, 'clicked_likely', 'Clicked most likely accept button');
        }

        return {
            action: 'no_accept_button',
            reason: 'Could not identify accept button',
            availableButtons: buttons.map(btn => this.buttonLabel(btn))
        };
    }

    /**
     * Clicks a button and reports the outcome.
     *
     * @param {Element} button element to click
     * @param {string} action value reported as `action` when the click succeeds
     * @param {string} reason value reported as `reason` when the click succeeds
     * @returns {{action: string, reason: string, buttonClicked: string}}
     */
    clickButton(button, action, reason) {
        const buttonClicked = this.buttonLabel(button);
        try {
            button.click();
            return { action, reason, buttonClicked };
        } catch (error) {
            return {
                action: 'click_failed',
                reason: `Failed to click button: ${error.message}`,
                buttonClicked
            };
        }
    }

    /**
     * Returns the visible label of a button-like element.  `<input>` buttons
     * have no text content; their label is carried by the `value` property.
     *
     * @param {Element} button
     * @returns {string} trimmed label, '' when there is none
     */
    buttonLabel(button) {
        const text = (button.textContent || '').trim();
        if (text) {
            return text;
        }
        return typeof button.value === 'string' ? button.value.trim() : '';
    }

    /**
     * Tests whether a phrase occurs in a (lower-cased) label.  Phrases of up
     * to three characters ('ok', 'no', 'yes') must match as whole words so
     * that they are not found inside longer words such as "cookies" or
     * "know"; longer phrases keep plain substring matching.
     *
     * @param {string} text lower-cased label
     * @param {string} phrase lower-cased phrase to look for
     * @returns {boolean}
     */
    matchesPhrase(text, phrase) {
        if (phrase.length > 3) {
            return text.includes(phrase);
        }
        const escaped = phrase.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        return new RegExp(`\\b${escaped}\\b`).test(text);
    }

    /**
     * @param {Element} button
     * @returns {boolean} true when the button's label matches a reject text
     */
    isRejectButton(button) {
        const text = this.buttonLabel(button).toLowerCase();
        return this.rejectButtonTexts.some(rejectText => this.matchesPhrase(text, rejectText));
    }

    /**
     * Collects the visible button-like descendants of an element.
     *
     * @param {Element} element popover root element
     * @returns {Element[]} de-duplicated elements with a non-zero
     *          offsetWidth and offsetHeight, ordered by selector
     */
    findButtons(element) {
        const selectors = [
            'button',
            'input[type="button"]',
            'input[type="submit"]',
            '[role="button"]',
            'a[href="#"]',
            '.button',
            '.btn',
            '.btn-primary'
        ];

        const buttons = [];
        selectors.forEach(selector => {
            const found = element.querySelectorAll(selector);
            buttons.push(...Array.from(found));
        });

        // Remove duplicates and filter visible buttons
        return [...new Set(buttons)].filter(btn =>
            btn.offsetWidth > 0 && btn.offsetHeight > 0
        );
    }

    /**
     * Finds an explicit accept button.  Buttons whose label matches a reject
     * text are never returned.  Among the rest, a button whose class matches
     * `acceptButtonStyles` wins over one whose label matches an accept text.
     *
     * @param {Element[]} buttons candidate buttons
     * @returns {Element|undefined} the accept button, undefined when none matches
     */
    findAcceptButton(buttons) {
        const candidates = buttons.filter(button => !this.isRejectButton(button));

        const byClass = candidates.find(button => {
            const classes = typeof button.className === 'string' ? button.className.toLowerCase() : '';
            return this.acceptButtonStyles.some(pattern => pattern.test(classes));
        });

        if (byClass != null) {
            return byClass;
        }

        return candidates.find(button => {
            const text = this.buttonLabel(button).toLowerCase();
            return this.acceptButtonTexts.some(acceptText => this.matchesPhrase(text, acceptText));
        });
    }

    /**
     * Fallback used when no explicit accept button exists: a lone button that
     * is not a reject button, or otherwise the first non-reject button with
     * "positive" styling (class name or green background colour).
     *
     * @param {Element[]} buttons candidate buttons
     * @returns {Element|null}
     */
    findMostLikelyAcceptButton(buttons) {
        const candidates = buttons.filter(button => !this.isRejectButton(button));

        // If there's only one button and it's not explicitly a reject button, assume it's accept
        if (buttons.length === 1 && candidates.length === 1) {
            return candidates[0];
        }

        // Look for buttons with positive styling (often green, primary, etc.)
        const positiveButton = candidates.find(button => {
            const classes = typeof button.className === 'string' ? button.className.toLowerCase() : '';
            if (classes.includes('primary') || classes.includes('accept') || classes.includes('green')) {
                return true;
            }

            // Only consult computed style when the class names are inconclusive
            const bgColor = window.getComputedStyle(button).backgroundColor;
            return bgColor.includes('rgb(0, 128, 0)') || // green variations
                bgColor.includes('rgb(40, 167, 69)'); // bootstrap success
        });

        return positiveButton || null;
    }

    /**
     * @param {Element} element popover root element
     * @returns {boolean} true when a link's text or href mentions 'privacy'
     */
    hasPrivacyPolicyLink(element) {
        const links = element.querySelectorAll('a');
        return Array.from(links).some(link => {
            // href is not a string on SVG <a> elements; treat those as having no href
            const href = typeof link.href === 'string' ? link.href : '';
            return (link.textContent || '').toLowerCase().includes('privacy') ||
                href.toLowerCase().includes('privacy');
        });
    }

    /**
     * @param {Element} element popover root element
     * @returns {boolean} true when a visible button offers preference management
     */
    hasManagePreferencesButton(element) {
        const buttons = this.findButtons(element);
        return buttons.some(button => {
            const text = this.buttonLabel(button).toLowerCase();
            return text.includes('manage') || text.includes('preferences') ||
                text.includes('settings') || text.includes('customize');
        });
    }

    /**
     * @param {Element} element popover root element
     * @returns {boolean} true when the popover contains an e-mail input
     */
    hasEmailInput(element) {
        const inputs = element.querySelectorAll('input[type="email"], input[placeholder*="email" i]');
        return inputs.length > 0;
    }

    /**
     * @param {string} textContent lower-cased text of the popover
     * @returns {boolean} true when the text contains a discount-related term
     */
    hasDiscountMention(textContent) {
        const discountTerms = ['discount', 'off', '%', 'save', 'deal', 'offer'];
        return discountTerms.some(term => textContent.includes(term));
    }
}
|
||||
|
||||
|
||||
// Set to true once a cookie-consent button has been clicked on this page
var agreedToPopover = false;

/**
 * Analyzes a single popover element and, for cookie-consent popovers, lets
 * CookieConsentHandler try to accept it.  Sets the global `agreedToPopover`
 * flag when a button was actually clicked, i.e. for both the 'clicked_accept'
 * and the 'clicked_likely' outcomes.
 *
 * @param {Element} popoverElement candidate popover element
 * @returns {object} the analysis result produced by
 *          CookieConsentHandler.analyzePopover
 */
function handlePopover(popoverElement) {
    const handler = new CookieConsentHandler();
    const result = handler.analyzePopover(popoverElement);

    console.log('Popover analysis result:', result);

    switch (result.category) {
        case 'cookie_consent':
            console.log('Detected cookie consent popover');
            // Both actions mean a button was clicked; anything else is a failure
            if (result.action === 'clicked_accept' || result.action === 'clicked_likely') {
                console.log('Successfully accepted cookies');
                agreedToPopover = true;
            } else {
                console.log('Could not accept cookies:', result.reason);
            }
            break;
        case 'newsletter':
            console.log('Detected newsletter popover - no action taken');
            break;
        default:
            console.log('Unknown popover type - no action taken');
    }

    return result;
}
|
||||
|
||||
|
||||
/**
 * Final processing step for the page: copies targeted styles into data
 * attributes, tries to accept cookie-consent popovers among the elements
 * marked `data-position="fixed"`, and finally appends the `networkRequests`
 * container to the body and sets the body id to 'marginaliahack'.
 *
 * A failure while handling one popover is logged and does not prevent the
 * remaining popovers from being handled or the page from being finalized.
 *
 * NOTE(review): the 'marginaliahack' id presumably signals completion to
 * whatever consumes the page; confirm against the caller.
 */
function finalizeMarginaliaHack() {
    addStylesAsDataAttributes();

    // Find all likely popover elements
    const fixedElements = document.querySelectorAll('[data-position="fixed"]');

    // Attempt to agree to cookie consent popups
    fixedElements.forEach(element => {
        try {
            handlePopover(element);
        } catch (error) {
            console.error(`Error handling popover: ${error.message}`);
        }
    });

    const finalize = () => {
        // Add a container for network requests
        document.body.appendChild(networkRequests);
        document.body.setAttribute('id', 'marginaliahack');
    };

    if (agreedToPopover) {
        // If we found a popover and agreed to it, add a notice
        const notice = document.createElement('div');
        notice.setAttribute('class', 'marginalia-agreed-cookies');
        networkRequests.appendChild(notice);

        // Wait a bit before finalizing to let the ad networks load
        // so we can capture their requests
        setTimeout(finalize, 2500);
    } else {
        finalize();
    }
}
|
||||
|
||||
|
||||
|
||||
/**
 * Dispatches synthetic scroll and mouse events to imitate a user browsing
 * the page.  Every dispatched event carries `simulated = true`.
 */
class EventSimulator {
    constructor() {}

    /**
     * Drives a requestAnimationFrame loop for `duration` milliseconds.
     *
     * @param {number} duration total run time in milliseconds
     * @param {function(number): void} onFrame called once per frame with the
     *        linear progress in the range [0, 1]
     * @returns {Promise<void>} resolves after the frame where progress reaches 1
     */
    runAnimation(duration, onFrame) {
        return new Promise((resolve) => {
            const startTime = Date.now();

            const step = () => {
                const elapsed = Date.now() - startTime;
                const progress = Math.min(elapsed / duration, 1);

                onFrame(progress);

                if (progress < 1) {
                    requestAnimationFrame(step);
                } else {
                    resolve();
                }
            };

            requestAnimationFrame(step);
        });
    }

    /**
     * Simulates smooth scrolling down the page.
     *
     * @param {number} duration scroll time in milliseconds
     * @param {?number} distance pixels to scroll; when falsy, scrolls up to
     *        three viewport heights, limited to the remaining page height
     * @returns {Promise<void>} resolves when scrolling is done, or
     *          immediately when there is nothing to scroll
     */
    simulateScrollDown(duration = 2000, distance = null) {
        const startScrollY = window.scrollY;
        const maxScroll = document.documentElement.scrollHeight - window.innerHeight;
        const targetDistance = distance || Math.min(window.innerHeight * 3, maxScroll - startScrollY);

        if (targetDistance <= 0) {
            return Promise.resolve();
        }

        return this.runAnimation(duration, (progress) => {
            // Ease-out function for smooth scrolling
            const easeOut = 1 - Math.pow(1 - progress, 3);
            const newScrollY = startScrollY + targetDistance * easeOut;

            window.scrollTo(0, newScrollY);

            // Fire custom scroll event
            const scrollEvent = new Event('scroll', {
                bubbles: true,
                cancelable: true
            });

            // Add custom properties to track simulation
            scrollEvent.simulated = true;
            scrollEvent.scrollY = newScrollY;
            scrollEvent.progress = progress;

            window.dispatchEvent(scrollEvent);
            document.dispatchEvent(scrollEvent);
        });
    }

    /**
     * Simulates mouse movement from the centre of the viewport toward the
     * URL bar.  Browser chrome is not reachable from the page, so the pointer
     * travels to a point 50px above the top edge of the viewport.
     *
     * Dispatches 'mousemove' on the element under the pointer, on the
     * document and on the window, plus 'mouseleave'/'mouseenter' whenever the
     * element under the pointer differs from the one hit on the previous frame.
     *
     * @param {number} duration movement time in milliseconds
     * @returns {Promise<void>} resolves when the movement is done
     */
    simulateMouseToURLBar(duration = 1500) {
        // Start from center of viewport
        const startX = window.innerWidth / 2;
        const startY = window.innerHeight / 2;

        const targetX = window.innerWidth / 2; // Center horizontally
        const targetY = -50; // Above the viewport (simulating URL bar position)

        const deltaX = targetX - startX;
        const deltaY = targetY - startY;

        let lastMouseEvent = null;
        // Element hit on the previous frame.  It is remembered rather than
        // re-hit-tested at the old coordinates, because the page may have
        // scrolled in the meantime and a different element would be found.
        let lastElement = null;

        return this.runAnimation(duration, (progress) => {
            // Ease-in-out function for natural mouse movement
            const easeInOut = progress < 0.5
                ? 2 * progress * progress
                : 1 - Math.pow(-2 * progress + 2, 3) / 2;

            const currentX = startX + (deltaX * easeInOut);
            const currentY = startY + (deltaY * easeInOut);

            // Create mouse move event
            const mouseMoveEvent = new MouseEvent('mousemove', {
                bubbles: true,
                cancelable: true,
                clientX: currentX,
                clientY: currentY,
                screenX: currentX,
                screenY: currentY,
                movementX: lastMouseEvent ? currentX - lastMouseEvent.clientX : 0,
                movementY: lastMouseEvent ? currentY - lastMouseEvent.clientY : 0,
                buttons: 0,
                button: -1
            });

            // Add custom properties to track simulation
            mouseMoveEvent.simulated = true;
            mouseMoveEvent.progress = progress;
            mouseMoveEvent.targetType = 'urlbar';

            // Find element under mouse and dispatch event
            const elementUnderMouse = document.elementFromPoint(currentX, currentY);
            if (elementUnderMouse) {
                elementUnderMouse.dispatchEvent(mouseMoveEvent);

                // Also fire mouseleave/mouseenter events if element changed
                if (lastElement && lastElement !== elementUnderMouse) {
                    // Mouse left previous element
                    const mouseLeaveEvent = new MouseEvent('mouseleave', {
                        bubbles: false, // mouseleave doesn't bubble
                        cancelable: true,
                        clientX: currentX,
                        clientY: currentY,
                        relatedTarget: elementUnderMouse
                    });
                    mouseLeaveEvent.simulated = true;
                    lastElement.dispatchEvent(mouseLeaveEvent);

                    // Mouse entered new element
                    const mouseEnterEvent = new MouseEvent('mouseenter', {
                        bubbles: false, // mouseenter doesn't bubble
                        cancelable: true,
                        clientX: currentX,
                        clientY: currentY,
                        relatedTarget: lastElement
                    });
                    mouseEnterEvent.simulated = true;
                    elementUnderMouse.dispatchEvent(mouseEnterEvent);
                }
            }

            // Also dispatch on document and window
            document.dispatchEvent(mouseMoveEvent);
            window.dispatchEvent(mouseMoveEvent);

            lastMouseEvent = mouseMoveEvent;
            lastElement = elementUnderMouse;
        });
    }

    /**
     * Simulates mouse movement from the centre of the viewport to a target
     * point, with random jitter that fades out as the pointer approaches the
     * target.  Dispatches 'mousemove' on the document only.
     *
     * @param {number} targetX target x coordinate (client pixels)
     * @param {number} targetY target y coordinate (client pixels)
     * @param {number} duration movement time in milliseconds
     * @returns {Promise<void>} resolves when the movement is done
     */
    simulateNaturalMouseMovement(targetX, targetY, duration = 1000) {
        const startX = window.innerWidth / 2;
        const startY = window.innerHeight / 2;

        const basePathX = targetX - startX;
        const basePathY = targetY - startY;

        return this.runAnimation(duration, (progress) => {
            // Add some randomness to make movement more natural
            const randomOffsetX = (Math.random() - 0.5) * 10 * (1 - progress);
            const randomOffsetY = (Math.random() - 0.5) * 10 * (1 - progress);

            // Smoothstep easing (3t^2 - 2t^3): slow start, slow stop
            const t = progress;
            const smoothProgress = t * t * (3.0 - 2.0 * t);

            const currentX = startX + (basePathX * smoothProgress) + randomOffsetX;
            const currentY = startY + (basePathY * smoothProgress) + randomOffsetY;

            const mouseMoveEvent = new MouseEvent('mousemove', {
                bubbles: true,
                cancelable: true,
                clientX: currentX,
                clientY: currentY,
                screenX: currentX,
                screenY: currentY
            });

            mouseMoveEvent.simulated = true;
            mouseMoveEvent.natural = true;

            document.dispatchEvent(mouseMoveEvent);
        });
    }

    /**
     * Combined simulation: scrolls down while moving the mouse toward the
     * URL bar, pauses briefly, then performs a short natural mouse movement.
     *
     * @returns {Promise<void>} resolves when the whole sequence is done
     */
    async simulateBrowsingBehavior() {
        // Start both animations simultaneously
        const scrollPromise = this.simulateScrollDown(300);
        const mousePromise = this.simulateMouseToURLBar(200);

        // Wait for both to complete
        await Promise.all([scrollPromise, mousePromise]);

        // Add a small pause
        await new Promise(resolve => setTimeout(resolve, 100));

        // Simulate some additional natural mouse movement
        await this.simulateNaturalMouseMovement(
            window.innerWidth * 0.3,
            window.innerHeight * 0.1,
            100
        );

        console.log('Browsing behavior simulation completed');
    }
}
|
||||
|
||||
// Shared simulator instance used by the load handler below
const simulator = new EventSimulator();

/**
 * Runs the browsing-behavior simulation and logs a message once it has
 * completed.  Does not wait for the simulation; returns immediately.
 */
function simulateUserBehavior() {
    const reportDone = () => {
        console.log('User behavior simulation finished');
    };

    simulator.simulateBrowsingBehavior().then(reportDone);
}

// On page load: start the simulated user activity right away ...
window.addEventListener("load", () => {
    simulateUserBehavior();
});

// ... and schedule the final processing step two seconds later
window.addEventListener("load", () => {
    setTimeout(finalizeMarginaliaHack, 2000);
});
|
29
code/tools/browserless/extension/manifest.json
Normal file
29
code/tools/browserless/extension/manifest.json
Normal file
@@ -0,0 +1,29 @@
|
||||
{
|
||||
"manifest_version": 3,
|
||||
"name": "Marginalia DOM Interceptor",
|
||||
"version": "1.0",
|
||||
"description": "Makes DOM export better",
|
||||
|
||||
"permissions": [
|
||||
"activeTab",
|
||||
"scripting",
|
||||
"webNavigation",
|
||||
"webRequest"
|
||||
],
|
||||
"host_permissions": [
|
||||
"<all_urls>"
|
||||
],
|
||||
"background": {
|
||||
"service_worker": "background.js",
|
||||
"type": "module"
|
||||
},
|
||||
"content_scripts": [
|
||||
{
|
||||
"js": ["content.js"],
|
||||
"run_at": "document_start",
|
||||
"matches": [
|
||||
"<all_urls>"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@@ -8,4 +8,5 @@
|
||||
2025-05-05: Deploy executor partition 4.
|
||||
2025-05-05: Deploy control.
|
||||
2025-05-08: Deploy assistant.
|
||||
2025-05-17: Redeploy all.
|
||||
2025-05-17: Redeploy all.
|
||||
2025-05-28: Deploy assistant and browserless.
|
@@ -93,6 +93,7 @@ include 'code:tools:experiment-runner'
|
||||
include 'code:tools:screenshot-capture-tool'
|
||||
include 'code:tools:load-test'
|
||||
include 'code:tools:integration-test'
|
||||
include 'code:tools:browserless'
|
||||
|
||||
include 'third-party:porterstemmer'
|
||||
include 'third-party:symspell'
|
||||
|
@@ -272,6 +272,13 @@ if __name__ == '__main__':
|
||||
deploy_tier=1,
|
||||
groups={"all", "core"}
|
||||
),
|
||||
'browserless': ServiceConfig(
|
||||
gradle_target=':code:tools:browserless:docker',
|
||||
docker_name='browserless',
|
||||
instances=None,
|
||||
deploy_tier=2,
|
||||
groups={"all", "core"}
|
||||
),
|
||||
'assistant': ServiceConfig(
|
||||
gradle_target=':code:services-core:assistant-service:docker',
|
||||
docker_name='assistant-service',
|
||||
|
Reference in New Issue
Block a user