mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
6 Commits
deploy-009
...
deploy-010
Author | SHA1 | Date | |
---|---|---|---|
|
9f18ced73d | ||
|
18e91269ab | ||
|
e315ca5758 | ||
|
3ceea17c1d | ||
|
b34527c1a3 | ||
|
185bf28fca |
@@ -41,7 +41,10 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.security.Security;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
@@ -248,44 +251,35 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
|
||||
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
|
||||
// this will more aggressively attempt to schedule the jobs to avoid blocking
|
||||
List<CrawlTask> deferredTasks = new LinkedList<>();
|
||||
List<CrawlTask> taskList = new ArrayList<>();
|
||||
|
||||
// Create crawl tasks and submit them to the pool for execution
|
||||
// Create crawl tasks
|
||||
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
||||
if (workLog.isJobFinished(crawlSpec.domain()))
|
||||
if (workLog.isJobFinished(crawlSpec.domain))
|
||||
continue;
|
||||
|
||||
// Add to the end of the deferral list
|
||||
deferredTasks.addLast(new CrawlTask(
|
||||
var task = new CrawlTask(
|
||||
crawlSpec,
|
||||
anchorTagsSource,
|
||||
outputDir,
|
||||
warcArchiver,
|
||||
domainStateDb,
|
||||
workLog));
|
||||
workLog);
|
||||
|
||||
// Start every task we currently can from the deferral list
|
||||
deferredTasks.removeIf(task -> {
|
||||
if (task.canRun()) {
|
||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
|
||||
return true; // task has already run, duplicate in crawl specs
|
||||
}
|
||||
|
||||
// This blocks the caller when the pool is full
|
||||
pool.submitQuietly(task);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
|
||||
if (!trySubmitDeferredTask(task)) {
|
||||
// Otherwise add to the taskList for deferred execution
|
||||
taskList.add(task);
|
||||
}
|
||||
}
|
||||
|
||||
// Schedule any lingering tasks for immediate execution
|
||||
for (var task : deferredTasks) {
|
||||
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
|
||||
continue;
|
||||
// Schedule viable tasks for execution until list is empty
|
||||
while (!taskList.isEmpty()) {
|
||||
taskList.removeIf(this::trySubmitDeferredTask);
|
||||
|
||||
pool.submitQuietly(task);
|
||||
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
|
||||
// we might have no new viable tasks to run for hours on end
|
||||
TimeUnit.MILLISECONDS.sleep(50);
|
||||
}
|
||||
|
||||
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
||||
@@ -312,6 +306,28 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
}
|
||||
|
||||
/** Submit a task for execution if it can be run, returns true if it was submitted
|
||||
* or if it can be discarded */
|
||||
private boolean trySubmitDeferredTask(CrawlTask task) {
|
||||
if (!task.canRun()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
|
||||
return true; // task has already run, duplicate in crawl specs
|
||||
}
|
||||
|
||||
try {
|
||||
// This blocks the caller when the pool is full
|
||||
pool.submitQuietly(task);
|
||||
return true;
|
||||
}
|
||||
catch (RuntimeException ex) {
|
||||
logger.error("Failed to submit task " + task.domain, ex);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
|
||||
runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
|
||||
}
|
||||
@@ -377,6 +393,11 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
@Override
|
||||
public void run() throws Exception {
|
||||
|
||||
if (workLog.isJobFinished(domain)) { // No-Op
|
||||
logger.info("Omitting task {}, as it is already run", domain);
|
||||
return;
|
||||
}
|
||||
|
||||
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
||||
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
||||
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
||||
@@ -431,7 +452,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
logger.error("Error fetching domain " + domain, e);
|
||||
}
|
||||
finally {
|
||||
// We don't need to double-count these; it's also kept int he workLog
|
||||
// We don't need to double-count these; it's also kept in the workLog
|
||||
pendingCrawlTasks.remove(domain);
|
||||
Thread.currentThread().setName("[idle]");
|
||||
|
||||
@@ -522,7 +543,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
//
|
||||
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
|
||||
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
|
||||
if (!inputPath.endsWith(".parquet")) {
|
||||
if (!inputPath.toString().endsWith(".parquet")) {
|
||||
return inputPath;
|
||||
}
|
||||
|
||||
|
@@ -81,6 +81,7 @@ public class SearchFilters {
|
||||
),
|
||||
List.of(
|
||||
new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
|
||||
new Filter("Small Web", "fa-minus", SearchProfile.SMALLWEB, parameters),
|
||||
new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
|
||||
new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
|
||||
),
|
||||
|
@@ -9,6 +9,14 @@
|
||||
nicotine: '#f8f8ee',
|
||||
margeblue: '#3e5f6f',
|
||||
liteblue: '#0066cc',
|
||||
},
|
||||
screens: {
|
||||
'coarsepointer': {
|
||||
'raw': '(pointer: coarse)'
|
||||
},
|
||||
'finepointer': {
|
||||
'raw': '(pointer: fine)'
|
||||
},
|
||||
}
|
||||
},
|
||||
screens: {
|
||||
|
@@ -23,7 +23,7 @@
|
||||
@template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
|
||||
</div>
|
||||
<div class="grow"></div>
|
||||
<button class="fixed bottom-10 right-5 sm:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
|
||||
<button class="fixed bottom-10 right-5 finepointer:hidden md:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
|
||||
<i class="fas fa-filter mr-3"></i>
|
||||
Filters
|
||||
</button>
|
||||
|
@@ -3,7 +3,7 @@
|
||||
|
||||
@param SearchFilters filters
|
||||
|
||||
<aside class="md:w-64 py-4 shrink-0 hidden sm:block">
|
||||
<aside class="md:w-64 py-4 shrink-0 hidden md:block finepointer:block">
|
||||
<div class="space-y-6 sticky top-4">
|
||||
<div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
|
||||
<h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">
|
||||
|
@@ -9,6 +9,14 @@ module.exports = {
|
||||
nicotine: '#f8f8ee',
|
||||
margeblue: '#3e5f6f',
|
||||
liteblue: '#0066cc',
|
||||
},
|
||||
screens: {
|
||||
'coarsepointer': {
|
||||
'raw': '(pointer: coarse)'
|
||||
},
|
||||
'finepointer': {
|
||||
'raw': '(pointer: fine)'
|
||||
},
|
||||
}
|
||||
},
|
||||
screens: {
|
||||
|
Reference in New Issue
Block a user