mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Look, this will make the git history look funny, but trimming unnecessary depth from the source tree is a very necessary sanity-preserving measure when dealing with a super-modularized codebase like this one. While it makes the project configuration a bit less conventional, it will save you several clicks every time you jump between modules. Which you'll do a lot, because it's *modul*ar. The src/main/java convention makes a lot of sense for a non-modular project though. This ain't that.
119 lines
4.0 KiB
Java
119 lines
4.0 KiB
Java
package nu.marginalia.db;
|
|
|
|
import com.zaxxer.hikari.HikariDataSource;
|
|
|
|
import java.sql.Connection;
|
|
import java.sql.PreparedStatement;
|
|
import java.sql.SQLException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.OptionalInt;
|
|
|
|
/** Class used in exporting data. This is intended to be used for a brief time
|
|
* and then discarded, not kept around as a service.
|
|
*/
|
|
public class DbDomainStatsExportMultitool implements AutoCloseable {
|
|
private final Connection connection;
|
|
private final int nodeId;
|
|
private final PreparedStatement knownUrlsQuery;
|
|
private final PreparedStatement visitedUrlsQuery;
|
|
private final PreparedStatement goodUrlsQuery;
|
|
private final PreparedStatement domainNameToId;
|
|
|
|
private final PreparedStatement allDomainsQuery;
|
|
private final PreparedStatement crawlQueueDomains;
|
|
private final PreparedStatement indexedDomainsQuery;
|
|
|
|
public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
|
|
this.connection = dataSource.getConnection();
|
|
this.nodeId = nodeId;
|
|
|
|
knownUrlsQuery = connection.prepareStatement("""
|
|
SELECT KNOWN_URLS
|
|
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
|
|
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
WHERE DOMAIN_NAME=?
|
|
""");
|
|
visitedUrlsQuery = connection.prepareStatement("""
|
|
SELECT VISITED_URLS
|
|
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
|
|
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
WHERE DOMAIN_NAME=?
|
|
""");
|
|
goodUrlsQuery = connection.prepareStatement("""
|
|
SELECT GOOD_URLS
|
|
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
|
|
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
|
WHERE DOMAIN_NAME=?
|
|
""");
|
|
domainNameToId = connection.prepareStatement("""
|
|
SELECT ID
|
|
FROM EC_DOMAIN
|
|
WHERE DOMAIN_NAME=?
|
|
""");
|
|
allDomainsQuery = connection.prepareStatement("""
|
|
SELECT DOMAIN_NAME
|
|
FROM EC_DOMAIN
|
|
""");
|
|
crawlQueueDomains = connection.prepareStatement("""
|
|
SELECT DOMAIN_NAME
|
|
FROM CRAWL_QUEUE
|
|
""");
|
|
indexedDomainsQuery = connection.prepareStatement("""
|
|
SELECT DOMAIN_NAME
|
|
FROM EC_DOMAIN
|
|
WHERE INDEXED > 0
|
|
""");
|
|
}
|
|
|
|
public OptionalInt getVisitedUrls(String domainName) throws SQLException {
|
|
return executeNameToIntQuery(domainName, visitedUrlsQuery);
|
|
}
|
|
|
|
public OptionalInt getDomainId(String domainName) throws SQLException {
|
|
return executeNameToIntQuery(domainName, domainNameToId);
|
|
}
|
|
|
|
public List<String> getCrawlQueueDomains() throws SQLException {
|
|
return executeListQuery(crawlQueueDomains, 100);
|
|
}
|
|
public List<String> getAllIndexedDomains() throws SQLException {
|
|
return executeListQuery(indexedDomainsQuery, 100_000);
|
|
}
|
|
|
|
private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
|
|
throws SQLException {
|
|
statement.setString(1, domainName);
|
|
var rs = statement.executeQuery();
|
|
|
|
if (rs.next()) {
|
|
return OptionalInt.of(rs.getInt(1));
|
|
}
|
|
|
|
return OptionalInt.empty();
|
|
}
|
|
|
|
private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
|
|
List<String> ret = new ArrayList<>(sizeHint);
|
|
|
|
var rs = statement.executeQuery();
|
|
|
|
while (rs.next()) {
|
|
ret.add(rs.getString(1));
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
@Override
|
|
public void close() throws SQLException {
|
|
knownUrlsQuery.close();
|
|
goodUrlsQuery.close();
|
|
visitedUrlsQuery.close();
|
|
allDomainsQuery.close();
|
|
crawlQueueDomains.close();
|
|
domainNameToId.close();
|
|
connection.close();
|
|
}
|
|
}
|