1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00
Files
MarginaliaSearch/code/common/db/java/nu/marginalia/db/DbDomainStatsExportMultitool.java
Viktor Lofgren 1d34224416 (refac) Remove src/main from all source code paths.
Look, this will make the git history look funny, but trimming unnecessary depth from the source tree is a very necessary sanity-preserving measure when dealing with a super-modularized codebase like this one.

While it makes the project configuration a bit less conventional, it will save you several clicks every time you jump between modules.  Which you'll do a lot, because it's *modul*ar.  The src/main/java convention makes a lot of sense for a non-modular project though.  This ain't that.
2024-02-23 16:13:40 +01:00

119 lines
4.0 KiB
Java

package nu.marginalia.db;
import com.zaxxer.hikari.HikariDataSource;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.OptionalInt;
/**
 * Short-lived helper used when exporting domain statistics.
 *
 * <p>An instance borrows one connection from the supplied data source and
 * prepares all of its queries up front. It is intended to be used for a brief
 * time and then discarded, not kept around as a service; use it in a
 * try-with-resources block so the statements and the connection are released.
 *
 * <p>The prepared statements are re-used across calls and no synchronization
 * is performed here.
 */
public class DbDomainStatsExportMultitool implements AutoCloseable {
    private final Connection connection;
    // NOTE(review): stored but not referenced by any method visible in this class.
    private final int nodeId;

    // NOTE(review): knownUrlsQuery, goodUrlsQuery and allDomainsQuery are prepared
    // and closed, but no method visible in this class executes them.
    private final PreparedStatement knownUrlsQuery;
    private final PreparedStatement visitedUrlsQuery;
    private final PreparedStatement goodUrlsQuery;
    private final PreparedStatement domainNameToId;
    private final PreparedStatement allDomainsQuery;
    private final PreparedStatement crawlQueueDomains;
    private final PreparedStatement indexedDomainsQuery;

    /**
     * Borrows a connection from {@code dataSource} and prepares every query.
     *
     * @param dataSource pool to borrow the connection from
     * @param nodeId     node identifier, stored on the instance
     * @throws SQLException if the connection cannot be obtained or a statement
     *                      cannot be prepared; in the latter case the borrowed
     *                      connection is closed before the exception propagates
     */
    public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
        this.nodeId = nodeId;
        this.connection = dataSource.getConnection();

        try {
            knownUrlsQuery = connection.prepareStatement("""
                    SELECT KNOWN_URLS
                    FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                    ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE DOMAIN_NAME=?
                    """);
            visitedUrlsQuery = connection.prepareStatement("""
                    SELECT VISITED_URLS
                    FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                    ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE DOMAIN_NAME=?
                    """);
            goodUrlsQuery = connection.prepareStatement("""
                    SELECT GOOD_URLS
                    FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                    ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                    WHERE DOMAIN_NAME=?
                    """);
            domainNameToId = connection.prepareStatement("""
                    SELECT ID
                    FROM EC_DOMAIN
                    WHERE DOMAIN_NAME=?
                    """);
            allDomainsQuery = connection.prepareStatement("""
                    SELECT DOMAIN_NAME
                    FROM EC_DOMAIN
                    """);
            crawlQueueDomains = connection.prepareStatement("""
                    SELECT DOMAIN_NAME
                    FROM CRAWL_QUEUE
                    """);
            indexedDomainsQuery = connection.prepareStatement("""
                    SELECT DOMAIN_NAME
                    FROM EC_DOMAIN
                    WHERE INDEXED > 0
                    """);
        } catch (SQLException | RuntimeException ex) {
            // The caller never receives an instance to close when construction
            // fails, so hand the connection back here instead of leaking it.
            try {
                connection.close();
            } catch (SQLException closeFailure) {
                ex.addSuppressed(closeFailure);
            }
            throw ex;
        }
    }

    /**
     * Looks up the VISITED_URLS value for a domain.
     *
     * @param domainName value bound to the DOMAIN_NAME parameter of the query
     * @return the value from the first matching row, or empty if no row matches
     * @throws SQLException if the query fails
     */
    public OptionalInt getVisitedUrls(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, visitedUrlsQuery);
    }

    /**
     * Looks up the EC_DOMAIN.ID for a domain.
     *
     * @param domainName value bound to the DOMAIN_NAME parameter of the query
     * @return the id from the first matching row, or empty if no row matches
     * @throws SQLException if the query fails
     */
    public OptionalInt getDomainId(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, domainNameToId);
    }

    /**
     * Lists every DOMAIN_NAME in CRAWL_QUEUE.
     *
     * @return a new mutable list of domain names, in result-set order
     * @throws SQLException if the query fails
     */
    public List<String> getCrawlQueueDomains() throws SQLException {
        return executeListQuery(crawlQueueDomains, 100);
    }

    /**
     * Lists every DOMAIN_NAME in EC_DOMAIN whose INDEXED column is greater than zero.
     *
     * @return a new mutable list of domain names, in result-set order
     * @throws SQLException if the query fails
     */
    public List<String> getAllIndexedDomains() throws SQLException {
        return executeListQuery(indexedDomainsQuery, 100_000);
    }

    /**
     * Binds {@code domainName} as the first parameter of {@code statement},
     * runs it and reads the first column of the first row as an int.
     *
     * @return the value read, or empty when the result set has no rows; a SQL
     *         NULL is reported as 0, as defined by {@code ResultSet.getInt}
     */
    private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
            throws SQLException {
        statement.setString(1, domainName);

        // Close the result set promptly rather than leaving it open until the
        // statement is next executed or closed.
        try (var rs = statement.executeQuery()) {
            if (rs.next()) {
                return OptionalInt.of(rs.getInt(1));
            }
            return OptionalInt.empty();
        }
    }

    /**
     * Runs {@code statement} and collects the first column of every row.
     *
     * @param sizeHint initial capacity of the returned list
     */
    private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
        List<String> ret = new ArrayList<>(sizeHint);

        try (var rs = statement.executeQuery()) {
            while (rs.next()) {
                ret.add(rs.getString(1));
            }
        }

        return ret;
    }

    /**
     * Closes every prepared statement and then the connection.
     *
     * <p>All resources are attempted even if an earlier close fails, so one
     * failing statement cannot prevent the connection from being released.
     *
     * @throws SQLException the first failure encountered, with any later
     *                      failures attached as suppressed exceptions
     */
    @Override
    public void close() throws SQLException {
        SQLException firstFailure = null;

        PreparedStatement[] statements = {
                knownUrlsQuery,
                goodUrlsQuery,
                visitedUrlsQuery,
                allDomainsQuery,
                crawlQueueDomains,
                domainNameToId,
                indexedDomainsQuery
        };

        for (PreparedStatement statement : statements) {
            try {
                statement.close();
            } catch (SQLException ex) {
                firstFailure = remember(firstFailure, ex);
            }
        }

        try {
            connection.close();
        } catch (SQLException ex) {
            firstFailure = remember(firstFailure, ex);
        }

        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /** Keeps the first failure as primary and attaches later ones as suppressed. */
    private static SQLException remember(SQLException firstFailure, SQLException latest) {
        if (firstFailure == null) {
            return latest;
        }
        firstFailure.addSuppressed(latest);
        return firstFailure;
    }
}