1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Add SOCKS proxy support for crawler processes

- Add SocksProxyConfiguration, SocksProxyManager, and SocksProxyHttpClientFactory classes
   - Integrate SOCKS proxy support into all crawler HTTP clients
   - Support round-robin and random proxy selection strategies
   - Add comprehensive documentation for SOCKS proxy configuration
   - Configure via system properties for easy deployment
This commit is contained in:
John Von Essen
2025-09-05 10:42:58 -04:00
parent b46f2e1407
commit ec5f32b1d8
8 changed files with 591 additions and 15 deletions

View File

@@ -0,0 +1,141 @@
package nu.marginalia.proxy;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
/**
 * Configuration for SOCKS proxy settings used by crawlers to distribute IP footprint.
 * <p>
 * All values are read from system properties when an instance is constructed:
 * <ul>
 *   <li>{@code crawler.socksProxy.enabled} - {@code true} to enable proxying (default {@code false})</li>
 *   <li>{@code crawler.socksProxy.strategy} - name of a {@link ProxySelectionStrategy},
 *       matched case-insensitively (default {@code ROUND_ROBIN})</li>
 *   <li>{@code crawler.socksProxy.list} - comma-separated proxy entries, each
 *       {@code host:port} or {@code host:port:username:password}</li>
 * </ul>
 * Malformed proxy entries are skipped; an unknown strategy name is rejected with an
 * {@link IllegalArgumentException}.
 */
public class SocksProxyConfiguration {
    private static final String ENABLED_PROPERTY = "crawler.socksProxy.enabled";
    private static final String STRATEGY_PROPERTY = "crawler.socksProxy.strategy";
    private static final String LIST_PROPERTY = "crawler.socksProxy.list";

    /** Valid TCP port range for a proxy endpoint. */
    private static final int MIN_PORT = 1;
    private static final int MAX_PORT = 65535;

    private final boolean enabled;
    private final List<SocksProxy> proxies;
    private final ProxySelectionStrategy strategy;

    /**
     * Reads the configuration from the current system properties.
     *
     * @throws IllegalArgumentException if the configured strategy is not a known
     *                                  {@link ProxySelectionStrategy}
     */
    public SocksProxyConfiguration() {
        this.enabled = Boolean.parseBoolean(System.getProperty(ENABLED_PROPERTY, "false"));
        this.strategy = parseStrategy(System.getProperty(STRATEGY_PROPERTY, "ROUND_ROBIN"));
        this.proxies = parseProxies();
    }

    /**
     * Resolves a strategy name, ignoring case and surrounding whitespace so that
     * e.g. {@code round_robin} is accepted.
     */
    private static ProxySelectionStrategy parseStrategy(String raw) {
        String name = raw.trim();
        for (ProxySelectionStrategy candidate : ProxySelectionStrategy.values()) {
            if (candidate.name().equalsIgnoreCase(name)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException(
                "Unknown " + STRATEGY_PROPERTY + " value '" + raw + "', expected one of "
                        + Arrays.toString(ProxySelectionStrategy.values()));
    }

    /** Parses the comma-separated proxy list into an unmodifiable list, dropping invalid entries. */
    private List<SocksProxy> parseProxies() {
        String proxyList = System.getProperty(LIST_PROPERTY, "");
        if (proxyList.isEmpty()) {
            return List.of();
        }
        return Arrays.stream(proxyList.split(","))
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .map(this::parseProxy)
                .filter(Objects::nonNull)
                .collect(Collectors.toUnmodifiableList());
    }

    /**
     * Parses a single entry of the form {@code host:port} or {@code host:port:username:password}.
     *
     * @return the parsed proxy, or {@code null} if the entry has no host, no numeric port,
     *         or a port outside 1-65535
     */
    private SocksProxy parseProxy(String proxyString) {
        // Limit of 4 keeps any ':' characters inside the password intact.
        String[] parts = proxyString.split(":", 4);
        if (parts.length < 2) {
            return null;
        }

        String host = parts[0].trim();
        if (host.isEmpty()) {
            return null;
        }

        int port;
        try {
            port = Integer.parseInt(parts[1].trim());
        } catch (NumberFormatException ignored) {
            // Not a usable entry; skipped like any other malformed entry.
            return null;
        }
        // Reject here rather than failing later when the socket address is built.
        if (port < MIN_PORT || port > MAX_PORT) {
            return null;
        }

        // Credentials are only used when both parts are present and non-empty.
        if (parts.length == 4 && !parts[2].isEmpty() && !parts[3].isEmpty()) {
            return new SocksProxy(host, port, parts[2], parts[3]);
        }
        return new SocksProxy(host, port);
    }

    /** @return {@code true} only if proxying is switched on and at least one valid proxy was parsed */
    public boolean isEnabled() {
        return enabled && !proxies.isEmpty();
    }

    /** @return the parsed proxies as an unmodifiable list (possibly empty, never {@code null}) */
    public List<SocksProxy> getProxies() {
        return proxies;
    }

    /** @return the configured proxy selection strategy */
    public ProxySelectionStrategy getStrategy() {
        return strategy;
    }

    /** How the next proxy is picked from the configured list. */
    public enum ProxySelectionStrategy {
        ROUND_ROBIN,
        RANDOM
    }

    /** Immutable description of one SOCKS proxy endpoint with optional credentials. */
    public static class SocksProxy {
        private final String host;
        private final int port;
        private final String username;
        private final String password;

        /** Creates a proxy without credentials. */
        public SocksProxy(String host, int port) {
            this(host, port, null, null);
        }

        /** Creates a proxy; {@code username} and {@code password} may be {@code null}. */
        public SocksProxy(String host, int port, String username, String password) {
            this.host = host;
            this.port = port;
            this.username = username;
            this.password = password;
        }

        public String getHost() {
            return host;
        }

        public int getPort() {
            return port;
        }

        public String getUsername() {
            return username;
        }

        public String getPassword() {
            return password;
        }

        /** @return {@code true} if both a username and a password are set */
        public boolean hasAuthentication() {
            return username != null && password != null;
        }

        /** Renders host, port and (if present) the username; the password is never included. */
        @Override
        public String toString() {
            if (hasAuthentication()) {
                return String.format("%s:%d (auth: %s)", host, port, username);
            } else {
                return String.format("%s:%d", host, port);
            }
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            SocksProxy that = (SocksProxy) o;
            return port == that.port &&
                    Objects.equals(host, that.host) &&
                    Objects.equals(username, that.username) &&
                    Objects.equals(password, that.password);
        }

        @Override
        public int hashCode() {
            return Objects.hash(host, port, username, password);
        }
    }
}

View File

@@ -0,0 +1,122 @@
package nu.marginalia.proxy;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
import org.apache.hc.client5.http.socket.ConnectionSocketFactory;
import org.apache.hc.client5.http.socket.PlainConnectionSocketFactory;
import org.apache.hc.client5.http.ssl.SSLConnectionSocketFactory;
import org.apache.hc.core5.http.config.Registry;
import org.apache.hc.core5.http.config.RegistryBuilder;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.Socket;
import java.net.SocketAddress;
/**
* Utility class for configuring HTTP clients with SOCKS proxy support.
*/
public class SocksProxyHttpClientFactory {
private static final Logger logger = LoggerFactory.getLogger(SocksProxyHttpClientFactory.class);
/**
* Creates a SOCKS proxy socket factory for the given proxy configuration.
*/
public static ConnectionSocketFactory createSocksSocketFactory(SocksProxy proxy) {
return new ConnectionSocketFactory() {
@Override
public Socket createSocket(HttpContext context) throws IOException {
Socket socket = new Socket(new Proxy(Proxy.Type.SOCKS,
new InetSocketAddress(proxy.getHost(), proxy.getPort())));
// Set socket timeouts
socket.setSoTimeout(30000); // 30 seconds
socket.setConnectTimeout(30000); // 30 seconds
return socket;
}
@Override
public Socket connectSocket(int connectTimeout, Socket socket, HttpContext context,
InetSocketAddress remoteAddress, InetSocketAddress localAddress,
org.apache.hc.core5.util.Timeout socketTimeout) throws IOException {
if (socket == null) {
socket = createSocket(context);
}
if (localAddress != null) {
socket.bind(localAddress);
}
socket.connect(remoteAddress, connectTimeout);
return socket;
}
};
}
/**
* Creates a SOCKS proxy SSL socket factory for the given proxy configuration.
*/
public static SSLConnectionSocketFactory createSocksSslSocketFactory(SocksProxy proxy) {
try {
return new SSLConnectionSocketFactory(SSLContext.getDefault()) {
@Override
public Socket createSocket(HttpContext context) throws IOException {
Socket socket = new Socket(new Proxy(Proxy.Type.SOCKS,
new InetSocketAddress(proxy.getHost(), proxy.getPort())));
// Set socket timeouts
socket.setSoTimeout(30000); // 30 seconds
socket.setConnectTimeout(30000); // 30 seconds
return socket;
}
@Override
public Socket connectSocket(int connectTimeout, Socket socket, HttpContext context,
InetSocketAddress remoteAddress, InetSocketAddress localAddress,
org.apache.hc.core5.util.Timeout socketTimeout) throws IOException {
if (socket == null) {
socket = createSocket(context);
}
if (localAddress != null) {
socket.bind(localAddress);
}
socket.connect(remoteAddress, connectTimeout);
return socket;
}
};
} catch (Exception e) {
logger.error("Failed to create SOCKS SSL socket factory", e);
throw new RuntimeException(e);
}
}
/**
* Configures a connection manager builder with SOCKS proxy support.
* If no proxy is provided, uses default socket factories.
*/
public static void configureConnectionManager(PoolingHttpClientConnectionManagerBuilder builder,
SocksProxy proxy) {
if (proxy != null) {
logger.debug("Configuring HTTP client with SOCKS proxy: {}", proxy);
Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", createSocksSocketFactory(proxy))
.register("https", createSocksSslSocketFactory(proxy))
.build();
builder.setConnectionManager(registry);
} else {
logger.debug("Configuring HTTP client without proxy");
}
}
}

View File

@@ -0,0 +1,79 @@
package nu.marginalia.proxy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.ThreadLocalRandom;
/**
* Manages SOCKS proxy selection and rotation for crawler requests.
*/
public class SocksProxyManager {
private static final Logger logger = LoggerFactory.getLogger(SocksProxyManager.class);
private final SocksProxyConfiguration config;
private final AtomicInteger roundRobinIndex = new AtomicInteger(0);
public SocksProxyManager(SocksProxyConfiguration config) {
this.config = config;
if (config.isEnabled()) {
logger.info("SOCKS proxy support enabled with {} proxies using {} strategy",
config.getProxies().size(), config.getStrategy());
for (SocksProxy proxy : config.getProxies()) {
logger.info(" - {}", proxy);
}
} else {
logger.info("SOCKS proxy support disabled");
}
}
/**
* Selects the next proxy to use based on the configured strategy.
* Returns null if proxy support is disabled or no proxies are available.
*/
public SocksProxy selectProxy() {
if (!config.isEnabled()) {
return null;
}
List<SocksProxy> proxies = config.getProxies();
if (proxies.isEmpty()) {
return null;
}
SocksProxy selectedProxy;
switch (config.getStrategy()) {
case ROUND_ROBIN:
int index = roundRobinIndex.getAndIncrement() % proxies.size();
selectedProxy = proxies.get(index);
break;
case RANDOM:
int randomIndex = ThreadLocalRandom.current().nextInt(proxies.size());
selectedProxy = proxies.get(randomIndex);
break;
default:
selectedProxy = proxies.get(0);
break;
}
logger.debug("Selected SOCKS proxy: {}", selectedProxy);
return selectedProxy;
}
/**
* Gets the current proxy configuration.
*/
public SocksProxyConfiguration getConfiguration() {
return config;
}
/**
* Checks if proxy support is enabled and proxies are available.
*/
public boolean isProxyEnabled() {
return config.isEnabled();
}
}

View File

@@ -14,6 +14,9 @@ import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import nu.marginalia.proxy.SocksProxyConfiguration;
import nu.marginalia.proxy.SocksProxyManager;
import nu.marginalia.proxy.SocksProxyHttpClientFactory;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
@@ -70,6 +73,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
private final String userAgentIdentifier;
private final CookieStore cookies = new BasicCookieStore();
private final SocksProxyManager proxyManager;
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
@@ -95,13 +99,17 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
.build();
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
PoolingHttpClientConnectionManagerBuilder connectionManagerBuilder = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2)
.setMaxConnTotal(5000)
.setDefaultConnectionConfig(connectionConfig)
.setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
.build();
.setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()));
// Configure SOCKS proxy if enabled
SocksProxyConfiguration.SocksProxy selectedProxy = proxyManager.selectProxy();
SocksProxyHttpClientFactory.configureConnectionManager(connectionManagerBuilder, selectedProxy);
connectionManager = connectionManagerBuilder.build();
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(-1))
@@ -181,6 +189,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
@Inject
public HttpFetcherImpl(UserAgent userAgent)
{
this.proxyManager = new SocksProxyManager(new SocksProxyConfiguration());
try {
this.client = createClient();
} catch (NoSuchAlgorithmException e) {
@@ -193,6 +202,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
}
public HttpFetcherImpl(String userAgent) {
this.proxyManager = new SocksProxyManager(new SocksProxyConfiguration());
try {
this.client = createClient();
} catch (NoSuchAlgorithmException e) {

View File

@@ -1,6 +1,9 @@
package nu.marginalia.livecrawler.io;
import com.google.inject.Provider;
import nu.marginalia.proxy.SocksProxyConfiguration;
import nu.marginalia.proxy.SocksProxyManager;
import nu.marginalia.proxy.SocksProxyHttpClientFactory;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.config.ConnectionConfig;
@@ -29,10 +32,12 @@ import java.util.concurrent.TimeUnit;
public class HttpClientProvider implements Provider<HttpClient> {
private static final HttpClient client;
private static PoolingHttpClientConnectionManager connectionManager;
private static final SocksProxyManager proxyManager;
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
static {
proxyManager = new SocksProxyManager(new SocksProxyConfiguration());
try {
client = createClient();
} catch (Exception e) {
@@ -47,12 +52,16 @@ public class HttpClientProvider implements Provider<HttpClient> {
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
.build();
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
PoolingHttpClientConnectionManagerBuilder connectionManagerBuilder = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2)
.setMaxConnTotal(50)
.setDefaultConnectionConfig(connectionConfig)
.build();
.setDefaultConnectionConfig(connectionConfig);
// Configure SOCKS proxy if enabled
SocksProxyConfiguration.SocksProxy selectedProxy = proxyManager.selectProxy();
SocksProxyHttpClientFactory.configureConnectionManager(connectionManagerBuilder, selectedProxy);
connectionManager = connectionManagerBuilder.build();
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(-1))

View File

@@ -1,6 +1,9 @@
package nu.marginalia.ndp.io;
import com.google.inject.Provider;
import nu.marginalia.proxy.SocksProxyConfiguration;
import nu.marginalia.proxy.SocksProxyManager;
import nu.marginalia.proxy.SocksProxyHttpClientFactory;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.config.ConnectionConfig;
@@ -29,10 +32,12 @@ import java.util.concurrent.TimeUnit;
public class HttpClientProvider implements Provider<HttpClient> {
private static final HttpClient client;
private static PoolingHttpClientConnectionManager connectionManager;
private static final SocksProxyManager proxyManager;
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
static {
proxyManager = new SocksProxyManager(new SocksProxyConfiguration());
try {
client = createClient();
} catch (Exception e) {
@@ -47,12 +52,16 @@ public class HttpClientProvider implements Provider<HttpClient> {
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
.build();
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
PoolingHttpClientConnectionManagerBuilder connectionManagerBuilder = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2)
.setMaxConnTotal(50)
.setDefaultConnectionConfig(connectionConfig)
.build();
.setDefaultConnectionConfig(connectionConfig);
// Configure SOCKS proxy if enabled
SocksProxyConfiguration.SocksProxy selectedProxy = proxyManager.selectProxy();
SocksProxyHttpClientFactory.configureConnectionManager(connectionManagerBuilder, selectedProxy);
connectionManager = connectionManagerBuilder.build();
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(-1))

View File

@@ -1,6 +1,9 @@
package nu.marginalia.ping.io;
import com.google.inject.Provider;
import nu.marginalia.proxy.SocksProxyConfiguration;
import nu.marginalia.proxy.SocksProxyManager;
import nu.marginalia.proxy.SocksProxyHttpClientFactory;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.config.ConnectionConfig;
@@ -36,10 +39,12 @@ import java.util.concurrent.TimeUnit;
public class HttpClientProvider implements Provider<HttpClient> {
private static final HttpClient client;
private static PoolingHttpClientConnectionManager connectionManager;
private static final SocksProxyManager proxyManager;
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
static {
proxyManager = new SocksProxyManager(new SocksProxyConfiguration());
try {
client = createClient();
} catch (Exception e) {
@@ -85,13 +90,18 @@ public class HttpClientProvider implements Provider<HttpClient> {
SSLContext sslContext = SSLContextBuilder.create().build();
sslContext.init(null, new TrustManager[]{trustMeBro}, null);
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
PoolingHttpClientConnectionManagerBuilder connectionManagerBuilder = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2)
.setMaxConnTotal(50)
.setDefaultConnectionConfig(connectionConfig)
.setTlsSocketStrategy(
new DefaultClientTlsStrategy(sslContext, NoopHostnameVerifier.INSTANCE))
.build();
new DefaultClientTlsStrategy(sslContext, NoopHostnameVerifier.INSTANCE));
// Configure SOCKS proxy if enabled
SocksProxyConfiguration.SocksProxy selectedProxy = proxyManager.selectProxy();
SocksProxyHttpClientFactory.configureConnectionManager(connectionManagerBuilder, selectedProxy);
connectionManager = connectionManagerBuilder.build();
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(-1))

View File

@@ -0,0 +1,196 @@
# SOCKS Proxy Support for Crawlers
This document describes how to configure and use SOCKS proxy support in Marginalia's crawler processes to distribute IP footprint across multiple remote servers.
## Overview
The SOCKS proxy feature allows crawlers to route their HTTP requests through SOCKS proxies running on remote servers. This helps distribute the IP footprint and avoid rate limiting or blocking from target websites.
## Configuration
SOCKS proxy support is configured via system properties. The following properties are available:
### Basic Configuration
- `crawler.socksProxy.enabled` (default: `false`)
- Set to `true` to enable SOCKS proxy support
- `crawler.socksProxy.list` (default: empty)
- Comma-separated list of SOCKS proxy servers
- Format: `host:port` or `host:port:username:password` (entries are split on `:`, so IPv6 literal addresses are not supported)
- Example: `1.1.1.5:1080,1.1.1.10:1080,1.1.1.15:1080`
- `crawler.socksProxy.strategy` (default: `ROUND_ROBIN`)
- Proxy selection strategy: `ROUND_ROBIN` or `RANDOM`
### Example Configuration
To enable SOCKS proxy with 3 remote servers using round-robin selection:
```bash
-Dcrawler.socksProxy.enabled=true
-Dcrawler.socksProxy.list=1.1.1.5:1080,1.1.1.10:1080,1.1.1.15:1080
-Dcrawler.socksProxy.strategy=ROUND_ROBIN
```
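For authenticated proxies (note: the credentials are parsed into the proxy configuration, but the socket setup in this change connects to the proxy using only host and port, so verify that authentication is actually applied before relying on it):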
```bash
-Dcrawler.socksProxy.enabled=true
-Dcrawler.socksProxy.list=1.1.1.5:1080:user1:pass1,1.1.1.10:1080:user2:pass2,1.1.1.15:1080:user3:pass3
-Dcrawler.socksProxy.strategy=RANDOM
```
## Setting Up SOCKS Proxies
### Using sockd (SOCKS5 daemon)
On each remote server (1.1.1.5, 1.1.1.10, 1.1.1.15), install and configure sockd:
1. Install sockd (varies by OS):
```bash
# Ubuntu/Debian
sudo apt-get install dante-server
# CentOS/RHEL
sudo yum install dante
```
2. Configure sockd (`/etc/sockd.conf`):
```
# Listen on port 1080
internal: 0.0.0.0 port = 1080
# Network interface used for outgoing (proxied) connections
external: eth0
# Accounts the daemon runs as (privilege separation, not client authentication)
user.privileged: root
user.unprivileged: nobody
# Allow connections
client pass {
from: YOUR_CRAWLER_HOST_IP/32 to: 0.0.0.0/0
log: connect disconnect
}
# Allow outgoing connections
pass {
from: 0.0.0.0/0 to: 0.0.0.0/0
protocol: tcp udp
log: connect disconnect
}
```
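3. Start sockd (the service and configuration file names depend on the distribution's package; some packages may call the service `danted`):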
```bash
sudo systemctl start sockd
sudo systemctl enable sockd
```
### Alternative: Using SSH Tunnels
You can also use SSH tunnels as SOCKS proxies:
```bash
# On your local machine, create SSH tunnels to remote servers
ssh -D 1080 -N user@1.1.1.5
ssh -D 1081 -N user@1.1.1.10
ssh -D 1082 -N user@1.1.1.15
```
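Then configure (together with `-Dcrawler.socksProxy.enabled=true`, since the proxy list is ignored unless proxy support is enabled):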
```bash
-Dcrawler.socksProxy.list=localhost:1080,localhost:1081,localhost:1082
```
## Affected Processes
SOCKS proxy support is integrated into the following crawler processes:
1. **Main Crawler** (`crawling-process`)
- Primary web crawling functionality
- Uses `HttpFetcherImpl` with SOCKS proxy support
2. **Live Crawler** (`live-crawling-process`)
- Real-time crawling for live updates
- Uses `HttpClientProvider` with SOCKS proxy support
3. **Ping Process** (`ping-process`)
- Domain availability checking
- Uses `HttpClientProvider` with SOCKS proxy support
4. **New Domain Process** (`new-domain-process`)
- New domain discovery and validation
- Uses `HttpClientProvider` with SOCKS proxy support
## Implementation Details
### Proxy Selection Strategies
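- **ROUND_ROBIN**: Cycles through the proxy list in order, advancing one position per selection
- **RANDOM**: Picks a proxy at random on each selection
- A selection happens when an HTTP client's connection manager is built, not for every request; all requests made through that client use the proxy chosen at that point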
### Connection Management
- Each HTTP client maintains its own connection pool
- Connections are opened through the SOCKS proxy that was selected when the client was created, and are managed by that client's connection pool
- Connection timeouts and retry logic remain unchanged
- SSL/TLS connections work transparently through SOCKS proxies
### Logging
The system logs proxy selection and configuration:
- Startup logs show enabled proxies and strategy
- Debug logs show which proxy is selected each time an HTTP client is configured
- Error logs indicate proxy connection failures
## Troubleshooting
### Common Issues
1. **Proxy Connection Failures**
- Verify proxy servers are running and accessible
- Check firewall rules allow connections to proxy ports
- Verify proxy authentication credentials if required
2. **No Traffic Through Proxies**
- Ensure `crawler.socksProxy.enabled=true`
- Verify proxy list is correctly formatted
- Check logs for proxy selection messages
3. **Performance Issues**
- SOCKS proxies add latency - consider geographic proximity
- Monitor proxy server resources and bandwidth
- Adjust connection pool sizes if needed
### Testing Proxy Configuration
You can test your proxy configuration using curl:
```bash
# Test individual proxies
curl --socks5 1.1.1.5:1080 http://httpbin.org/ip
curl --socks5 1.1.1.10:1080 http://httpbin.org/ip
curl --socks5 1.1.1.15:1080 http://httpbin.org/ip
# Test with authentication
curl --socks5 user:pass@1.1.1.5:1080 http://httpbin.org/ip
```
## Security Considerations
- Use authenticated proxies when possible
- Ensure proxy servers are properly secured
- Monitor proxy usage for unusual patterns
- Consider using VPNs or dedicated proxy services for production use
- Rotate proxy credentials regularly
## Performance Impact
- **Latency**: SOCKS proxies add network latency (typically 10-100ms)
- **Bandwidth**: Additional bandwidth usage for proxy connections
- **Reliability**: Proxy failures can cause request failures
- **Throughput**: May reduce overall crawling throughput
Consider these factors when planning your proxy infrastructure and crawling targets.