From 7aa56ca304678d3cc27c28eabd90a3ac5f4265cf Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Tue, 20 May 2025 14:21:31 +0400 Subject: [PATCH 01/24] updated version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index fc75358..373294b 100644 --- a/pom.xml +++ b/pom.xml @@ -125,7 +125,7 @@ de.rub.nds scanner-core - 5.5.0 + 6.1.1 org.apache.commons From 8ec3aebe101cd10e878acfdd30d7ce2679621b9d Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 21 May 2025 08:44:06 +0400 Subject: [PATCH 02/24] added uuid to scan job descriptions --- .../java/de/rub/nds/crawler/data/ScanJobDescription.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java index 841b410..3bd92a7 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java @@ -12,9 +12,12 @@ import java.io.IOException; import java.io.Serializable; import java.util.Optional; +import java.util.UUID; public class ScanJobDescription implements Serializable { + private final UUID id = UUID.randomUUID(); + private final ScanTarget scanTarget; // Metadata @@ -52,6 +55,10 @@ public ScanJobDescription(ScanTarget scanTarget, BulkScan bulkScan, JobStatus st status); } + public UUID getId() { + return id; + } + private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { // handle deserialization, cf. https://stackoverflow.com/a/3960558 From 50ef46a6ea58d9c4804203807c3a1f713111230c Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 21 May 2025 09:25:12 +0400 Subject: [PATCH 03/24] Added retrieval functions --- .../persistence/IPersistenceProvider.java | 23 ++++++ .../persistence/MongoPersistenceProvider.java | 75 +++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java index 50e3626..2e6fb81 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java @@ -11,6 +11,7 @@ import de.rub.nds.crawler.data.BulkScan; import de.rub.nds.crawler.data.ScanJobDescription; import de.rub.nds.crawler.data.ScanResult; +import java.util.List; /** * Persistence provider interface. Exposes methods to write out the different stages of a task to a @@ -40,4 +41,26 @@ public interface IPersistenceProvider { * @param bulkScan The bulk scan to update. */ void updateBulkScan(BulkScan bulkScan); + + /** + * Retrieve scan results for a specific target hostname or IP. + * + * @param dbName The database name where the scan results are stored. + * @param collectionName The collection name where the scan results are stored. + * @param target The hostname or IP address to search for. + * @param limit The maximum number of results to retrieve. If null, all results are retrieved. + * @return A list of scan results matching the target. + */ + List getScanResultsByTarget( + String dbName, String collectionName, String target); + + /** + * Retrieve a specific scan result by its ID. + * + * @param dbName The database name where the scan result is stored. + * @param collectionName The collection name where the scan result is stored. + * @param id The ID of the scan result to retrieve. + * @return The scan result, or null if not found. 
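+ * <p>An illustrative usage sketch for the retrieval API (the provider
+ * instance and all database, collection, and ID values are placeholders):
+ * <pre>{@code
+ * List<ScanResult> byTarget =
+ *         provider.getScanResultsByTarget("scans", "results", "example.com");
+ * ScanResult byId = provider.getScanResultById("scans", "results", resultId);
+ * }</pre>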
+ */ + ScanResult getScanResultById(String dbName, String collectionName, String id); } diff --git a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java index 0cb002f..078c11c 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java @@ -35,7 +35,9 @@ import java.math.BigDecimal; import java.nio.file.Files; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.tuple.Pair; @@ -263,4 +265,77 @@ public void insertScanResult(ScanResult scanResult, ScanJobDescription scanJobDe } } } + + @Override + public List getScanResultsByTarget( + String dbName, String collectionName, String target) { + LOGGER.info( + "Retrieving scan results for target {} from collection: {}.{}", + target, + dbName, + collectionName); + + try { + var collection = resultCollectionCache.getUnchecked(Pair.of(dbName, collectionName)); + + // Create a query that matches either hostname or IP + var query = new org.bson.Document(); + var orQuery = new ArrayList(); + orQuery.add(new org.bson.Document("scanTarget.hostname", target)); + orQuery.add(new org.bson.Document("scanTarget.ip", target)); + query.append("$or", orQuery); + + var iterable = collection.find(query); + + + + List results = new ArrayList<>(); + iterable.forEach(results::add); + + LOGGER.info( + "Retrieved {} scan results for target {} from collection: {}.{}", + results.size(), + target, + dbName, + collectionName); + + return results; + } catch (Exception e) { + LOGGER.error("Exception while retrieving scan results from MongoDB: ", e); + throw new RuntimeException("Failed to retrieve scan results for target: " + target, e); + } + } + + @Override + public ScanResult getScanResultById(String dbName, String collectionName, String id) { + LOGGER.info( + "Retrieving scan result with ID {} from collection: {}.{}", + id, + dbName, + collectionName); + + try { + var collection = resultCollectionCache.getUnchecked(Pair.of(dbName, collectionName)); + var result = collection.findOneById(id); + + if (result == null) { + LOGGER.warn( + "No scan result found with ID: {} in collection: {}.{}", + id, + dbName, + collectionName); + } else { + LOGGER.info( + "Retrieved scan result with ID: {} from collection: {}.{}", + id, + dbName, + collectionName); + } + + return result; + } catch (Exception e) { + LOGGER.error("Exception while retrieving scan result from MongoDB: ", e); + throw new RuntimeException("Failed to retrieve scan result with ID: " + id, e); + } + } } From a5280467bd89252d55578599fe23affd20cf400c Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 21 May 2025 09:57:16 +0400 Subject: [PATCH 04/24] fixed slf4j warning --- pom.xml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pom.xml b/pom.xml index 373294b..3146791 100644 --- a/pom.xml +++ b/pom.xml @@ -135,6 +135,14 @@ org.apache.logging.log4j log4j-api + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-slf4j-impl + org.eclipse.persistence jakarta.persistence From 487453ca327b3f4e666ddb5d4ad07783b9d0fc5a Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 21 May 2025 10:33:41 +0400 Subject: [PATCH 05/24] added retrieval functions and added javadoc --- .../java/de/rub/nds/crawler/CommonMain.java | 19 ++ 
.../config/ControllerCommandConfig.java | 22 +++ .../crawler/config/WorkerCommandConfig.java | 45 +++++ .../config/delegate/MongoDbDelegate.java | 64 ++++++ .../config/delegate/RabbitMqDelegate.java | 80 +++++++- .../nds/crawler/constant/CruxListNumber.java | 21 ++ .../rub/nds/crawler/constant/JobStatus.java | 14 ++ .../rub/nds/crawler/core/BulkScanWorker.java | 54 +++++ .../crawler/core/BulkScanWorkerManager.java | 39 ++++ .../de/rub/nds/crawler/data/BulkScan.java | 186 ++++++++++++++++++ .../de/rub/nds/crawler/data/BulkScanInfo.java | 27 +++ .../nds/crawler/data/BulkScanJobCounters.java | 33 ++++ .../de/rub/nds/crawler/data/ScanConfig.java | 50 +++++ .../nds/crawler/data/ScanJobDescription.java | 75 +++++++ .../de/rub/nds/crawler/data/ScanResult.java | 64 ++++++ .../de/rub/nds/crawler/data/ScanTarget.java | 56 ++++++ .../crawler/denylist/IDenylistProvider.java | 10 + .../DoneNotificationConsumer.java | 10 + .../orchestration/ScanJobConsumer.java | 9 + .../persistence/IPersistenceProvider.java | 4 +- .../persistence/MongoPersistenceProvider.java | 2 - .../targetlist/ITargetListProvider.java | 9 + .../util/CanceallableThreadPoolExecutor.java | 59 ++++++ .../nds/crawler/util/CancellableFuture.java | 68 ++++++- .../dummy/DummyPersistenceProvider.java | 12 ++ 25 files changed, 1016 insertions(+), 16 deletions(-) diff --git a/src/main/java/de/rub/nds/crawler/CommonMain.java b/src/main/java/de/rub/nds/crawler/CommonMain.java index ce13f5f..995ee43 100644 --- a/src/main/java/de/rub/nds/crawler/CommonMain.java +++ b/src/main/java/de/rub/nds/crawler/CommonMain.java @@ -18,9 +18,21 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +/** + * Main entry point for the TLS-Crawler application. Provides the main method to start either a + * controller or worker instance. + */ public class CommonMain { private static final Logger LOGGER = LogManager.getLogger(); + /** + * Main entry point for the application. Parses command line arguments and starts either a + * controller or worker based on the command. + * + * @param args Command line arguments + * @param controllerCommandConfig Configuration for the controller + * @param workerCommandConfig Configuration for the worker + */ public static void main( String[] args, ControllerCommandConfig controllerCommandConfig, @@ -71,6 +83,13 @@ public static void main( } } + /** + * Convenience method to start the application with just a controller configuration. Creates a + * default worker configuration. + * + * @param args Command line arguments + * @param controllerConfig Configuration for the controller + */ public static void main(String[] args, ControllerCommandConfig controllerConfig) { main(args, controllerConfig, new WorkerCommandConfig()); } diff --git a/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java b/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java index becc425..2896166 100644 --- a/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java +++ b/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java @@ -22,6 +22,12 @@ import org.apache.commons.validator.routines.UrlValidator; import org.quartz.CronScheduleBuilder; +/** + * Configuration class for controller instances. Contains settings for the controller's behavior, + * including scan parameters, target selection, and notification settings. This abstract class + * provides the base configuration, while specific scanner implementations must extend it to provide + * scanner-specific configuration. 
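+ * <p>A hypothetical subclass sketch (the option name is illustrative, and any
+ * abstract members a concrete scanner must supply are omitted here):
+ * <pre>{@code
+ * public class MyScannerCommandConfig extends ControllerCommandConfig {
+ *     // scanner-specific option, registered via JCommander's @Parameter
+ *     private String myScannerOption;
+ * }
+ * }</pre>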
+ */ public abstract class ControllerCommandConfig { @ParametersDelegate private final RabbitMqDelegate rabbitMqDelegate; @@ -112,7 +118,15 @@ public void validate() { } } + /** Validator that ensures parameter values are positive integers. */ public static class PositiveInteger implements IParameterValidator { + /** + * Validates that the parameter value is a positive integer. + * + * @param name The parameter name + * @param value The parameter value + * @throws ParameterException If the value is not a positive integer + */ public void validate(String name, String value) throws ParameterException { int n = Integer.parseInt(value); if (n < 0) { @@ -122,7 +136,15 @@ public void validate(String name, String value) throws ParameterException { } } + /** Validator that ensures parameter values are valid cron expressions. */ public static class CronSyntax implements IParameterValidator { + /** + * Validates that the parameter value is a valid cron expression. + * + * @param name The parameter name + * @param value The parameter value + * @throws ParameterException If the value is not a valid cron expression + */ public void validate(String name, String value) throws ParameterException { CronScheduleBuilder.cronSchedule(value); } diff --git a/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java b/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java index 63dc681..5fb03e8 100644 --- a/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java +++ b/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java @@ -13,6 +13,10 @@ import de.rub.nds.crawler.config.delegate.MongoDbDelegate; import de.rub.nds.crawler.config.delegate.RabbitMqDelegate; +/** + * Configuration class for worker instances. Contains settings for the worker's behavior, including + * thread counts and timeouts, as well as MongoDB and RabbitMQ connection settings. + */ public class WorkerCommandConfig { @ParametersDelegate private final RabbitMqDelegate rabbitMqDelegate; @@ -38,39 +42,80 @@ public class WorkerCommandConfig { + "After the timeout the worker tries to shutdown the scan but a shutdown can not be guaranteed due to the TLS-Scanner implementation.") private int scanTimeout = 840000; + /** Creates a new worker command configuration with default delegate settings. */ public WorkerCommandConfig() { rabbitMqDelegate = new RabbitMqDelegate(); mongoDbDelegate = new MongoDbDelegate(); } + /** + * Gets the RabbitMQ connection delegate. + * + * @return The RabbitMQ connection settings + */ public RabbitMqDelegate getRabbitMqDelegate() { return rabbitMqDelegate; } + /** + * Gets the MongoDB connection delegate. + * + * @return The MongoDB connection settings + */ public MongoDbDelegate getMongoDbDelegate() { return mongoDbDelegate; } + /** + * Gets the number of parallel scan threads to use. + * + * @return The number of scan threads + */ public int getParallelScanThreads() { return parallelScanThreads; } + /** + * Gets the number of parallel connection threads to use per scan. + * + * @return The number of connection threads + */ public int getParallelConnectionThreads() { return parallelConnectionThreads; } + /** + * Gets the timeout for individual scan operations in milliseconds. + * + * @return The scan timeout in milliseconds + */ public int getScanTimeout() { return scanTimeout; } + /** + * Sets the number of parallel scan threads to use. 
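+ * <p>For example, to size the pool to the machine (an illustrative choice,
+ * not a recommendation): {@code
+ * config.setParallelScanThreads(Runtime.getRuntime().availableProcessors())}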
+ * + * @param parallelScanThreads The number of scan threads + */ public void setParallelScanThreads(int parallelScanThreads) { this.parallelScanThreads = parallelScanThreads; } + /** + * Sets the number of parallel connection threads to use per scan. + * + * @param parallelConnectionThreads The number of connection threads + */ public void setParallelConnectionThreads(int parallelConnectionThreads) { this.parallelConnectionThreads = parallelConnectionThreads; } + /** + * Sets the timeout for individual scan operations in milliseconds. + * + * @param scanTimeout The scan timeout in milliseconds + */ public void setScanTimeout(int scanTimeout) { this.scanTimeout = scanTimeout; } diff --git a/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java b/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java index 3cfd571..0c914b7 100644 --- a/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java +++ b/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java @@ -10,6 +10,10 @@ import com.beust.jcommander.Parameter; +/** + * Configuration delegate that holds MongoDB connection settings. Used by both controller and worker + * configurations to avoid code duplication. + */ public class MongoDbDelegate { @Parameter( @@ -42,50 +46,110 @@ public class MongoDbDelegate { description = "The DB within the MongoDB instance, in which the user:pass is defined.") private String mongoDbAuthSource; + /** + * Gets the MongoDB host address. + * + * @return The MongoDB host address + */ public String getMongoDbHost() { return mongoDbHost; } + /** + * Gets the MongoDB port number. + * + * @return The MongoDB port number + */ public int getMongoDbPort() { return mongoDbPort; } + /** + * Gets the MongoDB username for authentication. + * + * @return The MongoDB username + */ public String getMongoDbUser() { return mongoDbUser; } + /** + * Gets the MongoDB password for authentication. + * + * @return The MongoDB password + */ public String getMongoDbPass() { return mongoDbPass; } + /** + * Gets the file path containing the MongoDB password. + * + * @return The MongoDB password file path + */ public String getMongoDbPassFile() { return mongoDbPassFile; } + /** + * Gets the MongoDB authentication source database name. + * + * @return The authentication source database name + */ public String getMongoDbAuthSource() { return mongoDbAuthSource; } + /** + * Sets the MongoDB host address. + * + * @param mongoDbHost The MongoDB host address + */ public void setMongoDbHost(String mongoDbHost) { this.mongoDbHost = mongoDbHost; } + /** + * Sets the MongoDB port number. + * + * @param mongoDbPort The MongoDB port number + */ public void setMongoDbPort(int mongoDbPort) { this.mongoDbPort = mongoDbPort; } + /** + * Sets the MongoDB username for authentication. + * + * @param mongoDbUser The MongoDB username + */ public void setMongoDbUser(String mongoDbUser) { this.mongoDbUser = mongoDbUser; } + /** + * Sets the MongoDB password for authentication. + * + * @param mongoDbPass The MongoDB password + */ public void setMongoDbPass(String mongoDbPass) { this.mongoDbPass = mongoDbPass; } + /** + * Sets the file path containing the MongoDB password. + * + * @param mongoDbPassFile The MongoDB password file path + */ public void setMongoDbPassFile(String mongoDbPassFile) { this.mongoDbPassFile = mongoDbPassFile; } + /** + * Sets the MongoDB authentication source database name. 
+ * + * @param mongoDbAuthSource The authentication source database name + */ public void setMongoDbAuthSource(String mongoDbAuthSource) { this.mongoDbAuthSource = mongoDbAuthSource; } diff --git a/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java b/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java index 9d89180..33d387c 100644 --- a/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java +++ b/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java @@ -10,70 +10,138 @@ import com.beust.jcommander.Parameter; +/** + * Configuration delegate that holds RabbitMQ connection settings. Used by both controller and + * worker configurations to avoid code duplication. + */ public class RabbitMqDelegate { - @Parameter(names = "-rabbitMqHost") + @Parameter(names = "-rabbitMqHost", description = "Host of the RabbitMQ instance") private String rabbitMqHost; - @Parameter(names = "-rabbitMqPort") + @Parameter(names = "-rabbitMqPort", description = "Port of the RabbitMQ instance") private int rabbitMqPort; - @Parameter(names = "-rabbitMqUser") + @Parameter(names = "-rabbitMqUser", description = "Username for RabbitMQ authentication") private String rabbitMqUser; - @Parameter(names = "-rabbitMqPass") + @Parameter(names = "-rabbitMqPass", description = "Password for RabbitMQ authentication") private String rabbitMqPass; - @Parameter(names = "-rabbitMqPassFile") + @Parameter( + names = "-rabbitMqPassFile", + description = "File containing the password for RabbitMQ authentication") private String rabbitMqPassFile; - @Parameter(names = "-rabbitMqTLS") + @Parameter( + names = "-rabbitMqTLS", + description = "Whether to use TLS for the RabbitMQ connection") private boolean rabbitMqTLS; + /** + * Gets the RabbitMQ host address. + * + * @return The RabbitMQ host address + */ public String getRabbitMqHost() { return rabbitMqHost; } + /** + * Gets the RabbitMQ port number. + * + * @return The RabbitMQ port number + */ public int getRabbitMqPort() { return rabbitMqPort; } + /** + * Gets the RabbitMQ username for authentication. + * + * @return The RabbitMQ username + */ public String getRabbitMqUser() { return rabbitMqUser; } + /** + * Gets the RabbitMQ password for authentication. + * + * @return The RabbitMQ password + */ public String getRabbitMqPass() { return rabbitMqPass; } + /** + * Gets the file path containing the RabbitMQ password. + * + * @return The RabbitMQ password file path + */ public String getRabbitMqPassFile() { return rabbitMqPassFile; } + /** + * Checks if TLS should be used for the RabbitMQ connection. + * + * @return True if TLS should be used, false otherwise + */ public boolean isRabbitMqTLS() { return rabbitMqTLS; } + /** + * Sets the RabbitMQ host address. + * + * @param rabbitMqHost The RabbitMQ host address + */ public void setRabbitMqHost(String rabbitMqHost) { this.rabbitMqHost = rabbitMqHost; } + /** + * Sets the RabbitMQ port number. + * + * @param rabbitMqPort The RabbitMQ port number + */ public void setRabbitMqPort(int rabbitMqPort) { this.rabbitMqPort = rabbitMqPort; } + /** + * Sets the RabbitMQ username for authentication. + * + * @param rabbitMqUser The RabbitMQ username + */ public void setRabbitMqUser(String rabbitMqUser) { this.rabbitMqUser = rabbitMqUser; } + /** + * Sets the RabbitMQ password for authentication. 
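+ * <p>Typically supplied on the command line, e.g. {@code -rabbitMqPass secret}
+ * or {@code -rabbitMqPassFile /run/secrets/rabbitmq} to read it from a file
+ * (both values are placeholders).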
+ * + * @param rabbitMqPass The RabbitMQ password + */ public void setRabbitMqPass(String rabbitMqPass) { this.rabbitMqPass = rabbitMqPass; } + /** + * Sets the file path containing the RabbitMQ password. + * + * @param rabbitMqPassFile The RabbitMQ password file path + */ public void setRabbitMqPassFile(String rabbitMqPassFile) { this.rabbitMqPassFile = rabbitMqPassFile; } + /** + * Sets whether TLS should be used for the RabbitMQ connection. + * + * @param rabbitMqTLS True if TLS should be used, false otherwise + */ public void setRabbitMqTLS(boolean rabbitMqTLS) { this.rabbitMqTLS = rabbitMqTLS; } diff --git a/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java b/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java index 8eafb0e..a5e27f0 100644 --- a/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java +++ b/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java @@ -8,21 +8,42 @@ */ package de.rub.nds.crawler.constant; +/** + * Enumeration of different Crux list sizes available for scanning. Each enum constant represents a + * specific list of top websites, with the value indicating the number of entries in that list. + */ public enum CruxListNumber { + /** Top 1,000 websites */ TOP_1k(1000), + /** Top 5,000 websites */ TOP_5K(5000), + /** Top 10,000 websites */ TOP_10K(10000), + /** Top 50,000 websites */ TOP_50K(50000), + /** Top 100,000 websites */ TOP_100K(100000), + /** Top 500,000 websites */ TOP_500k(500000), + /** Top 1,000,000 websites */ TOP_1M(1000000); private final int number; + /** + * Constructor for the enum constants. + * + * @param number The number of entries in the list + */ CruxListNumber(int number) { this.number = number; } + /** + * Gets the number of entries in this list. + * + * @return The number of entries + */ public int getNumber() { return number; } diff --git a/src/main/java/de/rub/nds/crawler/constant/JobStatus.java b/src/main/java/de/rub/nds/crawler/constant/JobStatus.java index fe6d26d..4297f0f 100644 --- a/src/main/java/de/rub/nds/crawler/constant/JobStatus.java +++ b/src/main/java/de/rub/nds/crawler/constant/JobStatus.java @@ -8,6 +8,10 @@ */ package de.rub.nds.crawler.constant; +/** + * Enumeration of possible job status values. Indicates the current state or final result of a scan + * job. + */ public enum JobStatus { /** Job is waiting to be executed. */ TO_BE_EXECUTED(false), @@ -38,10 +42,20 @@ public enum JobStatus { private final boolean isError; + /** + * Constructor for the enum constants. + * + * @param isError Whether this status represents an error condition + */ JobStatus(boolean isError) { this.isError = isError; } + /** + * Checks if this status represents an error condition. + * + * @return True if this status is an error, false otherwise + */ public boolean isError() { return isError; } diff --git a/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java b/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java index d9f5a58..040af17 100644 --- a/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java +++ b/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java @@ -19,12 +19,22 @@ import org.apache.logging.log4j.Logger; import org.bson.Document; +/** + * Abstract worker for performing bulk scanning operations. Implements thread management and + * lifecycle operations for scan workers. 
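+ * <p>A subclass sketch (the config type {@code MyScanConfig} and all other
+ * names are hypothetical):
+ * <pre>{@code
+ * public class MyBulkScanWorker extends BulkScanWorker<MyScanConfig> {
+ *     public MyBulkScanWorker(String bulkScanId, MyScanConfig config, int threads) {
+ *         super(bulkScanId, config, threads);
+ *     }
+ *     public Document scan(ScanTarget target) {
+ *         return new Document(); // perform the actual scan here
+ *     }
+ *     protected void initInternal() { } // allocate scanner resources
+ *     protected void cleanupInternal() { } // release them
+ * }
+ * }</pre>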
+ * + * @param The specific ScanConfig type used by this worker + */ public abstract class BulkScanWorker { private static final Logger LOGGER = LogManager.getLogger(); private final AtomicInteger activeJobs = new AtomicInteger(0); private final AtomicBoolean initialized = new AtomicBoolean(false); private final AtomicBoolean shouldCleanupSelf = new AtomicBoolean(false); + + /** The ID of the bulk scan this worker is associated with */ protected final String bulkScanId; + + /** The scan configuration for this worker */ protected final T scanConfig; /** @@ -33,6 +43,13 @@ public abstract class BulkScanWorker { */ private final ThreadPoolExecutor timeoutExecutor; + /** + * Creates a new bulk scan worker. + * + * @param bulkScanId The ID of the bulk scan this worker is associated with + * @param scanConfig The scan configuration for this worker + * @param parallelScanThreads The number of parallel scan threads to use + */ protected BulkScanWorker(String bulkScanId, T scanConfig, int parallelScanThreads) { this.bulkScanId = bulkScanId; this.scanConfig = scanConfig; @@ -47,6 +64,14 @@ protected BulkScanWorker(String bulkScanId, T scanConfig, int parallelScanThread new NamedThreadFactory("crawler-worker: scan executor")); } + /** + * Handles a scan target by submitting it to the executor. If this is the first call, it will + * initialize the worker first. When the last job completes, it will clean up the worker if + * needed. + * + * @param scanTarget The target to scan + * @return A future that will complete when the scan is done + */ public Future handle(ScanTarget scanTarget) { // if we initialized ourself, we also clean up ourself shouldCleanupSelf.weakCompareAndSetAcquire(false, init()); @@ -61,8 +86,21 @@ public Future handle(ScanTarget scanTarget) { }); } + /** + * Scans a target and returns the result as a Document. This is the core scanning functionality + * that must be implemented by subclasses. + * + * @param scanTarget The target to scan + * @return The scan result as a Document + */ public abstract Document scan(ScanTarget scanTarget); + /** + * Initializes this worker if it hasn't been initialized yet. This method is thread-safe and + * will only initialize once. + * + * @return True if this call performed the initialization, false if already initialized + */ public final boolean init() { // synchronize such that no thread runs before being initialized // but only synchronize if not already initialized @@ -77,6 +115,12 @@ public final boolean init() { return false; } + /** + * Cleans up this worker if it has been initialized and has no active jobs. This method is + * thread-safe and will only clean up once. + * + * @return True if this call performed the cleanup, false otherwise + */ public final boolean cleanup() { // synchronize such that init and cleanup do not run simultaneously // but only synchronize if already initialized @@ -98,7 +142,17 @@ public final boolean cleanup() { return false; } + /** + * Performs the actual initialization of this worker. This method is called exactly once by + * {@link #init()} when initialization is needed. Subclasses must implement this method to + * initialize their specific resources. + */ protected abstract void initInternal(); + /** + * Performs the actual cleanup of this worker. This method is called exactly once by {@link + * #cleanup()} when cleanup is needed. Subclasses must implement this method to clean up their + * specific resources. 
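+ * <p>Neither this hook nor {@link #initInternal} is invoked directly by
+ * callers; both run as part of the {@link #handle} lifecycle, e.g.:
+ * <pre>{@code
+ * Future<Document> future = worker.handle(target);
+ * Document result = future.get(); // blocks until scan() has run
+ * }</pre>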
+ */ protected abstract void cleanupInternal(); } diff --git a/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java b/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java index d9df6cb..53580d8 100644 --- a/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java +++ b/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java @@ -22,10 +22,20 @@ import org.apache.logging.log4j.Logger; import org.bson.Document; +/** + * Manager class for bulk scan workers that ensures worker instances are reused and properly + * managed. Acts as a singleton factory and manager for BulkScanWorker instances. + */ public class BulkScanWorkerManager { private static final Logger LOGGER = LogManager.getLogger(); private static BulkScanWorkerManager instance; + /** + * Gets the singleton instance of the BulkScanWorkerManager. Creates the instance if it doesn't + * exist yet. + * + * @return The singleton instance + */ public static BulkScanWorkerManager getInstance() { if (instance == null) { instance = new BulkScanWorkerManager(); @@ -33,6 +43,15 @@ public static BulkScanWorkerManager getInstance() { return instance; } + /** + * Static convenience method to handle a scan job. Creates or retrieves the appropriate worker + * and submits the scan target for processing. + * + * @param scanJobDescription The scan job to handle + * @param parallelConnectionThreads The number of parallel connection threads to use + * @param parallelScanThreads The number of parallel scan threads to use + * @return A future that will complete when the scan is done + */ public static Future handleStatic( ScanJobDescription scanJobDescription, int parallelConnectionThreads, @@ -58,6 +77,17 @@ private BulkScanWorkerManager() { .build(); } + /** + * Gets or creates a bulk scan worker for the specified bulk scan. Workers are cached and reused + * to avoid excessive resource allocation. + * + * @param bulkScanId The ID of the bulk scan + * @param scanConfig The scan configuration to use + * @param parallelConnectionThreads The number of parallel connection threads to use + * @param parallelScanThreads The number of parallel scan threads to use + * @return A bulk scan worker for the specified bulk scan + * @throws UncheckedException If a worker cannot be created + */ public BulkScanWorker getBulkScanWorker( String bulkScanId, ScanConfig scanConfig, @@ -79,6 +109,15 @@ public BulkScanWorker getBulkScanWorker( } } + /** + * Handles a scan job by creating or retrieving the appropriate worker and submitting the scan + * target for processing. + * + * @param scanJobDescription The scan job to handle + * @param parallelConnectionThreads The number of parallel connection threads to use + * @param parallelScanThreads The number of parallel scan threads to use + * @return A future that will complete when the scan is done + */ public Future handle( ScanJobDescription scanJobDescription, int parallelConnectionThreads, diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScan.java b/src/main/java/de/rub/nds/crawler/data/BulkScan.java index 980c089..6ff77ab 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScan.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScan.java @@ -17,6 +17,11 @@ import java.util.Map; import javax.persistence.Id; +/** + * Represents a bulk scanning operation that manages multiple TLS scanning jobs. This class tracks + * metadata about a scan batch including scan configuration, timing information, job statistics, and + * version information. 
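+ * <p>A construction sketch ({@code MyScanner} and {@code MyCrawler} stand in
+ * for real scanner and crawler classes; the other values are illustrative):
+ * <pre>{@code
+ * BulkScan bulkScan = new BulkScan(MyScanner.class, MyCrawler.class,
+ *         "weekly-scan", scanConfig, System.currentTimeMillis(),
+ *         true, null); // monitored, no notify URL
+ * }</pre>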
+ */ public class BulkScan implements Serializable { @Id private String _id; @@ -56,6 +61,17 @@ public class BulkScan implements Serializable { @SuppressWarnings("unused") private BulkScan() {} + /** + * Creates a new bulk scan with the given parameters. + * + * @param scannerClass A scanner implementation class for retrieving version information + * @param crawlerClass A crawler implementation class for retrieving version information + * @param name The name of the bulk scan + * @param scanConfig The configuration to use for this scan + * @param startTime The start time as a timestamp in milliseconds + * @param monitored Whether this scan should be monitored for progress + * @param notifyUrl Optional URL to notify when the scan is complete + */ public BulkScan( Class scannerClass, Class crawlerClass, @@ -77,139 +93,309 @@ public BulkScan( } // Getter naming important for correct serialization, do not change! + /** + * Gets the database ID for this bulk scan. + * + * @return The database ID + */ public String get_id() { return _id; } + /** + * Gets the name of this bulk scan. + * + * @return The name + */ public String getName() { return this.name; } + /** + * Gets the collection name where scan results will be stored. + * + * @return The collection name + */ public String getCollectionName() { return this.collectionName; } + /** + * Gets the scan configuration for this bulk scan. + * + * @return The scan configuration + */ public ScanConfig getScanConfig() { return this.scanConfig; } + /** + * Checks if this bulk scan is monitored for progress. + * + * @return True if the scan is monitored, false otherwise + */ public boolean isMonitored() { return this.monitored; } + /** + * Checks if this bulk scan has finished. + * + * @return True if the scan is finished, false otherwise + */ public boolean isFinished() { return this.finished; } + /** + * Gets the start time of this bulk scan. + * + * @return The start time as a timestamp in milliseconds + */ public long getStartTime() { return this.startTime; } + /** + * Gets the end time of this bulk scan. + * + * @return The end time as a timestamp in milliseconds + */ public long getEndTime() { return this.endTime; } + /** + * Gets the total number of targets provided for this bulk scan. + * + * @return The number of targets + */ public int getTargetsGiven() { return this.targetsGiven; } + /** + * Gets the number of scan jobs published for this bulk scan. + * + * @return The number of scan jobs published + */ public long getScanJobsPublished() { return this.scanJobsPublished; } + /** + * Gets the number of successful scans completed for this bulk scan. + * + * @return The number of successful scans + */ public int getSuccessfulScans() { return this.successfulScans; } + /** + * Gets the URL to notify when this bulk scan is complete. + * + * @return The notification URL + */ public String getNotifyUrl() { return this.notifyUrl; } + /** + * Gets the version of the scanner used for this bulk scan. + * + * @return The scanner version + */ public String getScannerVersion() { return this.scannerVersion; } + /** + * Gets the version of the crawler used for this bulk scan. + * + * @return The crawler version + */ public String getCrawlerVersion() { return this.crawlerVersion; } // Setter naming important for correct serialization, do not change! + /** + * Sets the database ID for this bulk scan. + * + * @param _id The database ID + */ public void set_id(String _id) { this._id = _id; } + /** + * Sets the name of this bulk scan. 
+ * + * @param name The name + */ public void setName(String name) { this.name = name; } + /** + * Sets the collection name where scan results will be stored. + * + * @param collectionName The collection name + */ public void setCollectionName(String collectionName) { this.collectionName = collectionName; } + /** + * Sets the scan configuration for this bulk scan. + * + * @param scanConfig The scan configuration + */ public void setScanConfig(ScanConfig scanConfig) { this.scanConfig = scanConfig; } + /** + * Sets whether this bulk scan is monitored for progress. + * + * @param monitored True if the scan should be monitored, false otherwise + */ public void setMonitored(boolean monitored) { this.monitored = monitored; } + /** + * Sets whether this bulk scan is finished. + * + * @param finished True if the scan is finished, false otherwise + */ public void setFinished(boolean finished) { this.finished = finished; } + /** + * Sets the start time of this bulk scan. + * + * @param startTime The start time as a timestamp in milliseconds + */ public void setStartTime(long startTime) { this.startTime = startTime; } + /** + * Sets the end time of this bulk scan. + * + * @param endTime The end time as a timestamp in milliseconds + */ public void setEndTime(long endTime) { this.endTime = endTime; } + /** + * Sets the total number of targets for this bulk scan. + * + * @param targetsGiven The number of targets + */ public void setTargetsGiven(int targetsGiven) { this.targetsGiven = targetsGiven; } + /** + * Sets the number of scan jobs published for this bulk scan. + * + * @param scanJobsPublished The number of scan jobs published + */ public void setScanJobsPublished(long scanJobsPublished) { this.scanJobsPublished = scanJobsPublished; } + /** + * Sets the number of successful scans completed for this bulk scan. + * + * @param successfulScans The number of successful scans + */ public void setSuccessfulScans(int successfulScans) { this.successfulScans = successfulScans; } + /** + * Sets the URL to notify when this bulk scan is complete. + * + * @param notifyUrl The notification URL + */ public void setNotifyUrl(String notifyUrl) { this.notifyUrl = notifyUrl; } + /** + * Sets the version of the scanner used for this bulk scan. + * + * @param scannerVersion The scanner version + */ public void setScannerVersion(String scannerVersion) { this.scannerVersion = scannerVersion; } + /** + * Sets the version of the crawler used for this bulk scan. + * + * @param crawlerVersion The crawler version + */ public void setCrawlerVersion(String crawlerVersion) { this.crawlerVersion = crawlerVersion; } + /** + * Gets the job status counters for this bulk scan. + * + * @return A map of job status to count + */ public Map getJobStatusCounters() { return jobStatusCounters; } + /** + * Sets the job status counters for this bulk scan. + * + * @param jobStatusCounters A map of job status to count + */ public void setJobStatusCounters(Map jobStatusCounters) { this.jobStatusCounters = jobStatusCounters; } + /** + * Gets the number of scan jobs that failed due to domain resolution errors. + * + * @return The number of resolution errors + */ public long getScanJobsResolutionErrors() { return scanJobsResolutionErrors; } + /** + * Sets the number of scan jobs that failed due to domain resolution errors. 
+ * + * @param scanJobsResolutionErrors The number of resolution errors + */ public void setScanJobsResolutionErrors(long scanJobsResolutionErrors) { this.scanJobsResolutionErrors = scanJobsResolutionErrors; } + /** + * Gets the number of scan jobs skipped due to denylisting. + * + * @return The number of denylisted scan jobs + */ public long getScanJobsDenylisted() { return scanJobsDenylisted; } + /** + * Sets the number of scan jobs skipped due to denylisting. + * + * @param scanJobsDenylisted The number of denylisted scan jobs + */ public void setScanJobsDenylisted(long scanJobsDenylisted) { this.scanJobsDenylisted = scanJobsDenylisted; } diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java b/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java index 1e40e41..4937ee4 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java @@ -21,24 +21,51 @@ public class BulkScanInfo implements Serializable { private final boolean isMonitored; + /** + * Creates a new BulkScanInfo from a BulkScan. + * + * @param bulkScan The bulk scan to extract information from + */ public BulkScanInfo(BulkScan bulkScan) { this.bulkScanId = bulkScan.get_id(); this.scanConfig = bulkScan.getScanConfig(); this.isMonitored = bulkScan.isMonitored(); } + /** + * Gets the ID of the bulk scan. + * + * @return The bulk scan ID + */ public String getBulkScanId() { return bulkScanId; } + /** + * Gets the scan configuration for this bulk scan. + * + * @return The scan configuration + */ public ScanConfig getScanConfig() { return scanConfig; } + /** + * Gets the scan configuration cast to a specific type. + * + * @param The type to cast the scan configuration to + * @param clazz The class of the type to cast to + * @return The scan configuration cast to the specified type + */ public T getScanConfig(Class clazz) { return clazz.cast(scanConfig); } + /** + * Checks if this bulk scan is being monitored. + * + * @return True if the scan is monitored, false otherwise + */ public boolean isMonitored() { return isMonitored; } diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java b/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java index bfaac3a..1ea45bc 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java @@ -13,6 +13,10 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; +/** + * Counter class for tracking job statistics during a bulk scan. This class maintains thread-safe + * counters for each job status type. + */ public class BulkScanJobCounters { private final BulkScan bulkScan; @@ -20,6 +24,12 @@ public class BulkScanJobCounters { private final AtomicInteger totalJobDoneCount = new AtomicInteger(0); private final Map jobStatusCounters = new EnumMap<>(JobStatus.class); + /** + * Creates a new BulkScanJobCounters instance for the given bulk scan. Initializes counters for + * all job statuses except TO_BE_EXECUTED. + * + * @param bulkScan The bulk scan to track counters for + */ public BulkScanJobCounters(BulkScan bulkScan) { this.bulkScan = bulkScan; for (JobStatus jobStatus : JobStatus.values()) { @@ -30,10 +40,21 @@ public BulkScanJobCounters(BulkScan bulkScan) { } } + /** + * Gets the bulk scan associated with these counters. + * + * @return The bulk scan + */ public BulkScan getBulkScan() { return bulkScan; } + /** + * Gets a copy of the job status counters as a non-atomic map. 
This creates a snapshot of the + * current counter values. + * + * @return A map of job status to count + */ public Map getJobStatusCountersCopy() { EnumMap ret = new EnumMap<>(JobStatus.class); for (Map.Entry entry : jobStatusCounters.entrySet()) { @@ -42,10 +63,22 @@ public Map getJobStatusCountersCopy() { return ret; } + /** + * Gets the count for a specific job status. + * + * @param jobStatus The job status to get the count for + * @return The current count for the given status + */ public int getJobStatusCount(JobStatus jobStatus) { return jobStatusCounters.get(jobStatus).get(); } + /** + * Increments the count for a specific job status and the total job count. + * + * @param jobStatus The job status to increment the count for + * @return The new total job count after incrementing + */ public int increaseJobStatusCount(JobStatus jobStatus) { jobStatusCounters.get(jobStatus).incrementAndGet(); return totalJobDoneCount.incrementAndGet(); diff --git a/src/main/java/de/rub/nds/crawler/data/ScanConfig.java b/src/main/java/de/rub/nds/crawler/data/ScanConfig.java index 8f91fc2..80ff97d 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanConfig.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanConfig.java @@ -12,6 +12,10 @@ import de.rub.nds.scanner.core.config.ScannerDetail; import java.io.Serializable; +/** + * Abstract base class for scan configurations. Contains common configuration options for all + * scanner types and defines required factory methods to create workers. + */ public abstract class ScanConfig implements Serializable { private ScannerDetail scannerDetail; @@ -23,36 +27,82 @@ public abstract class ScanConfig implements Serializable { @SuppressWarnings("unused") private ScanConfig() {} + /** + * Creates a new scan configuration with the specified parameters. + * + * @param scannerDetail The level of detail for the scan + * @param reexecutions The number of times to retry failed scans + * @param timeout The timeout for each scan in seconds + */ protected ScanConfig(ScannerDetail scannerDetail, int reexecutions, int timeout) { this.scannerDetail = scannerDetail; this.reexecutions = reexecutions; this.timeout = timeout; } + /** + * Gets the scanner detail level. + * + * @return The scanner detail level + */ public ScannerDetail getScannerDetail() { return this.scannerDetail; } + /** + * Gets the number of reexecutions for failed scans. + * + * @return The number of reexecutions + */ public int getReexecutions() { return this.reexecutions; } + /** + * Gets the timeout for each scan in seconds. + * + * @return The timeout in seconds + */ public int getTimeout() { return this.timeout; } + /** + * Sets the scanner detail level. + * + * @param scannerDetail The scanner detail level + */ public void setScannerDetail(ScannerDetail scannerDetail) { this.scannerDetail = scannerDetail; } + /** + * Sets the number of reexecutions for failed scans. + * + * @param reexecutions The number of reexecutions + */ public void setReexecutions(int reexecutions) { this.reexecutions = reexecutions; } + /** + * Sets the timeout for each scan in seconds. + * + * @param timeout The timeout in seconds + */ public void setTimeout(int timeout) { this.timeout = timeout; } + /** + * Creates a worker for this scan configuration. Each implementation must provide a factory + * method to create the appropriate worker type. 
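+ * <p>A sketch of a typical override in a concrete config class (the worker
+ * type is hypothetical):
+ * <pre>{@code
+ * public BulkScanWorker<MyScanConfig> createWorker(
+ *         String bulkScanID, int parallelConnectionThreads, int parallelScanThreads) {
+ *     return new MyBulkScanWorker(bulkScanID, this, parallelScanThreads);
+ * }
+ * }</pre>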
+ * + * @param bulkScanID The ID of the bulk scan this worker is for + * @param parallelConnectionThreads The number of parallel connection threads to use + * @param parallelScanThreads The number of parallel scan threads to use + * @return A worker for this scan configuration + */ public abstract BulkScanWorker createWorker( String bulkScanID, int parallelConnectionThreads, int parallelScanThreads); } diff --git a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java index 3bd92a7..12e7592 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java @@ -14,6 +14,10 @@ import java.util.Optional; import java.util.UUID; +/** + * Description of a scan job to be processed by a worker. Contains all information needed to perform + * a scan and to store its results. + */ public class ScanJobDescription implements Serializable { private final UUID id = UUID.randomUUID(); @@ -33,6 +37,15 @@ public class ScanJobDescription implements Serializable { private final String collectionName; + /** + * Creates a new scan job description with the given parameters. + * + * @param scanTarget The target to scan + * @param bulkScanInfo Information about the bulk scan this job is part of + * @param dbName The database name where results should be stored + * @param collectionName The collection name where results should be stored + * @param status The initial status of the job + */ public ScanJobDescription( ScanTarget scanTarget, BulkScanInfo bulkScanInfo, @@ -46,6 +59,14 @@ public ScanJobDescription( this.status = status; } + /** + * Creates a new scan job description as part of a bulk scan. This is a convenience constructor + * that extracts the necessary information from the bulk scan. + * + * @param scanTarget The target to scan + * @param bulkScan The bulk scan this job is part of + * @param status The initial status of the job + */ public ScanJobDescription(ScanTarget scanTarget, BulkScan bulkScan, JobStatus status) { this( scanTarget, @@ -55,10 +76,22 @@ public ScanJobDescription(ScanTarget scanTarget, BulkScan bulkScan, JobStatus st status); } + /** + * Gets the unique identifier for this job. + * + * @return The job's UUID + */ public UUID getId() { return id; } + /** + * Custom deserialization to properly handle transient fields. + * + * @param in The input stream to read from + * @throws IOException If an I/O error occurs + * @throws ClassNotFoundException If the class of a serialized object cannot be found + */ private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { // handle deserialization, cf. https://stackoverflow.com/a/3960558 @@ -66,30 +99,67 @@ private void readObject(java.io.ObjectInputStream in) deliveryTag = Optional.empty(); } + /** + * Gets the target to scan. + * + * @return The scan target + */ public ScanTarget getScanTarget() { return scanTarget; } + /** + * Gets the database name where results should be stored. + * + * @return The database name + */ public String getDbName() { return dbName; } + /** + * Gets the collection name where results should be stored. + * + * @return The collection name + */ public String getCollectionName() { return collectionName; } + /** + * Gets the current status of the job. + * + * @return The job status + */ public JobStatus getStatus() { return status; } + /** + * Sets the status of the job. 
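+ * <p>A lifecycle sketch: jobs are created in {@code JobStatus.TO_BE_EXECUTED}
+ * and updated through this setter once the outcome is known.
+ * <pre>{@code
+ * ScanJobDescription job =
+ *         new ScanJobDescription(target, bulkScan, JobStatus.TO_BE_EXECUTED);
+ * // ... after the scan finished:
+ * job.setStatus(outcome); // any JobStatus; isError() marks failures
+ * }</pre>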
+ * + * @param status The new job status + */ public void setStatus(JobStatus status) { this.status = status; } + /** + * Gets the delivery tag assigned by the message broker. + * + * @return The delivery tag + * @throws java.util.NoSuchElementException If no delivery tag has been set + */ public long getDeliveryTag() { return deliveryTag.get(); } + /** + * Sets the delivery tag assigned by the message broker. + * + * @param deliveryTag The delivery tag + * @throws IllegalStateException If a delivery tag has already been set + */ public void setDeliveryTag(Long deliveryTag) { if (this.deliveryTag.isPresent()) { throw new IllegalStateException("Delivery tag already set"); @@ -97,6 +167,11 @@ public void setDeliveryTag(Long deliveryTag) { this.deliveryTag = Optional.of(deliveryTag); } + /** + * Gets information about the bulk scan this job is part of. + * + * @return The bulk scan information + */ public BulkScanInfo getBulkScanInfo() { return bulkScanInfo; } diff --git a/src/main/java/de/rub/nds/crawler/data/ScanResult.java b/src/main/java/de/rub/nds/crawler/data/ScanResult.java index ebd5de5..4d79de7 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanResult.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanResult.java @@ -14,18 +14,36 @@ import java.util.UUID; import org.bson.Document; +/** + * Represents the result of a completed scan. Contains information about the scan target, status, + * and the actual scan results. This class is used to store scan results in the database and for + * notifications. + */ public class ScanResult implements Serializable { + /** Unique identifier for this scan result. */ private String id; + /** Reference to the bulk scan this result belongs to. */ private final String bulkScan; + /** The target that was scanned. */ private final ScanTarget scanTarget; + /** The status of the scan job. */ private final JobStatus jobStatus; + /** The actual scan results as a MongoDB document. */ private final Document result; + /** + * Private constructor for creating a scan result. + * + * @param bulkScan The bulk scan ID this result belongs to + * @param scanTarget The target that was scanned + * @param jobStatus The status of the scan job + * @param result The actual scan results + */ private ScanResult( String bulkScan, ScanTarget scanTarget, JobStatus jobStatus, Document result) { this.id = UUID.randomUUID().toString(); @@ -35,6 +53,13 @@ private ScanResult( this.result = result; } + /** + * Creates a scan result from a scan job description and result document. + * + * @param scanJobDescription The completed scan job description + * @param result The scan results as a document + * @throws IllegalArgumentException If the job status is TO_BE_EXECUTED + */ public ScanResult(ScanJobDescription scanJobDescription, Document result) { this( scanJobDescription.getBulkScanInfo().getBulkScanId(), @@ -47,6 +72,15 @@ public ScanResult(ScanJobDescription scanJobDescription, Document result) { } } + /** + * Creates a scan result from a scan job description and an exception. Used when a scan fails + * with an exception. 
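+ * <p>A usage sketch (the job must already carry an error status):
+ * <pre>{@code
+ * job.setStatus(errorStatus); // some status with isError() == true
+ * ScanResult errorResult = ScanResult.fromException(job, exception);
+ * }</pre>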
+ * + * @param scanJobDescription The scan job description that encountered an error + * @param e The exception that occurred + * @return A new ScanResult containing the exception information + * @throws IllegalArgumentException If the job status is not an error state + */ public static ScanResult fromException(ScanJobDescription scanJobDescription, Exception e) { if (!scanJobDescription.getStatus().isError()) { throw new IllegalArgumentException("ScanJobDescription must be in an error state"); @@ -56,28 +90,58 @@ public static ScanResult fromException(ScanJobDescription scanJobDescription, Ex return new ScanResult(scanJobDescription, errorDocument); } + /** + * Gets the unique identifier for this scan result. + * + * @return The scan result ID + */ @JsonProperty("_id") public String getId() { return this.id; } + /** + * Sets the unique identifier for this scan result. Used by MongoDB for document IDs. + * + * @param id The scan result ID + */ @JsonProperty("_id") public void setId(String id) { this.id = id; } + /** + * Gets the bulk scan ID this result belongs to. + * + * @return The bulk scan ID + */ public String getBulkScan() { return this.bulkScan; } + /** + * Gets the target that was scanned. + * + * @return The scan target + */ public ScanTarget getScanTarget() { return this.scanTarget; } + /** + * Gets the actual scan results. + * + * @return The scan results as a MongoDB document + */ public Document getResult() { return this.result; } + /** + * Gets the status of the scan job. + * + * @return The job status + */ public JobStatus getResultStatus() { return jobStatus; } diff --git a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java index b5299b6..ac7540c 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java @@ -18,6 +18,11 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +/** + * Represents a target to be scanned by the crawler. Contains information about the hostname, IP + * address, port, and ranking information. This class is used to track targets throughout the + * scanning process. + */ public class ScanTarget implements Serializable { private static final Logger LOGGER = LogManager.getLogger(); @@ -91,49 +96,100 @@ public static Pair fromTargetString( return Pair.of(target, JobStatus.TO_BE_EXECUTED); } + /** The IP address of the target. */ private String ip; + /** The hostname of the target. */ private String hostname; + /** The port number to connect to. */ private int port; + /** The Tranco rank of the target (if applicable). */ private int trancoRank; + /** Creates a new empty scan target. Fields should be set using the setter methods. */ public ScanTarget() {} + /** + * Returns a string representation of this scan target. Uses the hostname if available, + * otherwise uses the IP address. + * + * @return The string representation + */ @Override public String toString() { return hostname != null ? hostname : ip; } + /** + * Gets the IP address of this target. + * + * @return The IP address + */ public String getIp() { return this.ip; } + /** + * Gets the hostname of this target. + * + * @return The hostname + */ public String getHostname() { return this.hostname; } + /** + * Gets the port number to connect to. + * + * @return The port number + */ public int getPort() { return this.port; } + /** + * Gets the Tranco rank of this target (if applicable). 
+ * + * @return The Tranco rank + */ public int getTrancoRank() { return this.trancoRank; } + /** + * Sets the IP address of this target. + * + * @param ip The IP address + */ public void setIp(String ip) { this.ip = ip; } + /** + * Sets the hostname of this target. + * + * @param hostname The hostname + */ public void setHostname(String hostname) { this.hostname = hostname; } + /** + * Sets the port number to connect to. + * + * @param port The port number + */ public void setPort(int port) { this.port = port; } + /** + * Sets the Tranco rank of this target. + * + * @param trancoRank The Tranco rank + */ public void setTrancoRank(int trancoRank) { this.trancoRank = trancoRank; } diff --git a/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java b/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java index ed1e4c5..3dba32b 100644 --- a/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java +++ b/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java @@ -10,7 +10,17 @@ import de.rub.nds.crawler.data.ScanTarget; +/** + * Interface for providers that check if a scan target is on a denylist. This can be used to skip + * scanning of certain targets for various reasons (legal, ethical, or technical). + */ public interface IDenylistProvider { + /** + * Checks if a scan target is on the denylist. + * + * @param target The scan target to check + * @return True if the target is denylisted, false otherwise + */ boolean isDenylisted(ScanTarget target); } diff --git a/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java b/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java index 9af1769..90ae8c0 100644 --- a/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java +++ b/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java @@ -10,8 +10,18 @@ import de.rub.nds.crawler.data.ScanJobDescription; +/** + * Functional interface for consumers that handle completion notifications of scan jobs. Used to + * notify controllers when workers have completed their assigned tasks. + */ @FunctionalInterface public interface DoneNotificationConsumer { + /** + * Consumes a notification that a scan job has completed. + * + * @param consumerTag A tag identifying the consumer + * @param scanJobDescription The description of the completed scan job + */ void consumeDoneNotification(String consumerTag, ScanJobDescription scanJobDescription); } diff --git a/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java b/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java index 628b0ee..f565eab 100644 --- a/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java +++ b/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java @@ -10,8 +10,17 @@ import de.rub.nds.crawler.data.ScanJobDescription; +/** + * Functional interface for consumers that process scan jobs. Used by workers to receive jobs from + * the orchestration system. + */ @FunctionalInterface public interface ScanJobConsumer { + /** + * Consumes and processes a scan job. 
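+ * <p>Being a functional interface, it can be supplied as a lambda, e.g.
+ * {@code job -> LOGGER.info("received job for {}", job.getScanTarget())} for a
+ * consumer that merely logs incoming jobs.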
+ * + * @param scanJobDescription The description of the scan job to process + */ void consumeScanJob(ScanJobDescription scanJobDescription); } diff --git a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java index 2e6fb81..734876c 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java @@ -48,11 +48,9 @@ public interface IPersistenceProvider { * @param dbName The database name where the scan results are stored. * @param collectionName The collection name where the scan results are stored. * @param target The hostname or IP address to search for. - * @param limit The maximum number of results to retrieve. If null, all results are retrieved. * @return A list of scan results matching the target. */ - List getScanResultsByTarget( - String dbName, String collectionName, String target); + List getScanResultsByTarget(String dbName, String collectionName, String target); /** * Retrieve a specific scan result by its ID. diff --git a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java index 078c11c..74aeb2a 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java @@ -287,8 +287,6 @@ public List getScanResultsByTarget( var iterable = collection.find(query); - - List results = new ArrayList<>(); iterable.forEach(results::add); diff --git a/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java b/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java index 5e4662f..98bc542 100644 --- a/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java +++ b/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java @@ -10,7 +10,16 @@ import java.util.List; +/** + * Interface for providers that supply lists of targets to scan. Implementations can retrieve + * targets from different sources such as files, databases, or web services. + */ public interface ITargetListProvider { + /** + * Gets the list of targets to scan. + * + * @return A list of target hostnames or IP addresses + */ List getTargetList(); } diff --git a/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java b/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java index f4d14fd..ae0f457 100644 --- a/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java +++ b/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java @@ -10,7 +10,20 @@ import java.util.concurrent.*; +/** + * A custom thread pool executor that creates cancellable futures. This executor allows tasks to + * return a partial result even when cancelled. + */ public class CanceallableThreadPoolExecutor extends ThreadPoolExecutor { + /** + * Creates a new thread pool executor with the given parameters. 
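+     *
+     * <p>Instantiation mirrors {@link ThreadPoolExecutor}; a sketch (the pool sizes and queue
+     * choice are illustrative):
+     *
+     * <pre>{@code
+     * CanceallableThreadPoolExecutor executor =
+     *         new CanceallableThreadPoolExecutor(
+     *                 2, 4, 60, TimeUnit.SECONDS, new LinkedBlockingQueue<>());
+     * }</pre>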
+ * + * @param corePoolSize The number of threads to keep in the pool, even if idle + * @param maximumPoolSize The maximum number of threads to allow in the pool + * @param keepAliveTime How long idle threads should be kept alive + * @param unit The time unit for the keepAliveTime + * @param workQueue The queue to use for holding tasks before they are executed + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -20,6 +33,16 @@ public CanceallableThreadPoolExecutor( super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue); } + /** + * Creates a new thread pool executor with the given parameters. + * + * @param corePoolSize The number of threads to keep in the pool, even if idle + * @param maximumPoolSize The maximum number of threads to allow in the pool + * @param keepAliveTime How long idle threads should be kept alive + * @param unit The time unit for the keepAliveTime + * @param workQueue The queue to use for holding tasks before they are executed + * @param threadFactory The factory to use when creating new threads + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -30,6 +53,16 @@ public CanceallableThreadPoolExecutor( super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory); } + /** + * Creates a new thread pool executor with the given parameters. + * + * @param corePoolSize The number of threads to keep in the pool, even if idle + * @param maximumPoolSize The maximum number of threads to allow in the pool + * @param keepAliveTime How long idle threads should be kept alive + * @param unit The time unit for the keepAliveTime + * @param workQueue The queue to use for holding tasks before they are executed + * @param handler The handler to use when execution is blocked + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -40,6 +73,17 @@ public CanceallableThreadPoolExecutor( super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, handler); } + /** + * Creates a new thread pool executor with the given parameters. + * + * @param corePoolSize The number of threads to keep in the pool, even if idle + * @param maximumPoolSize The maximum number of threads to allow in the pool + * @param keepAliveTime How long idle threads should be kept alive + * @param unit The time unit for the keepAliveTime + * @param workQueue The queue to use for holding tasks before they are executed + * @param threadFactory The factory to use when creating new threads + * @param handler The handler to use when execution is blocked + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -58,11 +102,26 @@ public CanceallableThreadPoolExecutor( handler); } + /** + * Creates a new cancellable future for the given callable. + * + * @param The type of the result + * @param callable The callable to be executed + * @return A new cancellable future for the callable + */ @Override protected RunnableFuture newTaskFor(Callable callable) { return new CancellableFuture<>(callable); } + /** + * Creates a new cancellable future for the given runnable and result value. 
+ * + * @param The type of the result + * @param runnable The runnable to be executed + * @param value The result value to return when the runnable completes + * @return A new cancellable future for the runnable + */ @Override protected RunnableFuture newTaskFor(Runnable runnable, T value) { return new CancellableFuture<>(runnable, value); diff --git a/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java b/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java index d7706b1..b166c85 100644 --- a/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java +++ b/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java @@ -12,12 +12,25 @@ import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicReference; +/** + * A cancellable future implementation that can return partial results even when cancelled. This + * class wraps a standard FutureTask but captures the result when available, allowing it to be + * retrieved even after cancellation. + * + * @param The result type returned by this future + */ public class CancellableFuture implements RunnableFuture { private final AtomicReference result = new AtomicReference<>(); private final RunnableFuture innerFuture; private final Semaphore resultWritten = new Semaphore(0); + /** + * Creates a new cancellable future for the given callable. When the callable completes, the + * result is stored for retrieval even after cancellation. + * + * @param callable The callable to be executed + */ public CancellableFuture(Callable callable) { innerFuture = new FutureTask<>( @@ -29,6 +42,13 @@ public CancellableFuture(Callable callable) { }); } + /** + * Creates a new cancellable future for the given runnable and result value. When the runnable + * completes, the result value is stored for retrieval even after cancellation. + * + * @param runnable The runnable to be executed + * @param res The result value to return when the runnable completes + */ public CancellableFuture(Runnable runnable, V res) { innerFuture = new FutureTask<>( @@ -40,21 +60,46 @@ public CancellableFuture(Runnable runnable, V res) { }); } + /** + * Attempts to cancel execution of this task. + * + * @param mayInterruptIfRunning True if the thread executing this task should be interrupted + * @return True if the task was cancelled, false otherwise + */ @Override - public boolean cancel(boolean b) { - return innerFuture.cancel(b); + public boolean cancel(boolean mayInterruptIfRunning) { + return innerFuture.cancel(mayInterruptIfRunning); } + /** + * Returns true if this task was cancelled before it completed normally. + * + * @return True if this task was cancelled before it completed + */ @Override public boolean isCancelled() { return innerFuture.isCancelled(); } + /** + * Returns true if this task completed. Completion may be due to normal termination, an + * exception, or cancellation. + * + * @return True if this task completed + */ @Override public boolean isDone() { return innerFuture.isDone(); } + /** + * Waits if necessary for the computation to complete, and then retrieves its result. If the + * task was cancelled but the result was captured, returns the captured result. 
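+     *
+     * <p>An illustrative sketch of this behavior (whether a value was captured depends on how far
+     * the task ran before cancellation):
+     *
+     * <pre>{@code
+     * CancellableFuture<String> future = new CancellableFuture<>(() -> "partial");
+     * new Thread(future).start();
+     * future.cancel(false); // mark cancelled, but let the body finish
+     * String value = future.get(); // returns "partial" once the task stores it
+     * }</pre>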
+ * + * @return The computed result + * @throws InterruptedException If the current thread was interrupted while waiting + * @throws ExecutionException If the computation threw an exception + */ @Override public V get() throws InterruptedException, ExecutionException { try { @@ -65,19 +110,32 @@ public V get() throws InterruptedException, ExecutionException { } } + /** + * Waits if necessary for at most the given time for the computation to complete, and then + * retrieves its result. If the task was cancelled but the result was captured, returns the + * captured result if available within the timeout. + * + * @param timeout The maximum time to wait + * @param timeUnit The time unit of the timeout argument + * @return The computed result + * @throws InterruptedException If the current thread was interrupted while waiting + * @throws ExecutionException If the computation threw an exception + * @throws TimeoutException If the wait timed out + */ @Override - public V get(long l, @NonNull TimeUnit timeUnit) + public V get(long timeout, @NonNull TimeUnit timeUnit) throws InterruptedException, ExecutionException, TimeoutException { try { - return innerFuture.get(l, timeUnit); + return innerFuture.get(timeout, timeUnit); } catch (CancellationException e) { - if (resultWritten.tryAcquire(l, timeUnit)) { + if (resultWritten.tryAcquire(timeout, timeUnit)) { return result.get(); } throw new TimeoutException("Timeout while waiting for cancelled result"); } } + /** Executes the underlying task. */ @Override public void run() { innerFuture.run(); diff --git a/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java b/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java index 9c2bd00..9208f0a 100644 --- a/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java +++ b/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java @@ -13,6 +13,7 @@ import de.rub.nds.crawler.data.ScanResult; import de.rub.nds.crawler.persistence.IPersistenceProvider; import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; public class DummyPersistenceProvider implements IPersistenceProvider { @@ -31,4 +32,15 @@ public void insertBulkScan(BulkScan bulkScan) { @Override public void updateBulkScan(BulkScan bulkScan) {} + + @Override + public List getScanResultsByTarget( + String dbName, String collectionName, String target) { + return new LinkedList<>(); + } + + @Override + public ScanResult getScanResultById(String dbName, String collectionName, String id) { + return null; + } } From bd240174add302f88b30c6a1796bd1b27f372c25 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 21 May 2025 13:15:01 +0400 Subject: [PATCH 06/24] added codec support --- .../persistence/MongoPersistenceProvider.java | 90 ++++++++++++++++++- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java index 74aeb2a..49d438f 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java @@ -55,7 +55,14 @@ public class MongoPersistenceProvider implements IPersistenceProvider { private static boolean isInitialized = false; private static final Set> serializers = new HashSet<>(); private static final Set modules = new HashSet<>(); + private static final Set> codecClasses = new HashSet<>(); + /** + * Register a custom JSON 
serializer for MongoDB serialization. + * + * @param serializer The serializer to register + * @throws RuntimeException if called after provider initialization + */ public static void registerSerializer(JsonSerializer serializer) { if (isInitialized) { throw new RuntimeException("Cannot register serializer after initialization"); @@ -63,12 +70,24 @@ public static void registerSerializer(JsonSerializer serializer) { serializers.add(serializer); } + /** + * Register multiple custom JSON serializers for MongoDB serialization. + * + * @param serializers The serializers to register + * @throws RuntimeException if called after provider initialization + */ public static void registerSerializer(JsonSerializer... serializers) { for (JsonSerializer serializer : serializers) { registerSerializer(serializer); } } + /** + * Register a Jackson module for MongoDB serialization. + * + * @param module The module to register + * @throws RuntimeException if called after provider initialization + */ public static void registerModule(Module module) { if (isInitialized) { throw new RuntimeException("Cannot register module after initialization"); @@ -76,12 +95,52 @@ public static void registerModule(Module module) { modules.add(module); } + /** + * Register multiple Jackson modules for MongoDB serialization. + * + * @param modules The modules to register + * @throws RuntimeException if called after provider initialization + */ public static void registerModule(Module... modules) { for (Module module : modules) { registerModule(module); } } + /** + * Register a class for custom codec handling. + * + * @param codecClass The class to register for custom codec handling + * @throws RuntimeException if called after provider initialization + */ + public static void registerCodecClass(Class codecClass) { + if (isInitialized) { + throw new RuntimeException("Cannot register codec class after initialization"); + } + codecClasses.add(codecClass); + } + + /** + * Register multiple classes for custom codec handling. + * + * @param codecClasses The classes to register for custom codec handling + * @throws RuntimeException if called after provider initialization + */ + public static void registerCodecClass(Class... codecClasses) { + for (Class codecClass : codecClasses) { + registerCodecClass(codecClass); + } + } + + /** + * Get all registered codec classes. 
+ * + * @return An unmodifiable set of all registered codec classes + */ + public static Set> getCodecClasses() { + return Set.copyOf(codecClasses); + } + private final MongoClient mongoClient; private final ObjectMapper mapper; private final LoadingCache databaseCache; @@ -113,11 +172,36 @@ private static MongoClient createMongoClient(MongoDbDelegate mongoDbDelegate) { mongoDbDelegate.getMongoDbAuthSource(), pw.toCharArray()); - MongoClientSettings mongoClientSettings = + MongoClientSettings.Builder settingsBuilder = MongoClientSettings.builder() .credential(credentials) - .applyConnectionString(connectionString) - .build(); + .applyConnectionString(connectionString); + + // Register any custom codec classes if needed + if (!codecClasses.isEmpty()) { + for (Class codecClass : codecClasses) { + LOGGER.info("Custom codec class registered: {}", codecClass.getName()); + } + + // This is a placeholder for actual codec implementation + // You would need to implement a custom CodecProvider or CodecRegistry + // based on your specific requirements for the registered classes + + // Example approach using org.bson.codecs.pojo.PojoCodecProvider: + org.bson.codecs.configuration.CodecRegistry pojoCodecRegistry = + org.bson.codecs.configuration.CodecRegistries.fromRegistries( + MongoClientSettings.getDefaultCodecRegistry(), + org.bson.codecs.configuration.CodecRegistries.fromProviders( + org.bson.codecs.pojo.PojoCodecProvider.builder() + .automatic(true) + .register(codecClasses.toArray(new Class[0])) + .build())); + + settingsBuilder.codecRegistry(pojoCodecRegistry); + } + + MongoClientSettings mongoClientSettings = settingsBuilder.build(); + LOGGER.info("MongoDB persistence provider prepared to connect to {}", connectionString); return MongoClients.create(mongoClientSettings); } From 73ad98020c48fcfa2e7abeeafc8d14a60c122076 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Thu, 22 May 2025 17:12:45 +0400 Subject: [PATCH 07/24] fixed multi module and removed codec classes --- pom.xml | 6 +- .../persistence/MongoPersistenceProvider.java | 58 ------------------- 2 files changed, 2 insertions(+), 62 deletions(-) diff --git a/pom.xml b/pom.xml index 3146791..ad74e18 100644 --- a/pom.xml +++ b/pom.xml @@ -190,8 +190,7 @@ - - ${maven.multiModuleProjectDirectory}/apps + ${project.basedir}/apps @@ -206,8 +205,7 @@ true - - ${maven.multiModuleProjectDirectory}/apps + ${project.basedir}/apps diff --git a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java index 49d438f..91fc3d5 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java @@ -55,7 +55,6 @@ public class MongoPersistenceProvider implements IPersistenceProvider { private static boolean isInitialized = false; private static final Set> serializers = new HashSet<>(); private static final Set modules = new HashSet<>(); - private static final Set> codecClasses = new HashSet<>(); /** * Register a custom JSON serializer for MongoDB serialization. @@ -107,40 +106,6 @@ public static void registerModule(Module... modules) { } } - /** - * Register a class for custom codec handling. 
- * - * @param codecClass The class to register for custom codec handling - * @throws RuntimeException if called after provider initialization - */ - public static void registerCodecClass(Class codecClass) { - if (isInitialized) { - throw new RuntimeException("Cannot register codec class after initialization"); - } - codecClasses.add(codecClass); - } - - /** - * Register multiple classes for custom codec handling. - * - * @param codecClasses The classes to register for custom codec handling - * @throws RuntimeException if called after provider initialization - */ - public static void registerCodecClass(Class... codecClasses) { - for (Class codecClass : codecClasses) { - registerCodecClass(codecClass); - } - } - - /** - * Get all registered codec classes. - * - * @return An unmodifiable set of all registered codec classes - */ - public static Set> getCodecClasses() { - return Set.copyOf(codecClasses); - } - private final MongoClient mongoClient; private final ObjectMapper mapper; private final LoadingCache databaseCache; @@ -177,29 +142,6 @@ private static MongoClient createMongoClient(MongoDbDelegate mongoDbDelegate) { .credential(credentials) .applyConnectionString(connectionString); - // Register any custom codec classes if needed - if (!codecClasses.isEmpty()) { - for (Class codecClass : codecClasses) { - LOGGER.info("Custom codec class registered: {}", codecClass.getName()); - } - - // This is a placeholder for actual codec implementation - // You would need to implement a custom CodecProvider or CodecRegistry - // based on your specific requirements for the registered classes - - // Example approach using org.bson.codecs.pojo.PojoCodecProvider: - org.bson.codecs.configuration.CodecRegistry pojoCodecRegistry = - org.bson.codecs.configuration.CodecRegistries.fromRegistries( - MongoClientSettings.getDefaultCodecRegistry(), - org.bson.codecs.configuration.CodecRegistries.fromProviders( - org.bson.codecs.pojo.PojoCodecProvider.builder() - .automatic(true) - .register(codecClasses.toArray(new Class[0])) - .build())); - - settingsBuilder.codecRegistry(pojoCodecRegistry); - } - MongoClientSettings mongoClientSettings = settingsBuilder.build(); LOGGER.info("MongoDB persistence provider prepared to connect to {}", connectionString); From 230652e6fc2eb6eefc75fabaa6339dffebe08738 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Thu, 22 May 2025 23:40:25 +0400 Subject: [PATCH 08/24] updated pom --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ad74e18..9ddcceb 100644 --- a/pom.xml +++ b/pom.xml @@ -125,7 +125,7 @@ de.rub.nds scanner-core - 6.1.1 + 6.1.2-SNAPSHOT org.apache.commons From 781085d62a069e8555f0e9b4cd5954042bb775ae Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Tue, 27 May 2025 13:22:26 +0400 Subject: [PATCH 09/24] added default constructor --- src/main/java/de/rub/nds/crawler/data/ScanResult.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/main/java/de/rub/nds/crawler/data/ScanResult.java b/src/main/java/de/rub/nds/crawler/data/ScanResult.java index 4d79de7..f075588 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanResult.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanResult.java @@ -36,6 +36,16 @@ public class ScanResult implements Serializable { /** The actual scan results as a MongoDB document. 
*/ private final Document result; + @SuppressWarnings("unused") + public ScanResult() { + // Default constructor for serialization + this.id = null; + this.bulkScan = null; + this.scanTarget = null; + this.jobStatus = null; + this.result = null; + } + /** * Private constructor for creating a scan result. * From 88600ab98f98c53d7512d4a67cafd2d128588d8d Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Fri, 30 May 2025 15:35:41 +0400 Subject: [PATCH 10/24] switched version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 9ddcceb..56ab5cb 100644 --- a/pom.xml +++ b/pom.xml @@ -125,7 +125,7 @@ de.rub.nds scanner-core - 6.1.2-SNAPSHOT + 6.1.3-SNAPSHOT org.apache.commons From e9245ad83274c91f74dc28514f09cede43b827b8 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Mon, 9 Jun 2025 11:16:37 +0400 Subject: [PATCH 11/24] pom update --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 56ab5cb..f08ddf1 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ crawler-core - 1.2.1-SNAPSHOT + 1.2.1-json Crawler-Core https://github.com/tls-attacker/TLS-Crawler @@ -125,7 +125,7 @@ de.rub.nds scanner-core - 6.1.3-SNAPSHOT + 6.1.3-json org.apache.commons From 73866885e562e67d0d60b5f01188ac1b619e6988 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 10:34:42 +0400 Subject: [PATCH 12/24] Add comprehensive JavaDoc documentation for core classes Enhanced documentation for key classes and interfaces: - CommonMain: Application entry point with usage examples - ScanTarget: Target parsing with format specifications and RFC references - BulkScan: Bulk scan coordination with lifecycle documentation - BulkScanWorker: Abstract worker framework with thread safety notes Provides developer-friendly documentation appropriate for TLS/Java developers with detailed API descriptions, usage examples, and cross-references. --- .../java/de/rub/nds/crawler/CommonMain.java | 45 +++++++ .../rub/nds/crawler/core/BulkScanWorker.java | 113 +++++++++++++++- .../de/rub/nds/crawler/data/BulkScan.java | 110 +++++++++++++++- .../de/rub/nds/crawler/data/ScanTarget.java | 123 +++++++++++++++++- 4 files changed, 382 insertions(+), 9 deletions(-) diff --git a/src/main/java/de/rub/nds/crawler/CommonMain.java b/src/main/java/de/rub/nds/crawler/CommonMain.java index ce13f5f..32eaf0e 100644 --- a/src/main/java/de/rub/nds/crawler/CommonMain.java +++ b/src/main/java/de/rub/nds/crawler/CommonMain.java @@ -18,9 +18,45 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +/** + * Main entry point for the TLS-Crawler application. + * + *

+ * <p>This class provides the command-line interface for running the TLS-Crawler in two modes:
+ *
+ * <ul>
+ *   <li>Controller - Orchestrates scan jobs and manages the scanning workflow
+ *   <li>Worker - Executes individual scan tasks assigned by the controller
+ * </ul>
+ *
+ * <p>The application uses RabbitMQ for communication between controllers and workers, and MongoDB
+ * for persistence of scan results and job status.
+ *
+ * <p>Usage examples:
+ *
+ * <pre>
+ * java -jar crawler-core.jar controller --config controller.properties
+ * java -jar crawler-core.jar worker --config worker.properties
+ * </pre>
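+ *
+ * <p>A downstream scanner project typically delegates to this entry point from its own main
+ * method; a minimal sketch (the config subclasses shown are illustrative, not part of this API):
+ *
+ * <pre>{@code
+ * public static void main(String[] args) {
+ *     CommonMain.main(args, new MyControllerCommandConfig(), new MyWorkerCommandConfig());
+ * }
+ * }</pre>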
+ *
+ * @see Controller
+ * @see Worker
+ * @see ControllerCommandConfig
+ * @see WorkerCommandConfig
+ */
 public class CommonMain {
     private static final Logger LOGGER = LogManager.getLogger();
 
+    /**
+     * Main entry point for the TLS-Crawler application.
+     *

+     * <p>Parses command line arguments to determine whether to run as a controller or worker,
+     * initializes the appropriate configuration and dependencies, and starts the selected mode.
+     *
+     * @param args command line arguments including the mode ("controller" or "worker") and
+     *     configuration parameters
+     * @param controllerCommandConfig configuration for controller mode
+     * @param workerCommandConfig configuration for worker mode
+     */
     public static void main(
             String[] args,
             ControllerCommandConfig controllerCommandConfig,
@@ -71,6 +107,15 @@ public static void main(
         }
     }
 
+    /**
+     * Convenience method for running the application with only controller configuration.
+     *

+     * <p>Creates a default worker configuration and delegates to the main method. This is useful
+     * when only controller functionality is needed.
+     *
+     * @param args command line arguments
+     * @param controllerConfig configuration for controller mode
+     */
     public static void main(String[] args, ControllerCommandConfig controllerConfig) {
         main(args, controllerConfig, new WorkerCommandConfig());
     }
 }
diff --git a/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java b/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java
index d9f5a58..ba09184 100644
--- a/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java
+++ b/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java
@@ -19,20 +19,76 @@
 import org.apache.logging.log4j.Logger;
 import org.bson.Document;
 
+/**
+ * Abstract base class for bulk scanning workers that execute TLS scans on individual targets.
+ *

+ * <p>This class provides the framework for implementing specific scanner workers that can process
+ * multiple scan targets concurrently. It handles the lifecycle management, thread pool
+ * coordination, and resource cleanup for scanning operations.
+ *
+ * <p>Key responsibilities:
+ *
+ * <ul>
+ *   <li>Concurrency Management - Manages a thread pool for parallel scanning
+ *   <li>Lifecycle Control - Handles initialization and cleanup of scanner resources
+ *   <li>Job Tracking - Tracks active scanning jobs for proper resource management
+ *   <li>Thread Safety - Ensures safe concurrent access to shared resources
+ * </ul>
+ *
+ * <p>Implementations must provide (see the subclass sketch below):
+ *
+ * <ul>
+ *   <li>{@link #scan(ScanTarget)} - The actual scanning logic
+ *   <li>{@link #initInternal()} - Scanner-specific initialization
+ *   <li>{@link #cleanupInternal()} - Scanner-specific cleanup
+ * </ul>
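+ *
+ * <p>A minimal subclass sketch (the config type and scan body are illustrative, not a prescribed
+ * implementation):
+ *
+ * <pre>{@code
+ * public class ExampleWorker extends BulkScanWorker<MyScanConfig> {
+ *     public ExampleWorker(String bulkScanId, MyScanConfig config, int parallelScanThreads) {
+ *         super(bulkScanId, config, parallelScanThreads);
+ *     }
+ *
+ *     @Override
+ *     public Document scan(ScanTarget scanTarget) {
+ *         return new Document("host", scanTarget.getHostname());
+ *     }
+ *
+ *     @Override
+ *     protected void initInternal() {
+ *         // acquire scanner resources
+ *     }
+ *
+ *     @Override
+ *     protected void cleanupInternal() {
+ *         // release scanner resources
+ *     }
+ * }
+ * }</pre>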
+ * + *

+ * <p>Thread Safety: This class is designed to be thread-safe and can handle multiple concurrent
+ * scan requests. The initialization and cleanup methods are synchronized to prevent race
+ * conditions.
+ *
+ * <p>Resource Management: The worker automatically manages its lifecycle, performing
+ * initialization on first use and cleanup when no active jobs remain.
+ *
+ * @param <T> the type of scan configuration used by this worker
+ * @see ScanConfig
+ * @see ScanTarget
+ * @see Worker
+ */
 public abstract class BulkScanWorker<T extends ScanConfig> {
 
     private static final Logger LOGGER = LogManager.getLogger();
+
+    /** Counter for currently active scanning jobs. */
     private final AtomicInteger activeJobs = new AtomicInteger(0);
+
+    /** Flag indicating whether the worker has been initialized. */
     private final AtomicBoolean initialized = new AtomicBoolean(false);
+
+    /** Flag indicating whether the worker should perform self-cleanup when jobs complete. */
     private final AtomicBoolean shouldCleanupSelf = new AtomicBoolean(false);
+
+    /** Identifier of the bulk scan this worker is associated with. */
     protected final String bulkScanId;
+
+    /** Configuration parameters for scanning operations. */
     protected final T scanConfig;
 
     /**
-     * Calls the inner scan function and may handle cleanup. This is needed to wrap the scanner into
-     * a future object such that we can handle timeouts properly.
+     * Thread pool executor for handling scan operations with timeout support.
+     *
+     * <p>This executor wraps scanner functions in Future objects to enable proper timeout
+     * handling and concurrent execution of multiple scans.
      */
     private final ThreadPoolExecutor timeoutExecutor;
 
+    /**
+     * Creates a new BulkScanWorker with the specified configuration and thread pool size.
+     *
+     * @param bulkScanId the identifier of the bulk scan this worker belongs to
+     * @param scanConfig the scan configuration containing scan parameters
+     * @param parallelScanThreads the number of threads to use for parallel scanning
+     */
     protected BulkScanWorker(String bulkScanId, T scanConfig, int parallelScanThreads) {
         this.bulkScanId = bulkScanId;
         this.scanConfig = scanConfig;
@@ -47,6 +103,21 @@ protected BulkScanWorker(String bulkScanId, T scanConfig, int parallelScanThread
                 new NamedThreadFactory("crawler-worker: scan executor"));
     }
 
+    /**
+     * Handles a scan request for the specified target.
+     *

+     * <p>This method manages the complete lifecycle of a scan operation (see the usage sketch
+     * below):
+     *
+     * <ul>
+     *   <li>Ensures the worker is initialized before scanning
+     *   <li>Submits the scan to the thread pool for execution
+     *   <li>Tracks active job count for resource management
+     *   <li>Handles cleanup when all jobs are complete
+     * </ul>
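+     *
+     * <p>A usage sketch (the timeout is illustrative; callers typically bound the wait on the
+     * returned future):
+     *
+     * <pre>{@code
+     * Future<Document> future = worker.handle(scanTarget);
+     * Document result = future.get(30, TimeUnit.MINUTES);
+     * }</pre>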
+ * + * @param scanTarget the target to scan + * @return a Future representing the scan operation result + */ public Future handle(ScanTarget scanTarget) { // if we initialized ourself, we also clean up ourself shouldCleanupSelf.weakCompareAndSetAcquire(false, init()); @@ -61,8 +132,25 @@ public Future handle(ScanTarget scanTarget) { }); } + /** + * Performs the actual scan operation on the specified target. + * + *

+     * <p>This method must be implemented by concrete worker classes to provide the specific
+     * scanning logic for their scanner type.
+     *
+     * @param scanTarget the target to scan
+     * @return a MongoDB document containing the scan results
+     */
    public abstract Document scan(ScanTarget scanTarget);
 
+    /**
+     * Initializes the worker if not already initialized.
+     *

+     * <p>This method ensures thread-safe initialization using double-checked locking. Only one
+     * thread will perform the actual initialization, while others will wait for completion.
+     *
+     * @return true if this call performed the initialization, false if already initialized
+     */
     public final boolean init() {
         // synchronize such that no thread runs before being initialized
         // but only synchronize if not already initialized
@@ -77,6 +165,15 @@
         return false;
     }
 
+    /**
+     * Cleans up the worker resources if no jobs are currently active.
+     *

+     * <p>This method performs thread-safe cleanup using synchronization to prevent race
+     * conditions with initialization and active jobs. If jobs are still running, cleanup is
+     * deferred until all jobs complete.
+     *
+     * @return true if cleanup was performed, false if deferred or already cleaned up
+     */
     public final boolean cleanup() {
         // synchronize such that init and cleanup do not run simultaneously
         // but only synchronize if already initialized
@@ -98,7 +195,19 @@
         return false;
     }
 
+    /**
+     * Performs worker-specific initialization.
+     *

+     * <p>This method is called once during the worker's lifecycle and should set up any resources
+     * needed for scanning operations.
+     */
    protected abstract void initInternal();
 
+    /**
+     * Performs worker-specific cleanup.
+     *

+     * <p>This method is called when the worker is being shut down and should release any
+     * resources allocated during initialization.
+     */
    protected abstract void cleanupInternal();
 }
diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScan.java b/src/main/java/de/rub/nds/crawler/data/BulkScan.java
index 980c089..d413841 100644
--- a/src/main/java/de/rub/nds/crawler/data/BulkScan.java
+++ b/src/main/java/de/rub/nds/crawler/data/BulkScan.java
@@ -17,45 +17,121 @@
 import java.util.Map;
 import javax.persistence.Id;
 
+/**
+ * Represents a bulk scanning operation with its configuration, progress tracking, and metadata.
+ *

+ * <p>A BulkScan encapsulates all information about a large-scale TLS scanning operation,
+ * including the scan configuration, target statistics, job status tracking, and version
+ * information. This class serves as the primary coordination entity for distributed scanning
+ * operations.
+ *

+ * <p>The bulk scan lifecycle typically follows this pattern (a progress-reading sketch follows
+ * the list):
+ *
+ * <ol>
+ *   <li>Creation with scan configuration and target list
+ *   <li>Target processing and job publishing to worker queues
+ *   <li>Progress monitoring through job status counters
+ *   <li>Completion marking and result aggregation
+ * </ol>
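+ *
+ * <p>For example, a monitoring component might read the status counters while a scan runs; a
+ * sketch (which {@link JobStatus} keys are present depends on the scan):
+ *
+ * <pre>{@code
+ * Map<JobStatus, Integer> counters = bulkScan.getJobStatusCounters();
+ * int total = counters.values().stream().mapToInt(Integer::intValue).sum();
+ * }</pre>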
+ * + *

+ * <p>Key features:
+ *
+ * <ul>
+ *   <li>Distributed coordination - Tracks jobs across multiple worker instances
+ *   <li>Progress monitoring - Real-time status counters for different job states
+ *   <li>Version tracking - Records scanner and crawler versions for reproducibility
+ *   <li>Time tracking - Start and end time recording for performance analysis
+ *   <li>Collection management - Automatic database collection naming with timestamps
+ * </ul>
+ * + *

+ * <p>Persistence: This class is designed for MongoDB persistence with JPA annotations. Method
+ * naming follows serialization conventions and should not be changed without considering backward
+ * compatibility.
+ *
+ * @see ScanConfig
+ * @see JobStatus
+ * @see ScanTarget
+ */
 public class BulkScan implements Serializable {
 
+    /** Unique identifier for the bulk scan (managed by MongoDB). */
     @Id private String _id;
 
+    /** Human-readable name for the scan operation. */
     private String name;
 
+    /** MongoDB collection name where scan results are stored (auto-generated). */
     private String collectionName;
 
+    /** Configuration parameters for the scanning operation. */
     private ScanConfig scanConfig;
 
+    /** Whether this scan should be monitored for progress updates. */
     private boolean monitored;
 
+    /** Whether the scan operation has completed. */
     private boolean finished;
 
+    /** Start time of the scan operation (epoch milliseconds). */
     private long startTime;
 
+    /** End time of the scan operation (epoch milliseconds). */
     private long endTime;
 
+    /** Total number of targets provided for scanning. */
     private int targetsGiven;
 
+    /** Number of scan jobs successfully published to worker queues. */
     private long scanJobsPublished;
+
+    /** Number of targets that failed hostname resolution. */
     private long scanJobsResolutionErrors;
+
+    /** Number of targets excluded due to denylist filtering. */
     private long scanJobsDenylisted;
 
+    /** Number of successfully completed scans. */
     private int successfulScans;
 
+    /** Counters for tracking job states during scan execution. */
     private Map<JobStatus, Integer> jobStatusCounters = new EnumMap<>(JobStatus.class);
 
+    /** Optional URL for scan completion notifications. */
     private String notifyUrl;
 
+    /** Version of the TLS scanner used for this scan. */
     private String scannerVersion;
 
+    /** Version of the crawler framework used for this scan. */
     private String crawlerVersion;
 
+    /** Date format used for generating collection names with timestamps. */
     private static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd_HH-mm");
 
+    /**
+     * Default constructor for deserialization.
+     *

+     * <p>This constructor is used by serialization frameworks and should not be called directly.
+     */
    @SuppressWarnings("unused")
    private BulkScan() {}
 
+    /**
+     * Creates a new BulkScan with the specified configuration and metadata.
+     *

+     * <p>This constructor initializes a new bulk scan operation with version information
+     * extracted from the provided scanner and crawler classes. The collection name is
+     * automatically generated using the scan name and start time.
+     *
+     * @param scannerClass the scanner class to extract version information from
+     * @param crawlerClass the crawler class to extract version information from
+     * @param name the human-readable name for this scan operation
+     * @param scanConfig the scan configuration defining scan parameters
+     * @param startTime the start time in epoch milliseconds
+     * @param monitored whether this scan should be monitored for progress
+     * @param notifyUrl optional URL for completion notifications (may be null)
+     */
     public BulkScan(
             Class<?> scannerClass,
             Class<?> crawlerClass,
@@ -76,7 +152,14 @@ public BulkScan(
         this.notifyUrl = notifyUrl;
     }
 
-    // Getter naming important for correct serialization, do not change!
+    /**
+     * Gets the unique identifier for this bulk scan.
+     *

+     * <p>Important: Getter naming is critical for MongoDB serialization. Do not change this
+     * method name without considering serialization compatibility.
+     *
+     * @return the MongoDB document ID
+     */
     public String get_id() {
         return _id;
     }
 
+    /**
+     * Gets the human-readable name of the bulk scan.
+     *
+     * @return the scan name
+     */
     public String getName() {
         return this.name;
     }
 
+    /**
+     * Gets the MongoDB collection name where scan results are stored.
+     *

+     * <p>The collection name is automatically generated from the scan name and start time in the
+     * format: {name}_{yyyy-MM-dd_HH-mm}
+     *
+     * @return the collection name for scan results
+     */
     public String getCollectionName() {
         return this.collectionName;
     }
 
+    /**
+     * Gets the scan configuration for this bulk scan.
+     *
+     * @return the scan configuration containing scan parameters
+     */
     public ScanConfig getScanConfig() {
         return this.scanConfig;
     }
 
+    /**
+     * Checks whether this bulk scan is being monitored for progress updates.
+     *
+     * @return true if monitoring is enabled, false otherwise
+     */
     public boolean isMonitored() {
         return this.monitored;
     }
 
+    /**
+     * Checks whether the bulk scan operation has completed.
+     *

+     * <p>A scan is considered finished when all target processing and job publishing has been
+     * completed, regardless of individual job success or failure.
+     *
+     * @return true if the scan is finished, false otherwise
+     */
     public boolean isFinished() {
         return this.finished;
     }
@@ -190,6 +289,15 @@ public void setCrawlerVersion(String crawlerVersion) {
         this.crawlerVersion = crawlerVersion;
     }
 
+    /**
+     * Gets the job status counters for tracking scan progress.
+     *

+     * <p>This map contains counters for each {@link JobStatus} value, allowing real-time
+     * monitoring of scan progress and completion rates.
+     *
+     * @return a map of job statuses to their respective counts
+     * @see JobStatus
+     */
     public Map<JobStatus, Integer> getJobStatusCounters() {
         return jobStatusCounters;
     }
 
diff --git a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java
index b5299b6..c40f33b 100644
--- a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java
+++ b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java
@@ -18,17 +18,69 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
+/**
+ * Represents a target for TLS scanning operations.
+ *

+ * <p>A scan target encapsulates the network location (hostname/IP address and port) and optional
+ * metadata (such as Tranco ranking) for a host to be scanned. This class provides parsing
+ * functionality to extract target information from various string formats commonly found in
+ * target lists and rankings.
+ *

+ * <p>Supported target string formats (a parsing sketch follows the list):
+ *
+ * <ul>
+ *   <li>example.com - hostname only
+ *   <li>192.168.1.1 - IP address only
+ *   <li>example.com:8080 - hostname with port
+ *   <li>192.168.1.1:443 - IP address with port
+ *   <li>1,example.com - Tranco rank with hostname
+ *   <li>//example.com - hostname with URL prefix
+ * </ul>
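+ *
+ * <p>For example, a ranked Tranco entry can be turned into a target as sketched below (the
+ * default port of 443 and the null denylist provider are illustrative):
+ *
+ * <pre>{@code
+ * Pair<ScanTarget, JobStatus> parsed = ScanTarget.fromTargetString("1,example.com", 443, null);
+ * ScanTarget target = parsed.getLeft();
+ * }</pre>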
+ * + *

+ * <p>The class performs hostname resolution and denylist checking during target creation. IPv6
+ * addresses are currently not fully supported due to port parsing limitations.
+ *
+ * @see JobStatus
+ * @see IDenylistProvider
+ */
 public class ScanTarget implements Serializable {
 
     private static final Logger LOGGER = LogManager.getLogger();
 
     /**
-     * Initializes a ScanTarget object from a string that potentially contains a hostname, an ip, a
-     * port, the tranco rank.
+     * Creates a ScanTarget from a target string with comprehensive parsing and validation.
+     *

+     * <p>This method parses various target string formats, performs hostname resolution, and
+     * checks against denylists. The parsing handles multiple formats including Tranco-ranked
+     * entries, URLs, and port specifications.
+     *

+     * <p>Parsing logic (a status-handling sketch follows the list):
+     *
+     * <ol>
+     *   <li>Extract Tranco rank if present (format: "rank,hostname")
+     *   <li>Remove URL prefixes ("//hostname")
+     *   <li>Remove quotes around hostnames
+     *   <li>Extract port number if specified ("hostname:port")
+     *   <li>Determine if target is IP address or hostname
+     *   <li>Resolve hostname to IP address if needed
+     *   <li>Check against denylist if provider is available
+     * </ol>
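+     *
+     * <p>A sketch of how callers can react to the returned status (the handling shown is
+     * illustrative):
+     *
+     * <pre>{@code
+     * Pair<ScanTarget, JobStatus> parsed =
+     *         ScanTarget.fromTargetString("example.com:8443", 443, denylistProvider);
+     * if (parsed.getRight() == JobStatus.TO_BE_EXECUTED) {
+     *     // hand parsed.getLeft() to the scanning pipeline
+     * }
+     * }</pre>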
+ * + *

+     * <p>Known limitations:
+     *
+     * <ul>
+     *   <li>IPv6 addresses with ports are not correctly parsed due to colon conflicts
+     *   <li>Only the first resolved IP address is used for multi-homed hosts
+     * </ul>
+     *
+     * @param targetString the string to parse (supports various formats as documented in the
+     *     class description)
+     * @param defaultPort the port to use when none is specified in the target string
+     * @param denylistProvider optional provider for checking if targets are denylisted (may be
+     *     null)
+     * @return a pair containing the created ScanTarget and its status (TO_BE_EXECUTED,
+     *     UNRESOLVABLE, or DENYLISTED)
+     * @throws NumberFormatException if port or rank parsing fails
+     * @see JobStatus
      */
     public static Pair<ScanTarget, JobStatus> fromTargetString(
             String targetString, int defaultPort, IDenylistProvider denylistProvider) {
 
+    /** The resolved IP address of the target host. */
     private String ip;
 
+    /** The hostname of the target (may be null if target was specified as IP address). */
     private String hostname;
 
+    /** The port number for the scan target. */
     private int port;
 
+    /** The Tranco ranking of the target (0 if not available or not specified). */
     private int trancoRank;
 
+    /**
+     * Creates an empty ScanTarget.
+     *

+     * <p>All fields will be initialized to default values. This constructor is primarily used
+     * for deserialization and testing purposes.
+     */
     public ScanTarget() {}
 
+    /**
+     * Returns a string representation of the scan target.
+     *
+     * @return the hostname if available, otherwise the IP address
+     */
     @Override
     public String toString() {
         return hostname != null ? hostname : ip;
     }
 
+    /**
+     * Gets the resolved IP address of the target.
+     *
+     * @return the IP address as a string
+     */
     public String getIp() {
         return this.ip;
     }
 
+    /**
+     * Gets the hostname of the target.
+     *
+     * @return the hostname, or null if the target was specified as an IP address
+     */
     public String getHostname() {
         return this.hostname;
     }
 
+    /**
+     * Gets the port number for the scan target.
+     *
+     * @return the port number (1-65535)
+     */
     public int getPort() {
         return this.port;
     }
 
+    /**
+     * Gets the Tranco ranking of the target.
+     *

The Tranco ranking is a research-oriented top sites ranking that provides a more stable + * and transparent alternative to other web ranking services. + * + * @return the Tranco rank, or 0 if not available + * @see Tranco: A Research-Oriented Top Sites Ranking + */ public int getTrancoRank() { return this.trancoRank; } + /** + * Sets the IP address of the target. + * + * @param ip the IP address as a string (IPv4 or IPv6 format) + */ public void setIp(String ip) { this.ip = ip; } + /** + * Sets the hostname of the target. + * + * @param hostname the hostname (may be null if target is IP-only) + */ public void setHostname(String hostname) { this.hostname = hostname; } + /** + * Sets the port number for the scan target. + * + * @param port the port number (should be between 1 and 65534) + */ public void setPort(int port) { this.port = port; } + /** + * Sets the Tranco ranking of the target. + * + * @param trancoRank the Tranco rank (use 0 if not available) + */ public void setTrancoRank(int trancoRank) { this.trancoRank = trancoRank; } From 4b1fd9e73eeddb05fee8f863d3d851f0f8d259f3 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 10:44:11 +0400 Subject: [PATCH 13/24] Complete JavaDoc documentation for BulkScan class Added comprehensive JavaDoc for all remaining public methods in BulkScan: - All 13 getter methods with detailed descriptions - All 16 setter methods with parameter documentation - Special attention to MongoDB serialization requirements Now all 4 core classes have 100% complete JavaDoc documentation for all public and protected members. --- .../de/rub/nds/crawler/data/BulkScan.java | 155 +++++++++++++++++- 1 file changed, 154 insertions(+), 1 deletion(-) diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScan.java b/src/main/java/de/rub/nds/crawler/data/BulkScan.java index d413841..bce0245 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScan.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScan.java @@ -164,6 +164,11 @@ public String get_id() { return _id; } + /** + * Gets the human-readable name of the bulk scan. + * + * @return the scan name + */ public String getName() { return this.name; } @@ -180,10 +185,20 @@ public String getCollectionName() { return this.collectionName; } + /** + * Gets the scan configuration for this bulk scan. + * + * @return the scan configuration containing scan parameters + */ public ScanConfig getScanConfig() { return this.scanConfig; } + /** + * Checks whether this bulk scan is being monitored for progress updates. + * + * @return true if monitoring is enabled, false otherwise + */ public boolean isMonitored() { return this.monitored; } @@ -200,91 +215,203 @@ public boolean isFinished() { return this.finished; } + /** + * Gets the start time of the bulk scan operation. + * + * @return the start time in epoch milliseconds + */ public long getStartTime() { return this.startTime; } + /** + * Gets the end time of the bulk scan operation. + * + * @return the end time in epoch milliseconds, or 0 if not finished + */ public long getEndTime() { return this.endTime; } + /** + * Gets the total number of targets provided for this bulk scan. + * + * @return the number of targets given + */ public int getTargetsGiven() { return this.targetsGiven; } + /** + * Gets the number of scan jobs successfully published to worker queues. + * + * @return the number of published scan jobs + */ public long getScanJobsPublished() { return this.scanJobsPublished; } + /** + * Gets the number of successfully completed scans. 
+ * + * @return the number of successful scans + */ public int getSuccessfulScans() { return this.successfulScans; } + /** + * Gets the notification URL for scan completion callbacks. + * + * @return the notification URL, or null if not configured + */ public String getNotifyUrl() { return this.notifyUrl; } + /** + * Gets the version of the TLS scanner used for this scan. + * + * @return the scanner version string + */ public String getScannerVersion() { return this.scannerVersion; } + /** + * Gets the version of the crawler framework used for this scan. + * + * @return the crawler version string + */ public String getCrawlerVersion() { return this.crawlerVersion; } - // Setter naming important for correct serialization, do not change! + /** + * Sets the unique identifier for this bulk scan. + * + *

Important: Setter naming is critical for MongoDB serialization. Do not + * change this method name without considering serialization compatibility. + * + * @param _id the MongoDB document ID + */ public void set_id(String _id) { this._id = _id; } + /** + * Sets the human-readable name of the bulk scan. + * + * @param name the scan name + */ public void setName(String name) { this.name = name; } + /** + * Sets the MongoDB collection name for scan results. + * + * @param collectionName the collection name + */ public void setCollectionName(String collectionName) { this.collectionName = collectionName; } + /** + * Sets the scan configuration for this bulk scan. + * + * @param scanConfig the scan configuration + */ public void setScanConfig(ScanConfig scanConfig) { this.scanConfig = scanConfig; } + /** + * Sets whether this bulk scan should be monitored for progress updates. + * + * @param monitored true to enable monitoring, false otherwise + */ public void setMonitored(boolean monitored) { this.monitored = monitored; } + /** + * Sets whether the bulk scan operation has completed. + * + * @param finished true if the scan is finished, false otherwise + */ public void setFinished(boolean finished) { this.finished = finished; } + /** + * Sets the start time of the bulk scan operation. + * + * @param startTime the start time in epoch milliseconds + */ public void setStartTime(long startTime) { this.startTime = startTime; } + /** + * Sets the end time of the bulk scan operation. + * + * @param endTime the end time in epoch milliseconds + */ public void setEndTime(long endTime) { this.endTime = endTime; } + /** + * Sets the total number of targets provided for this bulk scan. + * + * @param targetsGiven the number of targets given + */ public void setTargetsGiven(int targetsGiven) { this.targetsGiven = targetsGiven; } + /** + * Sets the number of scan jobs successfully published to worker queues. + * + * @param scanJobsPublished the number of published scan jobs + */ public void setScanJobsPublished(long scanJobsPublished) { this.scanJobsPublished = scanJobsPublished; } + /** + * Sets the number of successfully completed scans. + * + * @param successfulScans the number of successful scans + */ public void setSuccessfulScans(int successfulScans) { this.successfulScans = successfulScans; } + /** + * Sets the notification URL for scan completion callbacks. + * + * @param notifyUrl the notification URL, or null to disable notifications + */ public void setNotifyUrl(String notifyUrl) { this.notifyUrl = notifyUrl; } + /** + * Sets the version of the TLS scanner used for this scan. + * + * @param scannerVersion the scanner version string + */ public void setScannerVersion(String scannerVersion) { this.scannerVersion = scannerVersion; } + /** + * Sets the version of the crawler framework used for this scan. + * + * @param crawlerVersion the crawler version string + */ public void setCrawlerVersion(String crawlerVersion) { this.crawlerVersion = crawlerVersion; } @@ -302,22 +429,48 @@ public Map getJobStatusCounters() { return jobStatusCounters; } + /** + * Sets the job status counters for tracking scan progress. + * + * @param jobStatusCounters a map of job statuses to their respective counts + * @see JobStatus + */ public void setJobStatusCounters(Map jobStatusCounters) { this.jobStatusCounters = jobStatusCounters; } + /** + * Gets the number of targets that failed hostname resolution. 
+ * + * @return the number of targets with resolution errors + */ public long getScanJobsResolutionErrors() { return scanJobsResolutionErrors; } + /** + * Sets the number of targets that failed hostname resolution. + * + * @param scanJobsResolutionErrors the number of targets with resolution errors + */ public void setScanJobsResolutionErrors(long scanJobsResolutionErrors) { this.scanJobsResolutionErrors = scanJobsResolutionErrors; } + /** + * Gets the number of targets excluded due to denylist filtering. + * + * @return the number of denylisted targets + */ public long getScanJobsDenylisted() { return scanJobsDenylisted; } + /** + * Sets the number of targets excluded due to denylist filtering. + * + * @param scanJobsDenylisted the number of denylisted targets + */ public void setScanJobsDenylisted(long scanJobsDenylisted) { this.scanJobsDenylisted = scanJobsDenylisted; } From 14a98e2b14360ed02e83224367aaf0d714181365 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 11:04:23 +0400 Subject: [PATCH 14/24] Document critical infrastructure classes BulkScanWorkerManager and Controller Added comprehensive JavaDoc documentation for core distributed system classes: BulkScanWorkerManager: - Singleton pattern and worker lifecycle management - Guava cache configuration and thread safety - Worker creation, caching, and cleanup strategies - Usage examples and architectural integration Controller: - Central orchestration and scheduling system - Quartz scheduler integration and lifecycle - Architecture integration with multiple providers - Scheduling options (cron, simple, one-time) - Progress monitoring and automatic shutdown Both classes now have complete documentation for all public methods, fields, and architectural considerations for distributed TLS scanning. --- .../crawler/core/BulkScanWorkerManager.java | 126 ++++++++++++++++ .../de/rub/nds/crawler/core/Controller.java | 134 +++++++++++++++++- 2 files changed, 259 insertions(+), 1 deletion(-) diff --git a/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java b/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java index d9df6cb..4861882 100644 --- a/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java +++ b/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java @@ -22,10 +22,68 @@ import org.apache.logging.log4j.Logger; import org.bson.Document; +/** + * Singleton manager for bulk scan workers that handles worker lifecycle and caching. + * + *

+ * <p>This class implements a caching mechanism for {@link BulkScanWorker} instances to optimize
+ * resource usage in distributed scanning operations. Workers are cached by bulk scan ID and
+ * automatically cleaned up after periods of inactivity.
+ *

+ * <p>Key responsibilities:
+ *
+ * <ul>
+ *   <li>Worker Lifecycle Management - Creates, caches, and cleans up worker instances
+ *   <li>Resource Optimization - Reuses workers for the same bulk scan to avoid initialization
+ *       overhead
+ *   <li>Memory Management - Automatically expires unused workers to prevent memory leaks
+ *   <li>Concurrent Access - Thread-safe worker creation and caching
+ * </ul>
+ *
+ * <p>Caching Strategy:
+ *
+ * <ul>
+ *   <li>Workers are cached by bulk scan ID for efficient reuse
+ *   <li>30-minute expiration after last access to free resources
+ *   <li>Automatic cleanup when workers are evicted from cache
+ *   <li>Lazy initialization - workers created only when needed
+ * </ul>

+ * <p>Thread Safety: This class is thread-safe and can handle concurrent worker requests from
+ * multiple threads. The underlying Guava cache provides the necessary synchronization guarantees.
+ *

+ * <p>Usage Example:
+ *
+ * <pre>{@code
+ * // Static convenience method
+ * Future<Document> result = BulkScanWorkerManager.handleStatic(
+ *         scanJobDescription, 4, 8);
+ *
+ * // Instance usage
+ * BulkScanWorkerManager manager = BulkScanWorkerManager.getInstance();
+ * Future<Document> result2 = manager.handle(scanJobDescription, 4, 8);
+ * }</pre>
+ *
+ * @see BulkScanWorker
+ * @see ScanJobDescription
+ * @see ScanConfig
+ */
 public class BulkScanWorkerManager {
 
     private static final Logger LOGGER = LogManager.getLogger();
+
+    /** Singleton instance of the worker manager. */
     private static BulkScanWorkerManager instance;
 
+    /**
+     * Gets the singleton instance of the BulkScanWorkerManager.
+     *

+     * <p>This method implements lazy initialization of the singleton instance. The instance is
+     * created on first access and reused for subsequent calls.
+     *
+     * @return the singleton BulkScanWorkerManager instance
+     */
     public static BulkScanWorkerManager getInstance() {
         if (instance == null) {
             instance = new BulkScanWorkerManager();
         }
         return instance;
     }
 
+    /**
+     * Static convenience method for handling scan jobs without explicit instance management.
+     *

+     * <p>This method provides a simplified interface for processing scan jobs by automatically
+     * obtaining the singleton instance and delegating to the instance method.
+     *
+     * @param scanJobDescription the scan job to execute
+     * @param parallelConnectionThreads the number of threads for connection management
+     * @param parallelScanThreads the number of threads for parallel scanning
+     * @return a Future representing the scan operation result
+     * @see #handle(ScanJobDescription, int, int)
+     */
     public static Future<Document> handleStatic(
             ScanJobDescription scanJobDescription,
             int parallelConnectionThreads,
             int parallelScanThreads) {
         getInstance();
         return instance.handle(scanJobDescription, parallelConnectionThreads, parallelScanThreads);
     }
 
+    /** Cache of bulk scan workers indexed by bulk scan ID. */
     private final Cache<String, BulkScanWorker<?>> bulkScanWorkers;
 
+    /**
+     * Private constructor for singleton pattern.
+     *

Initializes the worker cache with the following configuration: + * + *

    + *
  • 30-minute expiration after last access + *
  • Automatic cleanup of workers when evicted + *
  • Thread-safe concurrent access + *
+ */ private BulkScanWorkerManager() { bulkScanWorkers = CacheBuilder.newBuilder() @@ -58,6 +140,28 @@ private BulkScanWorkerManager() { .build(); } + /** + * Gets or creates a bulk scan worker for the specified bulk scan. + * + *

This method implements the core caching logic for worker management: + * + *

    + *
  • If a worker exists in cache for the bulk scan ID, returns it immediately + *
  • If no worker exists, creates a new worker using the scan configuration + *
  • Newly created workers are automatically initialized before caching + *
  • Workers are cached by bulk scan ID for reuse in subsequent requests + *
+ * + *

Thread Safety: This method is thread-safe and can be called concurrently. + * The cache handles synchronization of worker creation. + * + * @param bulkScanId the unique identifier of the bulk scan + * @param scanConfig the scan configuration for creating new workers + * @param parallelConnectionThreads the number of threads for connection management + * @param parallelScanThreads the number of threads for parallel scanning + * @return the cached or newly created bulk scan worker + * @throws UncheckedException if worker creation fails + */ public BulkScanWorker getBulkScanWorker( String bulkScanId, ScanConfig scanConfig, @@ -79,6 +183,28 @@ public BulkScanWorker getBulkScanWorker( } } + /** + * Handles a scan job by obtaining the appropriate worker and executing the scan. + * + *

This method orchestrates the complete scan job execution: + * + *

    + *
  1. Extracts bulk scan information from the job description + *
  2. Obtains or creates the appropriate worker for the bulk scan + *
  3. Delegates the actual scanning to the worker + *
+ * + *

The method leverages worker caching to ensure efficient resource utilization across + * multiple scan jobs belonging to the same bulk scan operation. + * + * @param scanJobDescription the scan job containing target and configuration information + * @param parallelConnectionThreads the number of threads for connection management + * @param parallelScanThreads the number of threads for parallel scanning + * @return a Future representing the scan operation result as a MongoDB document + * @throws UncheckedException if worker creation or initialization fails + * @see ScanJobDescription + * @see BulkScanWorker#handle(de.rub.nds.crawler.data.ScanTarget) + */ public Future handle( ScanJobDescription scanJobDescription, int parallelConnectionThreads, diff --git a/src/main/java/de/rub/nds/crawler/core/Controller.java b/src/main/java/de/rub/nds/crawler/core/Controller.java index 11568c7..7bbf3e3 100644 --- a/src/main/java/de/rub/nds/crawler/core/Controller.java +++ b/src/main/java/de/rub/nds/crawler/core/Controller.java @@ -23,16 +23,84 @@ import org.quartz.impl.StdSchedulerFactory; import org.quartz.impl.matchers.GroupMatcher; -/** Controller that schedules the publishing of bulk scans. */ +/** + * Controller that orchestrates and schedules bulk scanning operations. + * + *

The Controller is the central coordination component of the TLS-Crawler system, responsible + * for managing the lifecycle of large-scale TLS scanning campaigns. It integrates with multiple + * subsystems to provide comprehensive scan orchestration. + * + *

Core responsibilities: + * + *

    + *
  • Schedule Management - Uses Quartz scheduler for flexible scan timing + *
  • Job Publishing - Coordinates with orchestration providers to distribute + * scan jobs + *
  • Progress Monitoring - Optional real-time monitoring and notification + * system + *
  • Resource Integration - Manages target lists, denylists, and persistence + * layers + *
+ * + *

Architecture Integration: + * + *

    + *
  • {@link IOrchestrationProvider} - Distributes scan jobs to worker instances + *
  • {@link IPersistenceProvider} - Handles scan result storage and retrieval + *
  • {@link ITargetListProvider} - Sources scan targets from various providers + *
  • {@link IDenylistProvider} - Filters prohibited targets + *
  • {@link ProgressMonitor} - Tracks scan progress and sends notifications + *
+ * + *

Scheduling Options: + * + *

    + *
  • One-time execution - Immediate scan job publishing + *
  • Cron-based scheduling - Recurring scans with flexible timing + *
  • Simple scheduling - Basic interval-based execution + *
+ * + *

Lifecycle: + * + *

    + *
  1. Controller initialization with configuration and providers + *
  2. Optional denylist and progress monitoring setup + *
  3. Quartz scheduler configuration and job registration + *
  4. Automatic shutdown when all scheduled jobs complete + *
+ * + * @see ControllerCommandConfig + * @see PublishBulkScanJob + * @see IOrchestrationProvider + * @see IPersistenceProvider + */ public class Controller { private static final Logger LOGGER = LogManager.getLogger(); + /** Provider for distributing scan jobs to worker instances. */ private final IOrchestrationProvider orchestrationProvider; + + /** Provider for scan result storage and retrieval. */ private final IPersistenceProvider persistenceProvider; + + /** Configuration containing controller parameters and scheduling options. */ private final ControllerCommandConfig config; + + /** Optional provider for filtering prohibited scan targets. */ private IDenylistProvider denylistProvider; + /** + * Creates a new Controller with the specified configuration and providers. + * + *

This constructor initializes the controller with all necessary dependencies for + * orchestrating bulk scanning operations. If a denylist file is specified in the configuration, + * a denylist provider is automatically created. + * + * @param config the controller configuration containing scheduling and scan parameters + * @param orchestrationProvider the provider for distributing scan jobs to workers + * @param persistenceProvider the provider for storing and retrieving scan results + */ public Controller( ControllerCommandConfig config, IOrchestrationProvider orchestrationProvider, @@ -45,6 +113,31 @@ public Controller( } } + /** + * Starts the controller and begins scheduling bulk scan operations. + * + *

This method performs the complete initialization and startup sequence: + * + *

    + *
  1. Obtains the target list provider from configuration + *
  2. Initializes the Quartz scheduler with appropriate listeners + *
  3. Creates progress monitoring if enabled in configuration + *
  4. Prepares job data map with all necessary providers and configuration + *
  5. Schedules the bulk scan publishing job according to configuration + *
  6. Starts the scheduler to begin processing + *
+ * + *

Progress Monitoring: If monitoring is enabled in the configuration, a + * {@link ProgressMonitor} is created to track scan progress and send notifications. + * + *

Automatic Shutdown: The scheduler is configured to automatically shut + * down when all scheduled jobs complete execution. + * + * @throws RuntimeException if scheduler initialization or startup fails + * @see ControllerCommandConfig#isMonitored() + * @see PublishBulkScanJob + * @see ProgressMonitor + */ public void start() { ITargetListProvider targetListProvider = config.getTargetListProvider(); @@ -82,6 +175,21 @@ public void start() { } } + /** + * Creates the appropriate schedule builder based on configuration. + * + *

This method determines the scheduling strategy: + * + *

    + *
  • Cron-based: If a cron interval is specified, creates a cron schedule + * using the system default timezone + *
  • Simple: If no cron interval is specified, creates a simple schedule + * for immediate one-time execution + *
+ * + * @return the appropriate ScheduleBuilder for the configured scheduling strategy + * @see ControllerCommandConfig#getScanCronInterval() + */ private ScheduleBuilder getScanSchedule() { if (config.getScanCronInterval() != null) { return CronScheduleBuilder.cronSchedule(config.getScanCronInterval()) @@ -91,6 +199,30 @@ private ScheduleBuilder getScanSchedule() { } } + /** + * Conditionally shuts down the scheduler if all triggers have completed. + * + *

This utility method provides graceful scheduler shutdown by checking the state of all + * registered triggers. The scheduler is shut down only when no triggers are capable of firing + * again, indicating that all scheduled work is complete. + * + *

Trigger State Checking: + * + *

    + *
  • Examines all triggers across all groups + *
  • Checks if each trigger can fire again using {@code mayFireAgain()} + *
  • Handles scheduler exceptions by assuming triggers are still active + *
  • Only shuts down when all triggers are finalized + *
+ * + *

Error Handling: If trigger state cannot be determined due to scheduler + * exceptions, the trigger is conservatively treated as still active to prevent premature + * shutdown. + * + * @param scheduler the Quartz scheduler to potentially shut down + * @see Scheduler#shutdown() + * @see Trigger#mayFireAgain() + */ public static void shutdownSchedulerIfAllTriggersFinalized(Scheduler scheduler) { try { boolean allTriggersFinalized = From 8ff98bd3ad3e05e0ac8ba18bd98bcb98f3b3f15a Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 11:14:53 +0400 Subject: [PATCH 15/24] Add comprehensive JavaDoc documentation to MongoPersistenceProvider Complete method-level documentation for all public and private methods in MongoPersistenceProvider.java, achieving 100% JavaDoc coverage: - Static registration methods (registerSerializer, registerModule) - Constructor with detailed initialization sequence - Database and collection factory methods - CRUD operations (insertBulkScan, updateBulkScan, insertScanResult) - Internal helper methods for database operations Enhanced documentation includes: - MongoDB storage architecture and caching strategy - Error handling and recovery mechanisms - Performance optimization details - Thread safety considerations - Usage examples and cross-references Continues progress toward 100% documentation coverage across all classes. --- .../persistence/MongoPersistenceProvider.java | 376 +++++++++++++++++- 1 file changed, 373 insertions(+), 3 deletions(-) diff --git a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java index 0cb002f..d56324c 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java @@ -44,7 +44,62 @@ import org.bson.UuidRepresentation; import org.mongojack.JacksonMongoCollection; -/** A persistence provider implementation using MongoDB as the persistence layer. */ +/** + * MongoDB implementation of the persistence provider for TLS-Crawler scan data. + * + *

This class provides a comprehensive MongoDB-based persistence layer that handles storage and + * retrieval of bulk scan metadata and individual scan results. It implements sophisticated caching + * mechanisms and provides flexible JSON serialization support. + * + *

Key features: + * + *

    + *
  • Dual Storage Model - Separate handling for bulk scan metadata and scan + * results + *
  • Database per Scan - Each bulk scan uses its own MongoDB database + *
  • Collection Caching - Guava cache for database and collection instances + *
  • Custom Serialization - Extensible Jackson mapper with custom serializers + *
  • Automatic Indexing - Performance-optimized indexes on scan target fields + *
  • Error Recovery - Graceful handling of serialization errors + *
+ * + *

Storage Architecture: + * + *

    + *
  • Bulk Scans - Stored in a dedicated "bulkScans" collection within each scan + * database + *
  • Scan Results - Stored in dynamically named collections based on scan + * configuration + *
  • Database Naming - Each bulk scan creates a database named after the scan + *
  • Index Strategy - Automatic indexing on IP, hostname, Tranco rank, and + * result status + *
+ * + *

Caching Strategy: + * + *

    + *
  • Database connections cached for 10 minutes after last access + *
  • Collection instances cached for 10 minutes after last access + *
  • Automatic cleanup of unused connections to prevent resource leaks + *
+ * + *

Serialization Support: + * + *

    + *
  • Custom JsonSerializer registration for complex types + *
  • Jackson module support for extended functionality + *
  • BigDecimal serialization as strings for precision + *
  • Java Time API support through JavaTimeModule + *
+ * + *

Error Handling: Implements sophisticated error recovery for serialization + * failures, creating error records instead of losing scan results. + * + * @see IPersistenceProvider + * @see MongoDbDelegate + * @see BulkScan + * @see ScanResult + */ public class MongoPersistenceProvider implements IPersistenceProvider { private static final Logger LOGGER = LogManager.getLogger(); @@ -54,6 +109,26 @@ public class MongoPersistenceProvider implements IPersistenceProvider { private static final Set> serializers = new HashSet<>(); private static final Set modules = new HashSet<>(); + /** + * Registers a custom JSON serializer for use in MongoDB document serialization. + * + *

This method allows registration of custom Jackson serializers that will be applied during + * JSON serialization of scan results before storing them in MongoDB. Serializers must be + * registered before the first MongoPersistenceProvider instance is created. + * + *

Registration Lifecycle: + * + *

    + *
  • Serializers can only be registered before initialization + *
  • Once the first provider instance is created, registration is locked + *
  • Attempting to register after initialization throws RuntimeException + *
+ * + * @param serializer the custom JsonSerializer to register for MongoDB serialization + * @throws RuntimeException if called after MongoPersistenceProvider initialization + * @see #registerSerializer(JsonSerializer...) + * @see #registerModule(Module) + */ public static void registerSerializer(JsonSerializer serializer) { if (isInitialized) { throw new RuntimeException("Cannot register serializer after initialization"); @@ -61,12 +136,47 @@ public static void registerSerializer(JsonSerializer serializer) { serializers.add(serializer); } + /** + * Registers multiple custom JSON serializers for use in MongoDB document serialization. + * + *

This convenience method allows bulk registration of multiple Jackson serializers. All + * serializers will be applied during JSON serialization of scan results before storing them in + * MongoDB. + * + *

This method delegates to {@link #registerSerializer(JsonSerializer)} for each provided + * serializer, maintaining the same registration lifecycle restrictions. + * + * @param serializers vararg array of JsonSerializers to register for MongoDB serialization + * @throws RuntimeException if called after MongoPersistenceProvider initialization + * @see #registerSerializer(JsonSerializer) + * @see #registerModule(Module...) + */ public static void registerSerializer(JsonSerializer... serializers) { for (JsonSerializer serializer : serializers) { registerSerializer(serializer); } } + /** + * Registers a custom Jackson module for extended JSON serialization functionality. + * + *

This method allows registration of Jackson modules that extend the ObjectMapper's + * serialization capabilities. Modules can provide custom serializers, deserializers, type + * handlers, and other Jackson extensions for MongoDB document processing. + * + *

Module Registration: + * + *

    + *
  • Modules must be registered before the first provider instance is created + *
  • Supports any Jackson Module including third-party extensions + *
  • Registration is locked after initialization to ensure consistency + *
+ * + * @param module the Jackson Module to register for enhanced serialization support + * @throws RuntimeException if called after MongoPersistenceProvider initialization + * @see #registerModule(Module...) + * @see #registerSerializer(JsonSerializer) + */ public static void registerModule(Module module) { if (isInitialized) { throw new RuntimeException("Cannot register module after initialization"); @@ -74,6 +184,20 @@ public static void registerModule(Module module) { modules.add(module); } + /** + * Registers multiple Jackson modules for extended JSON serialization functionality. + * + *

This convenience method allows bulk registration of multiple Jackson modules. Each module + * will extend the ObjectMapper's serialization capabilities for MongoDB document processing. + * + *

This method delegates to {@link #registerModule(Module)} for each provided module, + * maintaining the same registration lifecycle restrictions. + * + * @param modules vararg array of Jackson Modules to register for enhanced serialization + * @throws RuntimeException if called after MongoPersistenceProvider initialization + * @see #registerModule(Module) + * @see #registerSerializer(JsonSerializer...) + */ public static void registerModule(Module... modules) { for (Module module : modules) { registerModule(module); @@ -87,6 +211,36 @@ public static void registerModule(Module... modules) { resultCollectionCache; private JacksonMongoCollection bulkScanCollection; + /** + * Creates and configures a MongoDB client using the provided configuration. + * + *

This static factory method handles the complete MongoDB client setup including connection + * string construction, credential management, and client configuration. It supports both direct + * password provision and password file reading. + * + *

Connection Configuration: + * + *

    + *
  • Constructs connection string from host and port + *
  • Supports MongoDB authentication with username/password + *
  • Handles password files for secure credential storage + *
  • Configures authentication source database + *
+ * + *

Password Handling: + * + *

    + *
  • Direct password from configuration takes precedence + *
  • Password file reading as fallback option + *
  • Graceful error handling for missing password files + *
  • Empty password fallback for connection attempts + *
+ * + * @param mongoDbDelegate the MongoDB configuration containing connection parameters + * @return configured MongoClient ready for database operations + * @see MongoDbDelegate + * @see MongoClientSettings + */ private static MongoClient createMongoClient(MongoDbDelegate mongoDbDelegate) { ConnectionString connectionString = new ConnectionString( @@ -120,6 +274,36 @@ private static MongoClient createMongoClient(MongoDbDelegate mongoDbDelegate) { return MongoClients.create(mongoClientSettings); } + /** + * Creates and configures a Jackson ObjectMapper for MongoDB document serialization. + * + *

This static factory method creates a fully configured ObjectMapper that handles the + * complex serialization requirements of TLS scan results. The mapper integrates custom + * serializers, modules, and specific configuration for MongoDB storage. + * + *

Configuration Features: + * + *

    + *
  • Custom serializer integration from static registration + *
  • Jackson module support including JavaTimeModule + *
  • BigDecimal serialization as strings for precision preservation + *
  • Graceful handling of empty beans without failures + *
+ * + *

Serialization Strategy: + * + *

    + *
  • Registered custom serializers take precedence + *
  • Modules provide extended functionality + *
  • Java Time API support for date/time fields + *
  • String representation for BigDecimal to avoid precision loss + *
+ * + * @return configured ObjectMapper ready for MongoDB document serialization + * @see #registerSerializer(JsonSerializer) + * @see #registerModule(Module) + * @see JavaTimeModule + */ private static ObjectMapper createMapper() { ObjectMapper mapper = new ObjectMapper(); @@ -143,9 +327,38 @@ private static ObjectMapper createMapper() { } /** - * Initialize connection to mongodb and setup MongoJack PojoToBson mapper. + * Initializes connection to MongoDB and sets up MongoJack PojoToBson mapper. + * + *

This constructor performs complete initialization of the MongoDB persistence layer + * including client connection, ObjectMapper configuration, and cache setup. It establishes the + * foundation for all subsequent database operations. + * + *

Initialization Sequence: * - * @param mongoDbDelegate Mongodb command line configuration parameters + *

    + *
  1. Marks the class as initialized to lock serializer/module registration + *
  2. Creates configured ObjectMapper with custom serializers and modules + *
  3. Establishes MongoDB client connection with authentication + *
  4. Verifies connection with a test session + *
  5. Sets up Guava caches for database and collection instances + *
+ * + *

Cache Configuration: + * + *

    + *
  • Database cache expires after 10 minutes of inactivity + *
  • Collection cache expires after 10 minutes of inactivity + *
  • Automatic collection initialization with performance indexes + *
+ * + *

Error Handling: Connection failures are wrapped in RuntimeException to + * ensure proper error propagation during application startup. + * + * @param mongoDbDelegate MongoDB command line configuration parameters + * @throws RuntimeException if MongoDB connection cannot be established + * @see MongoDbDelegate + * @see #createMapper() + * @see #createMongoClient(MongoDbDelegate) */ public MongoPersistenceProvider(MongoDbDelegate mongoDbDelegate) { isInitialized = true; @@ -175,11 +388,58 @@ public MongoPersistenceProvider(MongoDbDelegate mongoDbDelegate) { key.getLeft(), key.getRight()))); } + /** + * Initializes a MongoDB database connection for the specified database name. + * + *

This method is used by the database cache to lazily initialize database connections as + * they are requested. It provides the foundation for all database operations within a specific + * scan context. + * + *

Database Naming Strategy: Each bulk scan typically uses its own database + * to ensure data isolation and simplified management of scan results. + * + * @param dbName the name of the database to initialize + * @return initialized MongoDatabase instance ready for collection operations + * @see #databaseCache + */ private MongoDatabase initDatabase(String dbName) { LOGGER.info("Initializing database: {}.", dbName); return mongoClient.getDatabase(dbName); } + /** + * Initializes a MongoDB collection for storing scan results with performance optimization. + * + *

This method is used by the collection cache to lazily initialize collections as they are + * requested. It creates properly configured MongoJack collections with automatic indexing for + * optimal query performance. + * + *

Collection Configuration: + * + *

    + *
  • Uses the configured ObjectMapper for JSON serialization + *
  • Standard UUID representation for consistent document IDs + *
  • Type-safe ScanResult document mapping + *
+ * + *

Performance Indexing: + * + *

    + *
  • scanTarget.ip - Fast IP-based queries + *
  • scanTarget.hostname - Hostname lookup optimization + *
  • scanTarget.trancoRank - Ranking-based filtering + *
  • scanTarget.resultStatus - Status-based result filtering + *
+ * + *

Index Management: Index creation is idempotent, so repeated calls will + * not create duplicate indexes. + * + * @param dbName the database name containing the collection + * @param collectionName the name of the collection to initialize + * @return configured JacksonMongoCollection ready for scan result storage + * @see #resultCollectionCache + * @see ScanResult + */ private JacksonMongoCollection initResultCollection( String dbName, String collectionName) { LOGGER.info("Initializing collection: {}.{}.", dbName, collectionName); @@ -199,6 +459,29 @@ private JacksonMongoCollection initResultCollection( return collection; } + /** + * Gets or creates the MongoDB collection for storing bulk scan metadata. + * + *

This method implements lazy initialization of the bulk scan collection, creating it only + * when first accessed. The collection stores high-level information about bulk scanning + * operations separate from individual scan results. + * + *

Collection Purpose: + * + *

    + *
  • Stores BulkScan metadata and configuration + *
  • Tracks overall progress and status of bulk operations + *
  • Provides central reference point for scan campaigns + *
+ * + *

Singleton Pattern: The collection instance is cached after first creation + * to avoid repeated initialization overhead for subsequent access. + * + * @param dbName the database name containing the bulk scan collection + * @return JacksonMongoCollection configured for BulkScan document storage + * @see BulkScan + * @see #BULK_SCAN_COLLECTION_NAME + */ private JacksonMongoCollection getBulkScanCollection(String dbName) { if (this.bulkScanCollection == null) { this.bulkScanCollection = @@ -213,17 +496,75 @@ private JacksonMongoCollection getBulkScanCollection(String dbName) { return this.bulkScanCollection; } + /** + * Inserts a new bulk scan record into the MongoDB collection. + * + *

This method stores the bulk scan metadata in the appropriate database and collection. The + * bulk scan document contains configuration, progress tracking, and high-level information + * about the scanning campaign. + * + *

Storage Location: The bulk scan is stored in a collection named + * "bulkScans" within the database corresponding to the bulk scan's name. + * + * @param bulkScan the bulk scan metadata to insert into the database + * @throws IllegalArgumentException if bulkScan is null + * @see IPersistenceProvider#insertBulkScan(BulkScan) + * @see BulkScan + */ @Override public void insertBulkScan(@NonNull BulkScan bulkScan) { this.getBulkScanCollection(bulkScan.getName()).insertOne(bulkScan); } + /** + * Updates an existing bulk scan record in the MongoDB collection. + * + *

This method implements a replace strategy for updating bulk scan metadata. It removes the + * existing document and inserts the updated version to ensure complete replacement of all + * fields. + * + *

Update Strategy: + * + *

    + *
  1. Removes the existing document by ID + *
  2. Inserts the updated bulk scan document + *
+ * + *

Atomicity Consideration: This implementation is not atomic. In production + * environments with high concurrency, consider using MongoDB's replaceOne operation for atomic + * updates. + * + * @param bulkScan the updated bulk scan metadata to store in the database + * @throws IllegalArgumentException if bulkScan is null + * @see IPersistenceProvider#updateBulkScan(BulkScan) + * @see #insertBulkScan(BulkScan) + */ @Override public void updateBulkScan(@NonNull BulkScan bulkScan) { this.getBulkScanCollection(bulkScan.getName()).removeById(bulkScan.get_id()); this.insertBulkScan(bulkScan); } + /** + * Writes a scan result to the appropriate MongoDB collection. + * + *

This private method handles the actual database insertion of scan results. It uses the + * collection cache to obtain the appropriate collection and performs the insertion with logging + * for monitoring purposes. + * + *

Collection Resolution: The method uses the collection cache with a + * composite key of database name and collection name to obtain the properly configured MongoDB + * collection. + * + *

Performance Optimization: Collections are cached to avoid repeated + * initialization overhead during high-volume scanning operations. + * + * @param dbName the database name for the scan result storage + * @param collectionName the collection name for the scan result storage + * @param scanResult the scan result to write to the database + * @see #resultCollectionCache + * @see ScanResult + */ private void writeResultToDatabase( String dbName, String collectionName, ScanResult scanResult) { LOGGER.info( @@ -234,6 +575,35 @@ private void writeResultToDatabase( resultCollectionCache.getUnchecked(Pair.of(dbName, collectionName)).insertOne(scanResult); } + /** + * Inserts a scan result into the MongoDB collection with comprehensive error handling. + * + *

This method implements the core persistence logic for individual scan results. It includes + * validation, error recovery, and recursive error handling to ensure that scan results are + * never lost due to serialization issues. + * + *

Validation: The method validates that the scan result status matches the + * job description status to ensure data consistency before insertion. + * + *

Error Recovery Strategy: + * + *

    + *
  1. Attempt normal insertion of the scan result + *
  2. If serialization fails, create an error record instead + *
  3. If error record serialization fails, mark as internal error + *
  4. Prevent infinite recursion with serialization error handling + *
+ * + *

Status Consistency: The method ensures that scan results and job + * descriptions maintain consistent status information throughout the persistence process. + * + * @param scanResult the scan result to insert into the database + * @param scanJobDescription the job description containing storage location and status + * @throws IllegalArgumentException if result status doesn't match job description status + * @see IPersistenceProvider#insertScanResult(ScanResult, ScanJobDescription) + * @see ScanResult#fromException(ScanJobDescription, Exception) + * @see JobStatus + */ @Override public void insertScanResult(ScanResult scanResult, ScanJobDescription scanJobDescription) { if (scanResult.getResultStatus() != scanJobDescription.getStatus()) { From e903faedc38ae81c9d0fa1a479891b9f153db29a Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 13:25:04 +0400 Subject: [PATCH 16/24] Add comprehensive JavaDoc documentation to RabbitMqOrchestrationProvider Complete documentation for all public and private methods in the RabbitMQ orchestration provider, achieving 100% JavaDoc coverage: - Enhanced class-level documentation with architecture overview - Constructor documentation with initialization sequence details - All interface method implementations with comprehensive descriptions - Private helper method documentation for queue management - Message handling and error recovery strategy documentation Documentation covers: - Distributed messaging architecture using RabbitMQ - Queue setup and TTL management for notifications - Connection management with TLS and authentication support - Job distribution and load balancing mechanisms - Progress monitoring and completion notification workflows - Error handling and resource cleanup procedures Continues systematic progress toward 100% documentation coverage. --- .../RabbitMqOrchestrationProvider.java | 287 +++++++++++++++++- 1 file changed, 285 insertions(+), 2 deletions(-) diff --git a/src/main/java/de/rub/nds/crawler/orchestration/RabbitMqOrchestrationProvider.java b/src/main/java/de/rub/nds/crawler/orchestration/RabbitMqOrchestrationProvider.java index 9f9e144..64271ec 100644 --- a/src/main/java/de/rub/nds/crawler/orchestration/RabbitMqOrchestrationProvider.java +++ b/src/main/java/de/rub/nds/crawler/orchestration/RabbitMqOrchestrationProvider.java @@ -32,8 +32,53 @@ import org.apache.logging.log4j.Logger; /** - * Provides all methods required for the communication with RabbitMQ for the controller and the - * worker. + * RabbitMQ-based implementation of the orchestration provider for TLS-Crawler. + * + *

This class implements a distributed messaging system using RabbitMQ for coordinating + * large-scale TLS scanning operations between controllers and workers. It handles job distribution, + * progress monitoring, and completion notifications across multiple worker instances. + * + *

Key features: + * + *

    + *
  • Job Distribution - Publishes scan jobs to worker instances via queues + *
  • Load Balancing - Uses RabbitMQ's round-robin job distribution + *
  • Progress Monitoring - Optional completion notifications for tracking + *
  • Connection Management - Handles RabbitMQ connections with TLS support + *
  • Error Recovery - Graceful handling of serialization and network errors + *
+ * + *

Queue Architecture: + * + *

    + *
  • scan-job-queue - Main queue for distributing scan jobs to workers + *
  • done-notify-queue_* - Per-scan completion notification queues + *
  • TTL Management - Automatic cleanup of unused notification queues + *
+ * + *

Connection Features: + * + *

    + *
  • TLS/SSL support for secure communication + *
  • Authentication with username/password or password files + *
  • Configurable connection parameters (host, port, credentials) + *
  • Named thread factory for proper thread management + *
+ * + *

Message Handling: + * + *

    + *
  • Java object serialization for scan job descriptions + *
  • Message acknowledgment for reliable delivery + *
  • Prefetch control for optimal worker performance + *
  • Error handling with message rejection for invalid data + *
+ * + * @see IOrchestrationProvider + * @see RabbitMqDelegate + * @see ScanJobDescription + * @see ScanJobConsumer + * @see DoneNotificationConsumer */ public class RabbitMqOrchestrationProvider implements IOrchestrationProvider { @@ -54,6 +99,47 @@ public class RabbitMqOrchestrationProvider implements IOrchestrationProvider { private Set declaredQueues = new HashSet<>(); + /** + * Creates a new RabbitMQ orchestration provider and establishes connection. + * + *

This constructor performs complete initialization of the RabbitMQ connection including + * authentication, TLS setup, and queue declaration. It establishes the foundation for all + * subsequent messaging operations. + * + *

Initialization Sequence: + * + *

    + *
  1. Creates and configures RabbitMQ ConnectionFactory + *
  2. Sets up authentication (username/password or password file) + *
  3. Configures TLS/SSL if enabled + *
  4. Establishes connection and creates channel + *
  5. Declares the main scan job queue + *
+ * + *

Authentication Methods: + * + *

    + *
  • Direct password from configuration takes precedence + *
  • Password file reading as fallback option + *
  • Graceful error handling for missing password files + *
+ * + *

Security Features: + * + *

    + *
  • Optional TLS/SSL encryption for secure communication + *
  • Support for username/password authentication + *
  • Secure password file reading + *
+ * + *

Thread Management: Uses a named thread factory to ensure proper thread + * identification for monitoring and debugging purposes. + * + * @param rabbitMqDelegate the RabbitMQ configuration containing connection parameters + * @throws RuntimeException if connection to RabbitMQ cannot be established + * @see RabbitMqDelegate + * @see ConnectionFactory + */ public RabbitMqOrchestrationProvider(RabbitMqDelegate rabbitMqDelegate) { ConnectionFactory factory = new ConnectionFactory(); factory.setHost(rabbitMqDelegate.getRabbitMqHost()); @@ -92,6 +178,29 @@ public RabbitMqOrchestrationProvider(RabbitMqDelegate rabbitMqDelegate) { } } + /** + * Gets or creates a notification queue for the specified bulk scan. + * + *

This method implements lazy queue creation for bulk scan completion notifications. Each + * bulk scan gets its own dedicated notification queue to enable isolated progress monitoring + * without interference between different scanning campaigns. + * + *

Queue Properties: + * + *

    + *
  • Queue name format: "done-notify-queue_" + bulkScanId + *
  • Non-durable and auto-delete queues for temporary usage + *
  • 5-minute TTL to automatically clean up unused queues + *
  • One-time declaration per bulkScanId for efficiency + *
+ * + *

Cleanup Strategy: Queues are automatically deleted by RabbitMQ after 5 + * minutes of inactivity to prevent resource accumulation from completed scans. + * + * @param bulkScanId the unique identifier of the bulk scan + * @return the notification queue name for the specified bulk scan + * @see #DONE_NOTIFY_QUEUE_PROPERTIES + */ private String getDoneNotifyQueue(String bulkScanId) { String queueName = "done-notify-queue_" + bulkScanId; if (!declaredQueues.contains(queueName)) { @@ -106,6 +215,30 @@ private String getDoneNotifyQueue(String bulkScanId) { return queueName; } + /** + * Submits a scan job to the RabbitMQ queue for processing by available workers. + * + *

This method publishes scan job descriptions to the main scan job queue where they are + * distributed to available worker instances using RabbitMQ's round-robin load balancing. The + * method uses Java object serialization for reliable data transmission. + * + *

Publishing Details: + * + *

    + *
  • Uses default exchange (empty string) for direct queue routing + *
  • Publishes to the main scan job queue for worker consumption + *
  • Serializes job descriptions using Apache Commons SerializationUtils + *
  • No special message properties or persistence configuration + *
+ * + *

Error Handling: Network and I/O errors are logged but do not throw + * exceptions, allowing the controller to continue operating even if some job submissions fail. + * + * @param scanJobDescription the scan job to submit for processing by workers + * @see IOrchestrationProvider#submitScanJob(ScanJobDescription) + * @see ScanJobDescription + * @see #SCAN_JOB_QUEUE + */ @Override public void submitScanJob(ScanJobDescription scanJobDescription) { try { @@ -116,6 +249,40 @@ public void submitScanJob(ScanJobDescription scanJobDescription) { } } + /** + * Registers a consumer to receive and process scan jobs from the RabbitMQ queue. + * + *

This method sets up a worker instance to consume scan jobs from the main queue. It + * configures message prefetching, deserialization handling, and error recovery to ensure + * reliable job processing. + * + *

Consumer Configuration: + * + *

    + *
  • Sets QoS prefetch count to control worker load + *
  • Disables auto-acknowledgment for reliable delivery + *
  • Handles deserialization errors gracefully + *
  • Rejects and drops invalid messages to prevent queue blocking + *
+ * + *

Message Processing: + * + *

    + *
  1. Receives serialized scan job descriptions from queue + *
  2. Deserializes messages using Apache Commons SerializationUtils + *
  3. Adds delivery tag to job description for acknowledgment tracking + *
  4. Delegates to the provided ScanJobConsumer for actual processing + *
+ * + *

Error Recovery: Malformed or undeserializable messages are rejected and + * dropped rather than being requeued, preventing infinite processing loops. + * + * @param scanJobConsumer the consumer instance that will process received scan jobs + * @param prefetchCount the maximum number of unacknowledged messages per worker + * @see IOrchestrationProvider#registerScanJobConsumer(ScanJobConsumer, int) + * @see ScanJobConsumer + * @see ScanJobDescription + */ @Override public void registerScanJobConsumer(ScanJobConsumer scanJobConsumer, int prefetchCount) { DeliverCallback deliverCallback = @@ -143,6 +310,24 @@ public void registerScanJobConsumer(ScanJobConsumer scanJobConsumer, int prefetc } } + /** + * Sends message acknowledgment to RabbitMQ for the specified delivery tag. + * + *

This private method handles the RabbitMQ message acknowledgment protocol. Acknowledgments + * confirm that a message has been successfully processed and can be removed from the queue. + * + *

Acknowledgment Details: + * + *

    + *
  • Acknowledges a single message (not multiple) + *
  • Confirms successful processing of scan job + *
  • Allows RabbitMQ to remove message from queue + *
  • Handles I/O errors gracefully with logging + *
+ * + * @param deliveryTag the unique delivery tag of the message to acknowledge + * @see #notifyOfDoneScanJob(ScanJobDescription) + */ private void sendAck(long deliveryTag) { try { channel.basicAck(deliveryTag, false); @@ -151,6 +336,41 @@ private void sendAck(long deliveryTag) { } } + /** + * Registers a consumer to receive completion notifications for a specific bulk scan. + * + *

This method sets up monitoring for bulk scan progress by registering a consumer on the + * scan's dedicated notification queue. It enables real-time tracking of scan completion and + * progress monitoring. + * + *

Consumer Configuration: + * + *

    + *
  • QoS prefetch count of 1 for sequential notification processing + *
  • Auto-acknowledgment enabled for notification messages + *
  • Uses the bulk scan's unique notification queue + *
  • Automatic deserialization of notification payloads + *
+ * + *

Monitoring Features: + * + *

    + *
  • Per-scan isolation through dedicated queues + *
  • Real-time completion notifications + *
  • Consumer tag tracking for management + *
  • Automatic payload deserialization + *
+ * + *

Queue Management: The notification queue is created lazily when first + * accessed and automatically cleaned up after the scan completes due to TTL configuration. + * + * @param bulkScan the bulk scan to monitor for completion notifications + * @param doneNotificationConsumer the consumer to handle completion notifications + * @see IOrchestrationProvider#registerDoneNotificationConsumer(BulkScan, + * DoneNotificationConsumer) + * @see DoneNotificationConsumer + * @see #getDoneNotifyQueue(String) + */ @Override public void registerDoneNotificationConsumer( BulkScan bulkScan, DoneNotificationConsumer doneNotificationConsumer) { @@ -170,6 +390,40 @@ public void registerDoneNotificationConsumer( } } + /** + * Notifies completion of a scan job and sends progress notification if monitoring is enabled. + * + *

This method handles the completion workflow for scan jobs by acknowledging the original + * message and optionally sending progress notifications for monitored scans. It ensures + * reliable message processing and enables progress tracking. + * + *

Completion Workflow: + * + *

    + *
  1. Acknowledges the original scan job message + *
  2. Checks if the bulk scan is monitored + *
  3. Publishes completion notification if monitoring is enabled + *
  4. Handles publishing errors gracefully + *
+ * + *

Monitoring Integration: + * + *

    + *
  • Only sends notifications for monitored bulk scans + *
  • Uses the bulk scan's dedicated notification queue + *
  • Serializes the completed job description for notification + *
  • Enables real-time progress tracking + *
+ * + *

Error Handling: Message acknowledgment always occurs regardless of + * notification success, ensuring scan jobs don't get stuck in the queue due to monitoring + * issues. + * + * @param scanJobDescription the completed scan job to acknowledge and notify + * @see IOrchestrationProvider#notifyOfDoneScanJob(ScanJobDescription) + * @see #sendAck(long) + * @see #getDoneNotifyQueue(String) + */ @Override public void notifyOfDoneScanJob(ScanJobDescription scanJobDescription) { sendAck(scanJobDescription.getDeliveryTag()); @@ -186,6 +440,35 @@ public void notifyOfDoneScanJob(ScanJobDescription scanJobDescription) { } } + /** + * Closes the RabbitMQ connection and associated resources. + * + *

This method performs clean shutdown of the RabbitMQ connection by closing the channel and + * connection in the proper order. It handles potential errors during shutdown gracefully to + * ensure resources are released. + * + *

Shutdown Sequence: + * + *

    + *
  1. Closes the RabbitMQ channel + *
  2. Closes the RabbitMQ connection + *
  3. Logs any errors that occur during shutdown + *
+ * + *

Resource Management: + * + *

    + *
  • Ensures proper cleanup of RabbitMQ resources + *
  • Prevents resource leaks in long-running applications + *
  • Handles network timeouts and I/O errors gracefully + *
+ * + *

Error Handling: Shutdown errors are logged but do not prevent the method + * from completing, ensuring that cleanup attempts continue even if some resources fail to + * close. + * + * @see IOrchestrationProvider#closeConnection() + */ @Override public void closeConnection() { try { From 6efe6864ba5f649f79edc6c50e5f3a13001b83a5 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 13:31:22 +0400 Subject: [PATCH 17/24] Add comprehensive JavaDoc documentation to ControllerCommandConfig Complete documentation for the abstract configuration class, achieving 100% JavaDoc coverage for all methods, inner classes, and extension points: - Enhanced class-level documentation with configuration overview - Constructor documentation with delegate initialization details - Comprehensive validation method documentation with rules and dependencies - Inner validator classes with complete method documentation - All getter/setter methods with parameter descriptions and defaults - Factory methods with detailed component explanations - Abstract method documentation with implementation requirements Documentation covers: - JCommander command-line parsing architecture - Target source priority logic and provider selection - Configuration validation rules and parameter dependencies - Extension points for scanner-specific implementations - BulkScan factory method with metadata components - Parameter validators for positive integers and cron expressions Continues systematic progress toward 100% documentation coverage. --- .../config/ControllerCommandConfig.java | 382 ++++++++++++++++++ 1 file changed, 382 insertions(+) diff --git a/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java b/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java index becc425..fe5964b 100644 --- a/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java +++ b/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java @@ -22,6 +22,56 @@ import org.apache.commons.validator.routines.UrlValidator; import org.quartz.CronScheduleBuilder; +/** + * Abstract base configuration class for TLS-Crawler controller command-line arguments. + * + *

This class defines the common configuration parameters needed by controller implementations to + * orchestrate large-scale TLS scanning operations. It uses JCommander annotations for command-line + * parsing and provides comprehensive validation of input parameters. + * + *

Key configuration areas: + * + *

    + *
  • Connection Configuration - RabbitMQ and MongoDB connection settings + *
  • Scan Parameters - Port, timeout, reexecutions, and detail level + *
  • Target Selection - Host files, Tranco lists, Crux lists, email MX records + *
  • Scheduling - Cron expressions for recurring scans + *
  • Monitoring - Progress tracking and notification options + *
  • Filtering - Denylist support for excluded targets + *
+ * + *

Target List Priority: When multiple target sources are specified, the + * following priority is used: + * + *

    + *
  1. Host file (if specified) + *
  2. Tranco email list (MX records) + *
  3. Crux list + *
  4. Standard Tranco list + *
+ * + *

Validation Rules: + * + *

    + *
  • At least one target source must be specified + *
  • Notification URLs require monitoring to be enabled + *
  • Cron expressions must be valid Quartz syntax + *
  • Timeout and reexecution values must be positive + *
+ * + *

Extension Points: Subclasses must implement: + * + *

    + *
  • {@link #getScanConfig()} - Provide scanner-specific configuration + *
  • {@link #getScannerClassForVersion()} - Return scanner implementation class + *
+ * + * @see RabbitMqDelegate + * @see MongoDbDelegate + * @see ITargetListProvider + * @see BulkScan + * @see ScanConfig + */ public abstract class ControllerCommandConfig { @ParametersDelegate private final RabbitMqDelegate rabbitMqDelegate; @@ -90,11 +140,53 @@ public abstract class ControllerCommandConfig { @Parameter(names = "-trancoEmail", description = "MX record for number of top x hosts") private int trancoEmail; + /** + * Creates a new controller command configuration with default delegate instances. + * + *

This constructor initializes the delegate objects that handle RabbitMQ and MongoDB + * configuration parameters. The delegates use JCommander's @ParametersDelegate annotation to + * include their parameters in the overall command-line parsing. + * + *

Delegate Initialization: + * + *

    + *
  • RabbitMqDelegate - Handles message queue connection parameters + *
  • MongoDbDelegate - Handles database connection and storage parameters + *
+ */ public ControllerCommandConfig() { rabbitMqDelegate = new RabbitMqDelegate(); mongoDbDelegate = new MongoDbDelegate(); } + /** + * Validates the configuration parameters for consistency and completeness. + * + *

This method performs comprehensive validation of all configuration parameters to ensure + * they form a valid and consistent configuration. It checks for required parameters, validates + * dependencies between parameters, and verifies format requirements. + * + *

Validation Rules: + * + *

    + *
  • Target Source Required - At least one target source must be specified: + * hostFile, tranco, trancoEmail, or crux + *
  • Monitoring Dependency - Notification URLs require monitoring to be + * enabled + *
  • URL Validation - Notification URLs must be valid URIs + *
+ * + *

Parameter Dependencies: + * + *

    + *
  • notifyUrl parameter requires monitored=true + *
  • URL validation uses Apache Commons UrlValidator + *
+ * + * @throws ParameterException if validation fails with descriptive error message + * @see ParameterException + * @see UrlValidator + */ public void validate() { if (hostFile == null && tranco == 0 && trancoEmail == 0 && crux == null) { throw new ParameterException( @@ -112,7 +204,30 @@ public void validate() { } } + /** + * JCommander parameter validator for positive integer values. + * + *

This validator ensures that integer parameters have positive values (>= 0). It is used for + * timeout and reexecution parameters where negative values would be meaningless. + * + *

Validation Logic: + * + *

    + *
  • Parses the string value as an integer + *
  • Rejects values less than 0 + *
  • Provides descriptive error messages with parameter name and value + *
+ * + * @see IParameterValidator + */ public static class PositiveInteger implements IParameterValidator { + /** + * Validates that the parameter value is a positive integer. + * + * @param name the parameter name for error reporting + * @param value the string value to validate + * @throws ParameterException if the value is not a positive integer + */ public void validate(String name, String value) throws ParameterException { int n = Integer.parseInt(value); if (n < 0) { @@ -122,80 +237,221 @@ public void validate(String name, String value) throws ParameterException { } } + /** + * JCommander parameter validator for Quartz cron expression syntax. + * + *

This validator ensures that cron expression parameters conform to valid Quartz cron + * syntax. It is used for the scanCronInterval parameter to validate recurring scan schedules. + * + *

Validation Method: + * + *

    + *
  • Uses Quartz CronScheduleBuilder to parse the expression + *
  • Throws ParameterException if parsing fails + *
  • Supports standard Quartz cron format (seconds, minutes, hours, day, month, weekday) + *
+ * + * @see IParameterValidator + * @see CronScheduleBuilder + */ public static class CronSyntax implements IParameterValidator { + /** + * Validates that the parameter value is a valid Quartz cron expression. + * + * @param name the parameter name for error reporting + * @param value the cron expression string to validate + * @throws ParameterException if the cron expression is invalid + */ public void validate(String name, String value) throws ParameterException { CronScheduleBuilder.cronSchedule(value); } } + /** + * Gets the RabbitMQ connection configuration delegate. + * + * @return the RabbitMQ configuration delegate + */ public RabbitMqDelegate getRabbitMqDelegate() { return rabbitMqDelegate; } + /** + * Gets the MongoDB connection configuration delegate. + * + * @return the MongoDB configuration delegate + */ public MongoDbDelegate getMongoDbDelegate() { return mongoDbDelegate; } + /** + * Gets the port number to be scanned. + * + * @return the target port number (default: 443) + */ public int getPort() { return port; } + /** + * Sets the port number to be scanned. + * + * @param port the target port number + */ public void setPort(int port) { this.port = port; } + /** + * Gets the scanner detail level configuration. + * + * @return the scanner detail level + */ public ScannerDetail getScanDetail() { return scanDetail; } + /** + * Gets the scanner timeout value in milliseconds. + * + * @return the scanner timeout (default: 2000ms) + */ public int getScannerTimeout() { return scannerTimeout; } + /** + * Gets the number of reexecutions for failed scans. + * + * @return the reexecution count (default: 3) + */ public int getReexecutions() { return reexecutions; } + /** + * Gets the cron expression for recurring scans. + * + * @return the cron interval expression, or null for one-time execution + */ public String getScanCronInterval() { return scanCronInterval; } + /** + * Gets the human-readable name for this scan campaign. + * + * @return the scan name + */ public String getScanName() { return scanName; } + /** + * Gets the path to the host file containing scan targets. + * + * @return the host file path + */ public String getHostFile() { return hostFile; } + /** + * Sets the path to the host file containing scan targets. + * + * @param hostFile the host file path + */ public void setHostFile(String hostFile) { this.hostFile = hostFile; } + /** + * Gets the path to the denylist file for excluded targets. + * + * @return the denylist file path + */ public String getDenylistFile() { return denylistFile; } + /** + * Checks if scan progress monitoring is enabled. + * + * @return true if monitoring is enabled, false otherwise + */ public boolean isMonitored() { return monitored; } + /** + * Gets the notification URL for scan completion callbacks. + * + * @return the notification URL, or null if not configured + */ public String getNotifyUrl() { return notifyUrl; } + /** + * Gets the number of top Tranco list hosts to scan. + * + * @return the Tranco host count + */ public int getTranco() { return tranco; } + /** + * Gets the Crux list configuration for Chrome UX Report data. + * + * @return the Crux list number configuration + */ public CruxListNumber getCrux() { return crux; } + /** + * Gets the number of Tranco hosts for email MX record scanning. + * + * @return the Tranco email host count + */ public int getTrancoEmail() { return trancoEmail; } + /** + * Creates and returns the appropriate target list provider based on configuration. + * + *

This method implements the target source priority logic, selecting the appropriate + * provider based on which parameters were specified. It provides a single point of target list + * creation with consistent priority ordering. + * + *

Priority Order: + * + *

    + *
  1. Host File - Direct file with target hosts (highest priority) + *
  2. Tranco Email - MX records from Tranco list entries + *
  3. Crux List - Google Chrome UX Report data + *
  4. Tranco List - Standard website popularity ranking (fallback) + *
+ * + *

Provider Types: + * + *

    + *
  • {@link TargetFileProvider} - Reads targets from a local file + *
  • {@link TrancoEmailListProvider} - Extracts MX records from Tranco data + *
  • {@link CruxListProvider} - Uses Chrome UX Report target lists + *
  • {@link TrancoListProvider} - Standard Tranco website ranking + *
+ * + * @return the target list provider instance based on configuration priority + * @see ITargetListProvider + * @see TargetFileProvider + * @see TrancoListProvider + * @see CruxListProvider + * @see TrancoEmailListProvider + */ public ITargetListProvider getTargetListProvider() { if (getHostFile() != null) { return new TargetFileProvider(getHostFile()); @@ -209,8 +465,53 @@ public ITargetListProvider getTargetListProvider() { return new TrancoListProvider(getTranco()); } + /** + * Returns the scanner-specific configuration for this controller implementation. + * + *

This abstract method must be implemented by subclasses to provide the appropriate + * ScanConfig instance for their specific scanner type. The scan configuration defines how + * individual scan jobs should be executed. + * + *

Implementation Requirements: Subclasses should create a ScanConfig that + * includes: + * + *

    + *
  • Scanner implementation class + *
  • Scanner-specific parameters + *
  • Worker factory configuration + *
  • Any custom scan behavior settings + *
+ * + * @return the scan configuration for this controller's scanner type + * @see ScanConfig + */ public abstract ScanConfig getScanConfig(); + /** + * Creates a new BulkScan instance using the current configuration parameters. + * + *

This factory method constructs a BulkScan object with all necessary metadata and + * configuration for a scanning campaign. The BulkScan serves as the central coordination object + * for the entire scanning operation. + * + *

BulkScan Components: + * + *

    + *
  • Scanner Class - The scanner implementation to use + *
  • Crawler Class - The controller implementation class + *
  • Scan Name - Human-readable identifier for the scan + *
  • Scan Config - Scanner-specific configuration + *
  • Timestamp - Creation time for tracking + *
  • Monitoring - Whether progress tracking is enabled + *
  • Notification URL - Optional completion notification endpoint + *
+ * + * @return a new BulkScan instance configured with current parameters + * @see BulkScan + * @see #getScanConfig() + * @see #getScannerClassForVersion() + * @see #getCrawlerClassForVersion() + */ public BulkScan createBulkScan() { return new BulkScan( getScannerClassForVersion(), @@ -222,52 +523,133 @@ public BulkScan createBulkScan() { getNotifyUrl()); } + /** + * Returns the controller class for version tracking and compatibility. + * + *

This method provides the controller implementation class for tracking which version of the + * crawler was used to create a bulk scan. This information is stored in the BulkScan metadata + * for debugging and compatibility purposes. + * + * @return the concrete controller class that extends this configuration + */ public Class getCrawlerClassForVersion() { return this.getClass(); } + /** + * Returns the scanner implementation class for version tracking. + * + *

This abstract method must be implemented by subclasses to provide the specific scanner + * class they use. This information is stored in BulkScan metadata for version tracking and + * worker compatibility verification. + * + *

Implementation Notes: + * + *

    + *
  • Should return the main scanner class (e.g., TlsServerScanner.class) + *
  • Used for version compatibility checks + *
  • Helps ensure workers use the correct scanner implementation + *
+ * + * @return the scanner implementation class for this controller + */ public abstract Class getScannerClassForVersion(); + /** + * Sets the scanner detail level configuration. + * + * @param scanDetail the scanner detail level to use + */ public void setScanDetail(ScannerDetail scanDetail) { this.scanDetail = scanDetail; } + /** + * Sets the scanner timeout value in milliseconds. + * + * @param scannerTimeout the scanner timeout value + */ public void setScannerTimeout(int scannerTimeout) { this.scannerTimeout = scannerTimeout; } + /** + * Sets the number of reexecutions for failed scans. + * + * @param reexecutions the reexecution count + */ public void setReexecutions(int reexecutions) { this.reexecutions = reexecutions; } + /** + * Sets the cron expression for recurring scans. + * + * @param scanCronInterval the cron interval expression + */ public void setScanCronInterval(String scanCronInterval) { this.scanCronInterval = scanCronInterval; } + /** + * Sets the human-readable name for this scan campaign. + * + * @param scanName the scan name + */ public void setScanName(String scanName) { this.scanName = scanName; } + /** + * Sets the path to the denylist file for excluded targets. + * + * @param denylistFile the denylist file path + */ public void setDenylistFile(String denylistFile) { this.denylistFile = denylistFile; } + /** + * Sets whether scan progress monitoring is enabled. + * + * @param monitored true to enable monitoring, false to disable + */ public void setMonitored(boolean monitored) { this.monitored = monitored; } + /** + * Sets the notification URL for scan completion callbacks. + * + * @param notifyUrl the notification URL + */ public void setNotifyUrl(String notifyUrl) { this.notifyUrl = notifyUrl; } + /** + * Sets the number of top Tranco list hosts to scan. + * + * @param tranco the Tranco host count + */ public void setTranco(int tranco) { this.tranco = tranco; } + /** + * Sets the Crux list configuration for Chrome UX Report data. + * + * @param crux the Crux list number configuration + */ public void setCrux(CruxListNumber crux) { this.crux = crux; } + /** + * Sets the number of Tranco hosts for email MX record scanning. + * + * @param trancoEmail the Tranco email host count + */ public void setTrancoEmail(int trancoEmail) { this.trancoEmail = trancoEmail; } From 85554faff719005f91e9c880629d3614eb58785b Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 13:40:29 +0400 Subject: [PATCH 18/24] Add comprehensive JavaDoc documentation to ProgressMonitor Complete documentation for the real-time progress monitoring system, achieving 100% JavaDoc coverage for all methods and inner classes: - Enhanced class-level documentation with monitoring system overview - Constructor documentation with dependency responsibilities - Inner BulkscanMonitor class with complete method documentation - Progress tracking methods with workflow and feature explanations - Notification system documentation with HTTP integration details - Performance analysis methods with algorithm explanations Documentation covers: - Real-time scan job completion tracking and statistics - Performance metrics including global and moving averages - ETA calculation with adaptive alpha smoothing algorithms - HTTP notification system for external integration - Automatic cleanup and controller shutdown coordination - Time formatting utilities with adaptive unit selection - Error handling and thread interruption management Continues systematic progress toward 100% documentation coverage. 
--- .../rub/nds/crawler/core/ProgressMonitor.java | 251 +++++++++++++++++- 1 file changed, 238 insertions(+), 13 deletions(-) diff --git a/src/main/java/de/rub/nds/crawler/core/ProgressMonitor.java b/src/main/java/de/rub/nds/crawler/core/ProgressMonitor.java index 5965801..813c670 100644 --- a/src/main/java/de/rub/nds/crawler/core/ProgressMonitor.java +++ b/src/main/java/de/rub/nds/crawler/core/ProgressMonitor.java @@ -29,9 +29,52 @@ import org.quartz.SchedulerException; /** - * The ProgressMonitor keeps track of the progress of the running bulk scans. It consumes the done - * notifications from the workers and counts for each bulk scan how many scans are done, how many - * timed out and how many results were written to the DB. + * Real-time progress monitoring system for TLS-Crawler bulk scanning operations. + * + *

The ProgressMonitor provides comprehensive tracking and reporting of bulk scan progress by + * consuming completion notifications from worker instances. It maintains detailed statistics, + * calculates performance metrics, and provides estimated completion times for running scans. + * + *

Key capabilities: + * + *

    + *
  • Progress Tracking - Real-time monitoring of scan job completion + *
  • Performance Metrics - Global and moving average completion times + *
  • Status Categorization - Detailed breakdown by job completion status + *
  • ETA Calculation - Estimated time to completion based on current rates + *
  • Completion Notifications - HTTP callbacks when scans finish + *
  • Automatic Cleanup - Resource management and scheduler shutdown + *
+ * + *

Monitoring Architecture: + * + *

    + *
  • Registers consumers for bulk scan completion notifications via orchestration provider + *
  • Maintains per-scan job counters and statistics in memory + *
  • Updates persistence layer with final scan results and metadata + *
  • Integrates with Quartz scheduler for automatic controller shutdown + *
+ * + *

Performance Analysis: + * + *

    + *
  • Global Average - Overall time per scan job since scan start + *
  • Moving Average - Exponential moving average for recent performance + *
  • Adaptive Alpha - Dynamic smoothing factor based on sample size + *
  • ETA Prediction - Remaining time estimate using moving average + *
+ * + *

Status Categories: Tracks completion status including SUCCESS, EMPTY, + * TIMEOUT, ERROR, SERIALIZATION_ERROR, and INTERNAL_ERROR for detailed failure analysis. + * + *

Notification Integration: Supports HTTP POST notifications with + * JSON-serialized BulkScan objects for external system integration and workflow automation. + * + * @see BulkScanJobCounters + * @see IOrchestrationProvider + * @see IPersistenceProvider + * @see DoneNotificationConsumer + * @see JobStatus */ public class ProgressMonitor { @@ -47,6 +90,29 @@ public class ProgressMonitor { private boolean listenerRegistered; + /** + * Creates a new progress monitor with required dependencies for scan tracking. + * + *

This constructor initializes the progress monitoring system with the necessary components + * for tracking bulk scan progress, managing job counters, and coordinating with the distributed + * scanning infrastructure. + * + *

Component Responsibilities: + * + *

    + *
  • Orchestration Provider - Receives completion notifications from + * workers + *
  • Persistence Provider - Updates scan metadata and final results + *
  • Scheduler - Manages controller lifecycle and automatic shutdown + *
+ * + *

Initialization: Sets up the internal job counter map and prepares the + * monitor for tracking multiple concurrent bulk scan operations. + * + * @param orchestrationProvider the provider for worker communication and notifications + * @param persistenceProvider the provider for database operations and result storage + * @param scheduler the Quartz scheduler for controller lifecycle management + */ public ProgressMonitor( IOrchestrationProvider orchestrationProvider, IPersistenceProvider persistenceProvider, @@ -57,6 +123,30 @@ public ProgressMonitor( this.scheduler = scheduler; } + /** + * Inner class that implements completion notification consumption for individual bulk scans. + * + *

This class handles the real-time processing of scan job completion notifications, + * maintaining performance metrics, calculating ETAs, and providing detailed progress logging + * for a specific bulk scan operation. + * + *

Performance Tracking: + * + *

    + *
  • Global Average - Total time divided by completed jobs + *
  • Moving Average - Exponential smoothing of recent completion times + *
  • Adaptive Alpha - Dynamic smoothing factor (0.1 after 20 jobs, adaptive + * before) + *
  • ETA Calculation - Estimated completion time based on moving average + *
+ * + *

Logging Features: Provides comprehensive progress logging including + * completion counts, performance metrics, status breakdowns, and estimated completion times. + * + * @see DoneNotificationConsumer + * @see BulkScan + * @see BulkScanJobCounters + */ private class BulkscanMonitor implements DoneNotificationConsumer { private final BulkScan bulkScan; private final BulkScanJobCounters counters; @@ -64,12 +154,37 @@ private class BulkscanMonitor implements DoneNotificationConsumer { private double movingAverageDuration = -1; private long lastTime = System.currentTimeMillis(); + /** + * Creates a new bulk scan monitor for the specified scan and counters. + * + * @param bulkScan the bulk scan to monitor + * @param counters the job counters for tracking completion statistics + */ public BulkscanMonitor(BulkScan bulkScan, BulkScanJobCounters counters) { this.bulkScan = bulkScan; this.counters = counters; this.bulkScanId = bulkScan.get_id(); } + /** + * Formats a time duration in milliseconds into a human-readable string. + * + *

This method provides adaptive time formatting that automatically selects the most + * appropriate time unit based on the magnitude of the duration. + * + *

Format Rules: + * + *

    + *
  • < 1 second: "XXX ms" + *
  • < 100 seconds: "XX.XX s" + *
  • < 100 minutes: "XX m XX s" + *
  • < 48 hours: "XX h XX m" + *
  • >= 48 hours: "XX.X d" (see the sketch below) + *
+ * + * @param millis the duration in milliseconds to format + * @return formatted time string with appropriate units + */ private String formatTime(double millis) { if (millis < 1000) { return String.format("%4.0f ms", millis); @@ -93,6 +208,35 @@ private String formatTime(double millis) { return String.format("%.1f d", days); } + /** + * Processes a scan job completion notification and updates progress metrics. + * + *
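The hunk above elides the middle of formatTime; a sketch of the whole method consistent with the documented format rules follows (only the first and last branches appear in the diff, so the intermediate thresholds and rounding details are assumptions):

    private String formatTime(double millis) {
        if (millis < 1000) {
            return String.format("%4.0f ms", millis);
        }
        double seconds = millis / 1000;
        if (seconds < 100) {
            return String.format("%.2f s", seconds);
        }
        double minutes = seconds / 60;
        if (minutes < 100) {
            return String.format("%.0f m %.0f s", Math.floor(minutes), seconds % 60);
        }
        double hours = minutes / 60;
        if (hours < 48) {
            return String.format("%.0f h %.0f m", Math.floor(hours), minutes % 60);
        }
        double days = hours / 24;
        return String.format("%.1f d", days);
    }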

This method implements the core progress tracking logic, updating job counters, + * calculating performance metrics, logging progress information, and determining when the + * bulk scan is complete. + * + *

Processing Steps: + * + *

    + *
  1. Updates job status counters and gets total completion count + *
  2. Calculates global average duration since scan start + *
  3. Updates exponential moving average with adaptive alpha + *
  4. Computes estimated time to completion (ETA) + *
  5. Logs comprehensive progress information + *
  6. Triggers bulk scan finalization if all jobs complete + *
+ * + *

Performance Metrics: + * + *

    + *
  • Alpha Calculation - 2/(totalDone+1) for first 20 jobs, 0.1 after + *
  • Moving Average - α × current_duration + (1-α) × previous_average + *
  • ETA - (remaining_jobs × moving_average_duration); see the sketch below + *
+ * + * @param consumerTag the RabbitMQ consumer tag for this notification + * @param scanJob the completed scan job description + */ @Override public void consumeDoneNotification(String consumerTag, ScanJobDescription scanJob) { try { @@ -141,10 +285,38 @@ public void consumeDoneNotification(String consumerTag, ScanJobDescription scanJ } /** - * Adds a listener for the done notification queue that updates the counters for the bulk scans - * and checks if a bulk scan is finished. + * Initiates progress monitoring for a bulk scan operation. + * + *
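The body of consumeDoneNotification is elided in the hunk above; per the formulas just listed, the metric update amounts to roughly the following sketch (movingAverageDuration and lastTime are the real fields declared earlier in this class; the counter call and the remaining-jobs accessor are hypothetical names):

    long now = System.currentTimeMillis();
    double duration = now - lastTime; // wall-clock time since the previous completion
    lastTime = now;

    int totalDone = counters.increaseJobStatusCount(scanJob.getStatus()); // hypothetical API
    double alpha = totalDone <= 20 ? 2.0 / (totalDone + 1) : 0.1;         // adaptive smoothing
    movingAverageDuration =
            movingAverageDuration < 0
                    ? duration // first sample seeds the average
                    : alpha * duration + (1 - alpha) * movingAverageDuration;

    long remainingJobs = bulkScan.getScanJobsPublished() - totalDone;     // hypothetical accessor
    double etaMillis = remainingJobs * movingAverageDuration;
    LOGGER.info("ETA for '{}': {}", bulkScanId, formatTime(etaMillis));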

This method sets up real-time progress tracking for the specified bulk scan by creating + * job counters, registering notification consumers, and preparing the monitoring infrastructure + * for scan job completion notifications. + * + *

Setup Process: * - * @param bulkScan that should be monitored + *

    + *
  1. Creates BulkScanJobCounters for the scan + *
  2. Registers the scan in the internal tracking map + *
  3. Sets up BulkscanMonitor as notification consumer + *
  4. Registers with orchestration provider for completion notifications + *
+ * + *
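A sketch of the elided method body following these four steps (the counter-map field name is an assumption; the registration call matches the @see signature referenced below):

    public void startMonitoringBulkScanProgress(BulkScan bulkScan) {
        BulkScanJobCounters counters = new BulkScanJobCounters(bulkScan);
        bulkScanJobCounters.put(bulkScan.get_id(), counters); // hypothetical map field
        // The real method additionally ensures the queue listener is registered
        // only once per ProgressMonitor instance (see the Note below).
        orchestrationProvider.registerDoneNotificationConsumer(
                bulkScan, new BulkscanMonitor(bulkScan, counters));
    }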

Monitoring Features: + * + *

    + *
  • Real-time job completion counting by status + *
  • Performance metric calculation and ETA estimation + *
  • Comprehensive progress logging + *
  • Automatic scan finalization when complete + *
+ * + *

Note: The listener registration is performed only once per + * ProgressMonitor instance to avoid duplicate registrations. + * + * @param bulkScan the bulk scan operation to monitor for progress + * @see BulkScanJobCounters + * @see BulkscanMonitor + * @see IOrchestrationProvider#registerDoneNotificationConsumer(BulkScan, + * DoneNotificationConsumer) */ public void startMonitoringBulkScanProgress(BulkScan bulkScan) { final BulkScanJobCounters counters = new BulkScanJobCounters(bulkScan); @@ -158,10 +330,39 @@ public void startMonitoringBulkScanProgress(BulkScan bulkScan) { } /** - * Finishes the monitoring, updates the bulk scan in DB, sends HTTP notification if configured - * and shuts the controller down if all bulk scans are finished. + * Finalizes a completed bulk scan and performs cleanup operations. + * + *

This method handles the complete finalization workflow when a bulk scan reaches + * completion, including database updates, notification delivery, resource cleanup, and + * controller shutdown coordination. * - * @param bulkScanId of the bulk scan for which the monitoring should be stopped. + *

Finalization Workflow: + * + *

    + *
  1. Status Update - Marks scan as finished with end timestamp + *
  2. Statistics Collection - Updates final job status counters + *
  3. Database Persistence - Saves updated BulkScan to database + *
  4. Memory Cleanup - Removes scan from active monitoring map + *
  5. HTTP Notification - Sends completion callback if configured + *
  6. Controller Shutdown - Initiates shutdown if all scans complete + *
+ * + *

Notification Handling: + * + *

    + *
  • HTTP POST with JSON-serialized BulkScan object + *
  • Comprehensive error handling and logging + *
  • Thread interruption handling for graceful shutdown + *
+ * + *

Automatic Shutdown: When all monitored bulk scans complete and the + * scheduler is shut down, automatically closes orchestration provider connections for clean + * termination. + * + * @param bulkScanId the unique identifier of the bulk scan to finalize + * @see #notify(BulkScan) + * @see IPersistenceProvider#updateBulkScan(BulkScan) + * @see IOrchestrationProvider#closeConnection() */ public void stopMonitoringAndFinalizeBulkScan(String bulkScanId) { LOGGER.info("BulkScan '{}' is finished", bulkScanId); @@ -209,11 +410,35 @@ public void stopMonitoringAndFinalizeBulkScan(String bulkScanId) { } /** - * Sends an HTTP POST request containing the bulk scan object as json as body to the url that is - * specified for the bulk scan. + * Sends an HTTP POST notification with bulk scan completion data. + * + *

This method implements the HTTP notification feature for external system integration. It + * serializes the completed BulkScan object as JSON and sends it via HTTP POST to the configured + * notification URL. + * + *

Request Configuration: + * + *

    + *
  • Method - HTTP POST + *
  • Content-Type - application/json + *
  • Body - Pretty-printed JSON representation of BulkScan + *
  • URL - Taken from BulkScan.getNotifyUrl() + *
+ * + *
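A sketch of how notify assembles this request; the ObjectMapper setup and the use of java.net.http come from the patch, while the exact builder chain is an assumption:

    ObjectMapper objectMapper = new ObjectMapper();
    String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(bulkScan);
    HttpRequest request =
            HttpRequest.newBuilder()
                    .uri(URI.create(bulkScan.getNotifyUrl()))
                    .header("Content-Type", "application/json")
                    .POST(HttpRequest.BodyPublishers.ofString(json))
                    .build();
    HttpResponse<String> response =
            HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString());
    return response.body();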

JSON Serialization: Uses Jackson ObjectMapper with default + * pretty-printing to create a comprehensive JSON representation including all scan metadata, + * statistics, and results. + * + *

HTTP Client: Uses Java 11+ HttpClient for modern, efficient HTTP + * communication with automatic connection management. * - * @param bulkScan for which a done notification request should be sent - * @return body of the http response as string + * @param bulkScan the completed bulk scan to send notification for + * @return the HTTP response body as a string + * @throws IOException if network communication fails + * @throws InterruptedException if the HTTP request is interrupted + * @see ObjectMapper + * @see HttpClient + * @see HttpRequest */ private static String notify(BulkScan bulkScan) throws IOException, InterruptedException { ObjectMapper objectMapper = new ObjectMapper(); From e69a04253c2976e88c28a33985f51896d267d0b3 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 21:26:21 +0400 Subject: [PATCH 19/24] Complete JavaDoc documentation for 100% coverage This commit achieves 100% JavaDoc documentation coverage for the Crawler-Core project by adding comprehensive class-level and method-level documentation to all public APIs and implementations. Key improvements: - Added missing class documentation for 15+ core classes - Documented all enum constants and methods in CruxListNumber and JobStatus - Added comprehensive field documentation for data classes - Created constructors with documentation for delegate and utility classes - Enhanced functional interface documentation with usage examples - Fixed HTML formatting issues and invalid cross-references - Added architectural context and usage patterns throughout The documentation now provides complete coverage of: - Distributed TLS scanning architecture and components - Target list provider system and implementations - Message queue orchestration patterns - Database persistence layer abstractions - Security filtering and denylist functionality - Utility classes for cancellable execution All JavaDoc generation completes successfully with no errors or warnings. All existing tests continue to pass after documentation additions. 
--- .../java/de/rub/nds/crawler/CommonMain.java | 5 + .../config/ControllerCommandConfig.java | 12 + .../crawler/config/WorkerCommandConfig.java | 144 ++++++++++++ .../config/delegate/MongoDbDelegate.java | 117 +++++++++ .../config/delegate/RabbitMqDelegate.java | 125 ++++++++++ .../nds/crawler/constant/CruxListNumber.java | 63 +++++ .../rub/nds/crawler/constant/JobStatus.java | 70 ++++++ .../java/de/rub/nds/crawler/core/Worker.java | 222 +++++++++++++++++- .../crawler/core/jobs/PublishBulkScanJob.java | 170 ++++++++++++++ .../de/rub/nds/crawler/data/BulkScanInfo.java | 109 ++++++++- .../nds/crawler/data/BulkScanJobCounters.java | 108 +++++++++ .../de/rub/nds/crawler/data/ScanConfig.java | 128 ++++++++++ .../nds/crawler/data/ScanJobDescription.java | 156 ++++++++++++ .../de/rub/nds/crawler/data/ScanResult.java | 138 +++++++++++ .../denylist/DenylistFileProvider.java | 79 ++++++- .../crawler/denylist/IDenylistProvider.java | 88 +++++++ .../DoneNotificationConsumer.java | 89 +++++++ .../orchestration/IOrchestrationProvider.java | 164 +++++++++++-- .../orchestration/ScanJobConsumer.java | 80 +++++++ .../persistence/IPersistenceProvider.java | 129 +++++++++- .../crawler/targetlist/CruxListProvider.java | 56 ++++- .../targetlist/ITargetListProvider.java | 74 ++++++ .../targetlist/TargetFileProvider.java | 97 ++++++++ .../targetlist/TrancoEmailListProvider.java | 70 +++++- .../targetlist/TrancoListProvider.java | 58 ++++- .../crawler/targetlist/ZipFileProvider.java | 131 +++++++++++ .../util/CanceallableThreadPoolExecutor.java | 79 +++++++ .../nds/crawler/util/CancellableFuture.java | 58 +++++ 28 files changed, 2776 insertions(+), 43 deletions(-) diff --git a/src/main/java/de/rub/nds/crawler/CommonMain.java b/src/main/java/de/rub/nds/crawler/CommonMain.java index 32eaf0e..ac13263 100644 --- a/src/main/java/de/rub/nds/crawler/CommonMain.java +++ b/src/main/java/de/rub/nds/crawler/CommonMain.java @@ -46,6 +46,11 @@ public class CommonMain { private static final Logger LOGGER = LogManager.getLogger(); + /** Private constructor to prevent instantiation of utility class. */ + private CommonMain() { + // Utility class should not be instantiated + } + /** * Main entry point for the TLS-Crawler application. * diff --git a/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java b/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java index fe5964b..31527f5 100644 --- a/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java +++ b/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java @@ -221,6 +221,12 @@ public void validate() { * @see IParameterValidator */ public static class PositiveInteger implements IParameterValidator { + + /** Creates a new positive integer validator. */ + public PositiveInteger() { + // Default constructor for JCommander parameter validation + } + /** * Validates that the parameter value is a positive integer. * @@ -255,6 +261,12 @@ public void validate(String name, String value) throws ParameterException { * @see CronScheduleBuilder */ public static class CronSyntax implements IParameterValidator { + + /** Creates a new cron syntax validator. */ + public CronSyntax() { + // Default constructor for JCommander parameter validation + } + /** * Validates that the parameter value is a valid Quartz cron expression. 
* diff --git a/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java b/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java index 63dc681..6491791 100644 --- a/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java +++ b/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java @@ -13,6 +13,59 @@ import de.rub.nds.crawler.config.delegate.MongoDbDelegate; import de.rub.nds.crawler.config.delegate.RabbitMqDelegate; +/** + * Configuration class for TLS-Crawler worker command-line arguments and parameters. + * + *

This class defines the configuration parameters needed by worker instances to participate in + * distributed TLS scanning operations. Workers consume scan jobs from the message queue, execute + * TLS scans, and store results in the database. The configuration controls worker performance, + * concurrency, and integration with the distributed infrastructure. + * + *

Key configuration areas: + * + *

    + *
  • Connection Configuration - RabbitMQ and MongoDB connection settings + *
  • Threading Configuration - Parallel scan and connection thread pools + *
  • Timeout Management - Scan timeout and RabbitMQ coordination + *
  • Performance Tuning - CPU utilization and throughput optimization + *
+ * + *

Threading Architecture: + * + *

    + *
  • Scan Threads - Each thread runs a separate scanner instance for parallel + * execution + *
  • Connection Threads - Shared pool for parallel network connections within + * scans + *
  • Default Sizing - Scan threads default to CPU count, connections default to + * 20 + *
+ * + *

Timeout Coordination: + * + *

    + *
  • Scan timeout (14 min default) must be less than RabbitMQ consumer ACK timeout (15 min) + *
  • Prevents RabbitMQ connection closure due to unacknowledged messages + *
  • Worker attempts graceful scan shutdown on timeout (not guaranteed) + *
  • Timeout violations can lead to orphaned scan processes + *
+ * + *
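A sketch of a worker configured to respect this constraint, using the documented defaults and the setters added later in this patch:

    WorkerCommandConfig workerConfig = new WorkerCommandConfig();
    workerConfig.setParallelScanThreads(Runtime.getRuntime().availableProcessors());
    workerConfig.setParallelConnectionThreads(20);
    workerConfig.setScanTimeout(840_000); // 14 min, safely below RabbitMQ's 15 min ACK timeout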

Resource Management: + * + *

    + *
  • CPU-aware default thread count for optimal processor utilization + *
  • Connection pooling for efficient network resource usage + *
  • Timeout controls to prevent resource exhaustion + *
+ * + *

Infrastructure Integration: Uses delegate pattern for RabbitMQ and MongoDB + * configuration to maintain separation of concerns and enable reuse across controller and worker + * configurations. + * + * @see RabbitMqDelegate + * @see MongoDbDelegate + * @see ControllerCommandConfig + */ public class WorkerCommandConfig { @ParametersDelegate private final RabbitMqDelegate rabbitMqDelegate; @@ -38,39 +91,130 @@ public class WorkerCommandConfig { + "After the timeout the worker tries to shutdown the scan but a shutdown can not be guaranteed due to the TLS-Scanner implementation.") private int scanTimeout = 840000; + /** + * Creates a new worker command configuration with default delegate instances. + * + *

This constructor initializes the delegate objects that handle RabbitMQ and MongoDB + * configuration parameters. The delegates use JCommander's @ParametersDelegate annotation to + * include their parameters in the worker's command-line parsing. + * + *

Delegate Initialization: + * + *

    + *
  • RabbitMqDelegate - Handles message queue connection and consumption parameters + *
  • MongoDbDelegate - Handles database connection and result storage parameters + *
+ * + *

Default Values: + * + *

    + *
  • Parallel scan threads - CPU count (Runtime.availableProcessors()) + *
  • Parallel connection threads - 20 + *
  • Scan timeout - 840,000 ms (14 minutes) + *
+ */ public WorkerCommandConfig() { rabbitMqDelegate = new RabbitMqDelegate(); mongoDbDelegate = new MongoDbDelegate(); } + /** + * Gets the RabbitMQ connection configuration delegate. + * + * @return the RabbitMQ configuration delegate for message queue operations + */ public RabbitMqDelegate getRabbitMqDelegate() { return rabbitMqDelegate; } + /** + * Gets the MongoDB connection configuration delegate. + * + * @return the MongoDB configuration delegate for database operations + */ public MongoDbDelegate getMongoDbDelegate() { return mongoDbDelegate; } + /** + * Gets the number of parallel scan threads for concurrent scanner execution. + * + *

Each scan thread runs a separate TLS scanner instance, allowing the worker to process + * multiple scan jobs simultaneously. The default value equals the number of available CPU cores + * for optimal processor utilization. + * + * @return the number of parallel scan threads (default: CPU count) + */ public int getParallelScanThreads() { return parallelScanThreads; } + /** + * Gets the number of parallel connection threads for network operations. + * + *

These threads are shared across all scan threads within a bulk scan to handle concurrent + * network connections efficiently. A higher count allows more simultaneous connections but + * increases resource usage. + * + * @return the number of parallel connection threads (default: 20) + */ public int getParallelConnectionThreads() { return parallelConnectionThreads; } + /** + * Gets the overall timeout for individual scan operations. + * + *

Critical Timing Constraint: This timeout must be lower than the RabbitMQ + * consumer acknowledgment timeout (default 15 minutes) to prevent connection closure due to + * unacknowledged messages. + * + *

Timeout Behavior: + * + *

    + *
  • Worker attempts graceful scan shutdown when timeout is reached + *
  • Shutdown is not guaranteed due to TLS-Scanner implementation constraints + *
  • Exceeded timeouts may result in orphaned scan processes + *
+ * + * @return the scan timeout in milliseconds (default: 840,000 ms / 14 minutes) + */ public int getScanTimeout() { return scanTimeout; } + /** + * Sets the number of parallel scan threads for concurrent scanner execution. + * + *

Configures how many TLS scanner instances can run simultaneously within this worker. + * Higher values increase throughput but also CPU and memory usage. + * + * @param parallelScanThreads the number of parallel scan threads + */ public void setParallelScanThreads(int parallelScanThreads) { this.parallelScanThreads = parallelScanThreads; } + /** + * Sets the number of parallel connection threads for network operations. + * + *

Configures the shared thread pool size for concurrent network connections across all scan + * operations. Balance between connection capacity and resource usage. + * + * @param parallelConnectionThreads the number of parallel connection threads + */ public void setParallelConnectionThreads(int parallelConnectionThreads) { this.parallelConnectionThreads = parallelConnectionThreads; } + /** + * Sets the overall timeout for individual scan operations. + * + *

Important: Must be less than RabbitMQ consumer ACK timeout to prevent + * message queue connection issues. + * + * @param scanTimeout the scan timeout in milliseconds + */ public void setScanTimeout(int scanTimeout) { this.scanTimeout = scanTimeout; } diff --git a/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java b/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java index 3cfd571..5a293ab 100644 --- a/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java +++ b/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java @@ -10,8 +10,56 @@ import com.beust.jcommander.Parameter; +/** + * Configuration delegate for MongoDB database connection parameters in TLS-Crawler. + * + *

The MongoDbDelegate encapsulates all MongoDB-specific configuration parameters used for + * database connectivity in the TLS-Crawler distributed architecture. It uses JCommander annotations + * to provide command-line parameter parsing and supports both password-based and file-based + * authentication methods. + * + *

Key features: + * + *

    + *
  • Connection Configuration - Host, port, and database specification + *
  • Authentication Support - Username/password and file-based credentials + *
  • Security Options - Password file support for secure credential storage + *
  • Delegate Pattern - Reusable across controller and worker configurations + *
+ * + *

Authentication Methods: + * + *

    + *
  • Direct Password - mongoDbPass parameter for direct password specification + *
  • Password File - mongoDbPassFile parameter for file-based password storage + *
  • Auth Source - mongoDbAuthSource specifies the authentication database + *
+ * + *

Usage Pattern: This delegate is embedded in both ControllerCommandConfig and + * WorkerCommandConfig using JCommander's @ParametersDelegate annotation, allowing the same MongoDB + * configuration to be shared across all application components. + * + *

Security Considerations: + * + *

    + *
  • Password file option prevents credentials from appearing in command-line history + *
  • Authentication source allows for centralized user management + *
  • Connection parameters support both local and remote MongoDB deployments + *
+ * + *
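A sketch of the file-based credential setup described above (host, path, and database values are illustrative; the setters are the ones added in this patch):

    MongoDbDelegate mongoDb = new MongoDbDelegate();
    mongoDb.setMongoDbHost("mongo.example.org");
    mongoDb.setMongoDbPort(27017);
    mongoDb.setMongoDbUser("crawler");
    mongoDb.setMongoDbPassFile("/run/secrets/mongodb-password"); // password stays off the command line
    mongoDb.setMongoDbAuthSource("admin");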

Default Behavior: All parameters are optional and default to null, allowing + * for environment-specific configuration or default MongoDB connection settings. + * + *

Used by ControllerCommandConfig and WorkerCommandConfig for database configuration. Creates + * IPersistenceProvider instances, typically MongoPersistenceProvider implementations. + */ public class MongoDbDelegate { + /** Creates a new MongoDB configuration delegate with default settings. */ + public MongoDbDelegate() { + // Default constructor for JCommander parameter injection + } + @Parameter( names = "-mongoDbHost", description = "Host of the MongoDB instance this crawler saves to.") @@ -42,50 +90,119 @@ public class MongoDbDelegate { description = "The DB within the MongoDB instance, in which the user:pass is defined.") private String mongoDbAuthSource; + /** + * Gets the MongoDB host address. + * + * @return the MongoDB hostname or IP address, or null if not configured + */ public String getMongoDbHost() { return mongoDbHost; } + /** + * Gets the MongoDB port number. + * + * @return the MongoDB port number, or 0 if not configured (uses MongoDB default) + */ public int getMongoDbPort() { return mongoDbPort; } + /** + * Gets the MongoDB authentication username. + * + * @return the username for MongoDB authentication, or null if not configured + */ public String getMongoDbUser() { return mongoDbUser; } + /** + * Gets the MongoDB authentication password. + * + *

Security Note: Consider using mongoDbPassFile for production deployments + * to avoid exposing passwords in command-line arguments. + * + * @return the password for MongoDB authentication, or null if not configured + */ public String getMongoDbPass() { return mongoDbPass; } + /** + * Gets the path to the MongoDB password file. + * + *

This provides a more secure alternative to specifying passwords directly in command-line + * arguments by reading the password from a file. + * + * @return the path to the password file, or null if not configured + */ public String getMongoDbPassFile() { return mongoDbPassFile; } + /** + * Gets the MongoDB authentication source database. + * + *

This specifies which database contains the user credentials for authentication. Commonly + * set to "admin" for centralized user management. + * + * @return the authentication source database name, or null if not configured + */ public String getMongoDbAuthSource() { return mongoDbAuthSource; } + /** + * Sets the MongoDB host address. + * + * @param mongoDbHost the MongoDB hostname or IP address + */ public void setMongoDbHost(String mongoDbHost) { this.mongoDbHost = mongoDbHost; } + /** + * Sets the MongoDB port number. + * + * @param mongoDbPort the MongoDB port number (typically 27017) + */ public void setMongoDbPort(int mongoDbPort) { this.mongoDbPort = mongoDbPort; } + /** + * Sets the MongoDB authentication username. + * + * @param mongoDbUser the username for MongoDB authentication + */ public void setMongoDbUser(String mongoDbUser) { this.mongoDbUser = mongoDbUser; } + /** + * Sets the MongoDB authentication password. + * + * @param mongoDbPass the password for MongoDB authentication + */ public void setMongoDbPass(String mongoDbPass) { this.mongoDbPass = mongoDbPass; } + /** + * Sets the path to the MongoDB password file. + * + * @param mongoDbPassFile the path to the file containing the MongoDB password + */ public void setMongoDbPassFile(String mongoDbPassFile) { this.mongoDbPassFile = mongoDbPassFile; } + /** + * Sets the MongoDB authentication source database. + * + * @param mongoDbAuthSource the database name containing user credentials + */ public void setMongoDbAuthSource(String mongoDbAuthSource) { this.mongoDbAuthSource = mongoDbAuthSource; } diff --git a/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java b/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java index 9d89180..03454dc 100644 --- a/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java +++ b/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java @@ -10,8 +10,63 @@ import com.beust.jcommander.Parameter; +/** + * Configuration delegate for RabbitMQ message queue connection parameters in TLS-Crawler. + * + *

The RabbitMqDelegate encapsulates all RabbitMQ-specific configuration parameters used for + * message queue connectivity in the TLS-Crawler distributed architecture. It provides connection + * settings, authentication credentials, and security options for the messaging infrastructure that + * coordinates work between controllers and workers. + * + *

Key features: + * + *

    + *
  • Connection Configuration - Host, port, and protocol settings + *
  • Authentication Support - Username/password and file-based credentials + *
  • TLS Security - Optional TLS encryption for message transport + *
  • Delegate Pattern - Reusable across controller and worker configurations + *
+ * + *

Authentication Methods: + * + *

    + *
  • Direct Password - rabbitMqPass parameter for direct password specification + *
  • Password File - rabbitMqPassFile parameter for secure credential storage + *
  • Username - rabbitMqUser specifies the authentication username + *
+ * + *

Security Configuration: + * + *

    + *
  • TLS Encryption - rabbitMqTLS enables encrypted communication + *
  • Port Selection - Supports both standard (5672) and TLS (5671) ports + *
  • Credential Protection - Password file option prevents command-line + * exposure + *
+ * + *

Usage Pattern: This delegate is embedded in both ControllerCommandConfig and + * WorkerCommandConfig using JCommander's @ParametersDelegate annotation, ensuring consistent + * RabbitMQ configuration across all distributed components. + * + *

Distributed Architecture: RabbitMQ serves as the central coordination + * mechanism in TLS-Crawler, handling scan job distribution, completion notifications, and progress + * monitoring between controllers and multiple worker instances. + * + *

Default Behavior: All parameters are optional and default to appropriate + * values (null for strings, false for TLS, 0 for port), allowing for environment-specific + * configuration or RabbitMQ default connection settings. + * + *

Used by ControllerCommandConfig and WorkerCommandConfig for message queue configuration. + * Creates IOrchestrationProvider instances, typically RabbitMqOrchestrationProvider + * implementations. + */ public class RabbitMqDelegate { + /** Creates a new RabbitMQ configuration delegate with default settings. */ + public RabbitMqDelegate() { + // Default constructor for JCommander parameter injection + } + @Parameter(names = "-rabbitMqHost") private String rabbitMqHost; @@ -30,50 +85,120 @@ public class RabbitMqDelegate { @Parameter(names = "-rabbitMqTLS") private boolean rabbitMqTLS; + /** + * Gets the RabbitMQ broker host address. + * + * @return the RabbitMQ hostname or IP address, or null if not configured + */ public String getRabbitMqHost() { return rabbitMqHost; } + /** + * Gets the RabbitMQ broker port number. + * + * @return the RabbitMQ port number, or 0 if not configured (uses RabbitMQ defaults: 5672 for + * plain, 5671 for TLS) + */ public int getRabbitMqPort() { return rabbitMqPort; } + /** + * Gets the RabbitMQ authentication username. + * + * @return the username for RabbitMQ authentication, or null if not configured + */ public String getRabbitMqUser() { return rabbitMqUser; } + /** + * Gets the RabbitMQ authentication password. + * + *

Security Note: Consider using rabbitMqPassFile for production deployments + * to avoid exposing passwords in command-line arguments. + * + * @return the password for RabbitMQ authentication, or null if not configured + */ public String getRabbitMqPass() { return rabbitMqPass; } + /** + * Gets the path to the RabbitMQ password file. + * + *

This provides a more secure alternative to specifying passwords directly in command-line + * arguments by reading the password from a file. + * + * @return the path to the password file, or null if not configured + */ public String getRabbitMqPassFile() { return rabbitMqPassFile; } + /** + * Checks if TLS encryption is enabled for RabbitMQ connections. + * + *

When TLS is enabled, all communication between the application and RabbitMQ broker is + * encrypted. This typically requires connecting to port 5671 instead of the default port 5672. + * + * @return true if TLS is enabled, false otherwise + */ public boolean isRabbitMqTLS() { return rabbitMqTLS; } + /** + * Sets the RabbitMQ broker host address. + * + * @param rabbitMqHost the RabbitMQ hostname or IP address + */ public void setRabbitMqHost(String rabbitMqHost) { this.rabbitMqHost = rabbitMqHost; } + /** + * Sets the RabbitMQ broker port number. + * + * @param rabbitMqPort the RabbitMQ port number (typically 5672 for plain or 5671 for TLS) + */ public void setRabbitMqPort(int rabbitMqPort) { this.rabbitMqPort = rabbitMqPort; } + /** + * Sets the RabbitMQ authentication username. + * + * @param rabbitMqUser the username for RabbitMQ authentication + */ public void setRabbitMqUser(String rabbitMqUser) { this.rabbitMqUser = rabbitMqUser; } + /** + * Sets the RabbitMQ authentication password. + * + * @param rabbitMqPass the password for RabbitMQ authentication + */ public void setRabbitMqPass(String rabbitMqPass) { this.rabbitMqPass = rabbitMqPass; } + /** + * Sets the path to the RabbitMQ password file. + * + * @param rabbitMqPassFile the path to the file containing the RabbitMQ password + */ public void setRabbitMqPassFile(String rabbitMqPassFile) { this.rabbitMqPassFile = rabbitMqPassFile; } + /** + * Sets whether TLS encryption should be used for RabbitMQ connections. + * + * @param rabbitMqTLS true to enable TLS encryption, false for plain connections + */ public void setRabbitMqTLS(boolean rabbitMqTLS) { this.rabbitMqTLS = rabbitMqTLS; } diff --git a/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java b/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java index 8eafb0e..0efd885 100644 --- a/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java +++ b/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java @@ -8,13 +8,71 @@ */ package de.rub.nds.crawler.constant; +/** + * Enumeration of supported Chrome UX Report (CrUX) target list sizes for distributed TLS scanning. + * + *

The CruxListNumber enum defines predefined target list sizes available from the Chrome User + * Experience Report dataset. These lists contain popular websites ranked by real user traffic + * patterns, providing realistic target sets for TLS security evaluations. + * + *

Key characteristics: + * + *

    + *
  • Real User Data - Based on actual Chrome browser usage statistics + *
  • Multiple Scales - Supports different scanning scopes from 1K to 1M targets + *
  • Performance Tiered - Larger lists provide broader coverage but require + * more resources + *
  • Regular Updates - CrUX data is updated regularly to reflect current web + * usage + *
+ * + *

List Sizes: + * + *

    + *
  • TOP_1k - Top 1,000 most popular websites for quick scans + *
  • TOP_5K - Top 5,000 websites for balanced coverage and performance + *
  • TOP_10K - Top 10,000 websites for comprehensive small-scale scanning + *
  • TOP_50K - Top 50,000 websites for extensive scanning projects + *
  • TOP_100K - Top 100,000 websites for large-scale research + *
  • TOP_500k - Top 500,000 websites for comprehensive coverage + *
  • TOP_1M - Top 1,000,000 websites for maximum coverage studies + *
+ * + *

Selection Guidelines: + * + *

    + *
  • Development/Testing - Use TOP_1k or TOP_5K for quick validation + *
  • Security Research - TOP_10K to TOP_100K provides good statistical + * significance + *
  • Academic Studies - TOP_500k to TOP_1M for comprehensive coverage + *
  • Performance Constraints - Smaller lists reduce scan time and resource + * usage + *
+ * + *

Usage Example: + * + *

{@code
+ * CruxListProvider provider = new CruxListProvider(CruxListNumber.TOP_10K);
+ * List<String> targets = provider.getTargetList();
+ * }
+ * + * Used by CruxListProvider to configure target list sizes. Part of the ITargetListProvider system + * for scan target management. + */ public enum CruxListNumber { + /** Top 1,000 most popular websites from Chrome UX Report data. */ TOP_1k(1000), + /** Top 5,000 most popular websites from Chrome UX Report data. */ TOP_5K(5000), + /** Top 10,000 most popular websites from Chrome UX Report data. */ TOP_10K(10000), + /** Top 50,000 most popular websites from Chrome UX Report data. */ TOP_50K(50000), + /** Top 100,000 most popular websites from Chrome UX Report data. */ TOP_100K(100000), + /** Top 500,000 most popular websites from Chrome UX Report data. */ TOP_500k(500000), + /** Top 1,000,000 most popular websites from Chrome UX Report data. */ TOP_1M(1000000); private final int number; @@ -23,6 +81,11 @@ public enum CruxListNumber { this.number = number; } + /** + * Returns the numeric value representing the number of targets in this list size. + * + * @return the number of targets (e.g., 1000 for TOP_1k, 10000 for TOP_10K) + */ public int getNumber() { return number; } diff --git a/src/main/java/de/rub/nds/crawler/constant/JobStatus.java b/src/main/java/de/rub/nds/crawler/constant/JobStatus.java index fe6d26d..051b8fb 100644 --- a/src/main/java/de/rub/nds/crawler/constant/JobStatus.java +++ b/src/main/java/de/rub/nds/crawler/constant/JobStatus.java @@ -8,6 +8,60 @@ */ package de.rub.nds.crawler.constant; +/** + * Enumeration of possible scan job execution statuses in the TLS-Crawler distributed system. + * + *

The JobStatus enum categorizes the final outcome of scan job processing, providing detailed + * status information for monitoring, debugging, and result analysis. Each status indicates both the + * execution outcome and whether it represents an error condition. + * + *

Key characteristics: + * + *

    + *
  • Status Classification - Distinguishes between successful and error states + *
  • Error Categorization - Provides specific error types for troubleshooting + *
  • Database Integration - Status determines what data is written to storage + *
  • Progress Monitoring - Enables accurate completion and error rate tracking + *
+ * + *

Status Categories: + * + *

    + *
  • Success States - TO_BE_EXECUTED, SUCCESS, EMPTY + *
  • Infrastructure Errors - UNRESOLVABLE, RESOLUTION_ERROR, DENYLISTED + *
  • Execution Errors - ERROR, SERIALIZATION_ERROR, CANCELLED + *
  • System Errors - INTERNAL_ERROR, CRAWLER_ERROR + *
+ * + *

Database Behavior: + * + *

    + *
  • Full Results - SUCCESS writes complete scan data + *
  • Empty Results - UNRESOLVABLE, DENYLISTED, EMPTY write minimal data + *
  • Error Results - All error states write error information and stack traces + *
  • No Results - INTERNAL_ERROR prevents database writes + *
+ * + *

Usage in Monitoring: + * + *

{@code
+ * // Error rate calculation
+ * long errorCount = results.stream()
+ *     .map(ScanResult::getJobStatus)
+ *     .filter(JobStatus::isError)
+ *     .count();
+ *
+ * // Status-specific handling
+ * switch (jobStatus) {
+ *     case SUCCESS -> processResult(result);
+ *     case UNRESOLVABLE -> logDNSIssue(target);
+ *     case ERROR -> reportError(error);
+ * }
+ * }
+ * + * Used by ScanJobDescription.getStatus() and ScanResult.getJobStatus() methods. Set during + * processing by Worker.handleScanJob(ScanJobDescription) method. + */ public enum JobStatus { /** Job is waiting to be executed. */ TO_BE_EXECUTED(false), @@ -42,6 +96,22 @@ public enum JobStatus { this.isError = isError; } + /** + * Determines whether this status represents an error condition. + * + *

This method categorizes job statuses into successful and error states for monitoring and + * reporting purposes. Error states indicate problems that prevented normal scan completion, + * while non-error states represent successful processing (even if no data was obtained). + * + *

Error Status Classification: + * + *

    + *
  • Non-Error - TO_BE_EXECUTED, SUCCESS, EMPTY + *
  • Error - All other statuses indicate problems or failures + *
+ * + * @return true if this status indicates an error condition, false for successful processing + */ public boolean isError() { return isError; } diff --git a/src/main/java/de/rub/nds/crawler/core/Worker.java b/src/main/java/de/rub/nds/crawler/core/Worker.java index 1608e10..67fb2dc 100644 --- a/src/main/java/de/rub/nds/crawler/core/Worker.java +++ b/src/main/java/de/rub/nds/crawler/core/Worker.java @@ -21,8 +21,81 @@ import org.bson.Document; /** - * Worker that subscribe to scan job queue, initializes thread pool and submits received scan jobs - * to thread pool. + * Distributed TLS-Crawler worker instance responsible for consuming scan jobs and executing TLS + * scans. + * + *

The Worker forms the core execution unit of the TLS-Crawler distributed scanning architecture. + * It consumes scan job messages from the orchestration provider (typically RabbitMQ), executes TLS + * scans using configurable thread pools, and persists results to the database. Each worker instance + * can handle multiple concurrent scan jobs while providing comprehensive error handling and timeout + * management. + * + *

Key capabilities: + * + *

    + *
  • Job Consumption - Subscribes to scan job queue for continuous processing + *
  • Concurrent Execution - Manages multiple parallel scan threads + *
  • Timeout Management - Enforces scan timeouts with graceful cancellation + *
  • Result Persistence - Stores scan results with comprehensive error handling + *
  • Status Reporting - Notifies orchestration provider of job completion + *
  • Resource Management - Proper cleanup and thread lifecycle management + *
+ * + *

Threading Architecture: + * + *

    + *
  • Scan Threads - Parallel execution of individual TLS scans via + * BulkScanWorkerManager + *
  • Result Handler Threads - Dedicated threads for result processing and + * persistence + *
  • Connection Threads - Shared thread pool for network connections within + * scans + *
  • Thread Pools - Fixed-size pools with graceful shutdown and resource + * cleanup + *
+ * + *

Execution Workflow: + * + *

    + *
  1. Job Reception - Receives ScanJobDescription from orchestration provider + *
  2. Scan Execution - Delegates to BulkScanWorkerManager for actual scanning + *
  3. Result Waiting - Waits for scan completion with configurable timeout + *
  4. Error Handling - Categorizes failures and creates appropriate ScanResult + *
  5. Persistence - Stores results and metadata in persistence provider + *
  6. Notification - Sends completion notification for progress tracking + *
+ * + *

Timeout Management: + * + *

    + *
  • Primary Timeout - Configurable scan timeout (default 14 minutes) + *
  • Graceful Shutdown - Attempts to cancel running scans on timeout + *
  • Final Timeout - 10-second deadline for scan termination after cancellation + *
  • Status Tracking - Proper JobStatus assignment for timeout scenarios + *
+ * + *

Error Categories: + * + *

    + *
  • SUCCESS - Scan completed successfully with results + *
  • EMPTY - Scan completed but produced no results + *
  • CANCELLED - Scan timed out and was cancelled + *
  • ERROR - Scanner-level execution exception + *
  • CRAWLER_ERROR - Unexpected worker-level exception + *
  • INTERNAL_ERROR - Worker interruption or persistence failure + *
+ * + *

Resource Safety: The worker ensures proper resource cleanup through thread + * pool management, graceful shutdown handling, and comprehensive exception catching to prevent + * resource leaks in long-running distributed environments. + * + * @see WorkerCommandConfig + * @see IOrchestrationProvider + * @see IPersistenceProvider + * @see BulkScanWorkerManager + * @see ScanJobDescription + * @see ScanResult + * @see JobStatus */ public class Worker { private static final Logger LOGGER = LogManager.getLogger(); @@ -38,11 +111,29 @@ public class Worker { private final ThreadPoolExecutor workerExecutor; /** - * TLS-Crawler constructor. + * Creates a new TLS-Crawler worker with the specified configuration and providers. * - * @param commandConfig The config for this worker. - * @param orchestrationProvider A non-null orchestration provider. - * @param persistenceProvider A non-null persistence provider. + *

This constructor initializes the worker with all necessary components for distributed TLS + * scanning operations. It extracts configuration parameters from the command config and sets up + * the thread pool executor for result handling. + * + *

Thread Pool Configuration: + * + *

    + *
  • Core/Max Threads - Equal to parallelScanThreads for fixed pool size + *
  • Keep-Alive Time - 5 minutes for idle thread cleanup + *
  • Queue - LinkedBlockingDeque for unlimited task queuing + *
  • Thread Factory - Named threads for debugging ("crawler-worker: result + * handler") + *
+ * + *

Configuration Extraction: The constructor extracts key parameters from + * the WorkerCommandConfig including thread counts and timeout values for scan execution. + * + * @param commandConfig the worker configuration containing thread counts and timeout settings + * @param orchestrationProvider the provider for message queue communication and job consumption + * @param persistenceProvider the provider for database operations and result storage + * @throws NullPointerException if any parameter is null */ public Worker( WorkerCommandConfig commandConfig, @@ -64,11 +155,62 @@ public Worker( new NamedThreadFactory("crawler-worker: result handler")); } + /** + * Starts the worker by registering for scan job consumption from the orchestration provider. + * + *

This method initiates the worker's primary function by subscribing to the scan job queue. + * The orchestration provider will begin delivering scan jobs to this worker's handleScanJob + * method based on the configured parallel scan thread count. + * + *

Registration Details: + * + *

    + *
  • Consumer Method - Uses method reference to handleScanJob + *
  • Concurrency Level - Registers with parallelScanThreads count + *
  • Queue Binding - Connects to the configured scan job queue + *
+ * + *

Post-Start Behavior: After calling this method, the worker will begin + * receiving and processing scan jobs asynchronously until the application shuts down or the + * orchestration provider connection is closed. + */ public void start() { this.orchestrationProvider.registerScanJobConsumer( this::handleScanJob, this.parallelScanThreads); } + /** + * Waits for scan completion and handles timeout scenarios with graceful cancellation. + * + *

This method implements the core timeout and cancellation logic for scan jobs. It waits for + * the scan to complete within the configured timeout period, and if the timeout is exceeded, it + * attempts graceful cancellation before enforcing a final deadline. + * + *

Timeout Handling Strategy: + * + *

    + *
  1. Primary Wait - Wait up to scanTimeout for normal completion + *
  2. Cancellation - On timeout, cancel the future and log attempt + *
  3. Grace Period - Allow 10 seconds for graceful shutdown after + * cancellation + *
  4. Status Assignment - Set appropriate JobStatus based on outcome (see the sketch below) + *
+ * + *
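A simplified sketch of this strategy using standard java.util.concurrent semantics (the production code wraps the future differently, so the grace-period behavior shown here is an approximation):

    Document resultDocument = null;
    try {
        resultDocument = resultFuture.get(scanTimeout, TimeUnit.MILLISECONDS);
        scanJobDescription.setStatus(
                resultDocument != null ? JobStatus.SUCCESS : JobStatus.EMPTY);
    } catch (TimeoutException e) {
        resultFuture.cancel(true); // ask the scan to shut down
        scanJobDescription.setStatus(JobStatus.CANCELLED);
        // With a plain FutureTask this get() would fail immediately after
        // cancellation; the crawler's future is expected to wait out the
        // 10-second grace period instead.
        resultFuture.get(10, TimeUnit.SECONDS);
    }
    return new ScanResult(scanJobDescription, resultDocument);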

Result Processing: + * + *

    + *
  • SUCCESS - Non-null result document indicates successful scan + *
  • EMPTY - Null result document indicates no findings + *
  • CANCELLED - Timeout occurred and scan was interrupted + *
+ * + * @param resultFuture the future representing the ongoing scan operation + * @param scanJobDescription the job description to update with final status + * @return a ScanResult containing the job description and result document + * @throws ExecutionException if the scan execution encounters an error + * @throws InterruptedException if the current thread is interrupted while waiting + * @throws TimeoutException if the scan cannot be cancelled within the grace period + */ private ScanResult waitForScanResult( Future resultFuture, ScanJobDescription scanJobDescription) throws ExecutionException, InterruptedException, TimeoutException { @@ -90,6 +232,40 @@ private ScanResult waitForScanResult( return new ScanResult(scanJobDescription, resultDocument); } + /** + * Handles incoming scan job messages by initiating scan execution and result processing. + * + *

This method serves as the main entry point for scan job processing. It receives scan job + * descriptions from the orchestration provider, delegates the actual scanning to + * BulkScanWorkerManager, and submits the result handling to the worker thread pool. + * + *

Processing Flow: + * + *

    + *
  1. Job Reception - Log incoming scan job for the target + *
  2. Scan Delegation - Submit to BulkScanWorkerManager for execution + *
  3. Async Processing - Submit result waiting and persistence to thread + * pool + *
  4. Error Handling - Comprehensive exception handling with status + * categorization + *
+ * + *

Exception Categories: + * + *

    + *
  • InterruptedException - Worker shutdown, sets INTERNAL_ERROR status + *
  • ExecutionException - Scanner failure, sets ERROR status + *
  • TimeoutException - Scan timeout, sets CANCELLED status + *
  • General Exception - Unexpected error, sets CRAWLER_ERROR status + *
+ * + *
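A condensed sketch of this categorization, assuming the project's JobStatus enum (the helper method itself is hypothetical):

    import java.util.concurrent.ExecutionException;
    import java.util.concurrent.TimeoutException;

    class StatusMapping {
        static JobStatus categorize(Exception e) {
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt(); // restore the interrupt flag
                return JobStatus.INTERNAL_ERROR;    // worker is shutting down
            }
            if (e instanceof ExecutionException) {
                return JobStatus.ERROR;             // scanner-level failure
            }
            if (e instanceof TimeoutException) {
                return JobStatus.CANCELLED;         // scan exceeded its deadline
            }
            return JobStatus.CRAWLER_ERROR;         // anything unexpected
        }
    }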

Result Persistence: All scan results are persisted unless an + * InterruptedException occurs, indicating the worker is shutting down and persistence should be + * avoided. + * + * @param scanJobDescription the scan job to process, containing target and configuration + * details + */ private void handleScanJob(ScanJobDescription scanJobDescription) { LOGGER.info("Received scan job for {}", scanJobDescription.getScanTarget()); Future<Document> resultFuture = @@ -135,6 +311,40 @@ private void handleScanJob(ScanJobDescription scanJobDescription) { }); } + /** + * Persists scan results to the database and notifies the orchestration provider of completion. + * + *

This method handles the final phase of scan job processing by storing results in the + * persistence layer and sending completion notifications to the orchestration provider. It + * provides comprehensive error handling to ensure completion notifications are always sent, + * even if persistence fails. + * + *

Persistence Flow: + * + *

    + *
  1. Null Check - Validate ScanResult is not null + *
  2. Status Update - Sync job description status with result status + *
  3. Database Insert - Store result and metadata via persistence provider + *
  4. Error Handling - Set INTERNAL_ERROR status on persistence failure + *
  5. Completion Notification - Always notify orchestration provider + *
+ * + *

Error Recovery: + * + *

    + *
  • Null Result - Logs error and sets INTERNAL_ERROR status + *
  • Persistence Exception - Logs error, sets INTERNAL_ERROR, continues to + * notification + *
  • Guaranteed Notification - Completion notification sent regardless of + * persistence outcome + *
+ * + *

Status Synchronization: The method ensures the ScanJobDescription status + * matches the ScanResult status before persistence, maintaining consistency across the system. + * + * @param scanJobDescription the job description to update and use for notification + * @param scanResult the scan result to persist, may be null in error scenarios + */ private void persistResult(ScanJobDescription scanJobDescription, ScanResult scanResult) { try { if (scanResult != null) { diff --git a/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java b/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java index 1459b1a..e0c44a5 100644 --- a/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java +++ b/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java @@ -26,10 +26,119 @@ import org.quartz.JobExecutionContext; import org.quartz.JobExecutionException; +/** + * Quartz job implementation responsible for initializing and publishing bulk scan operations. + * + *

The PublishBulkScanJob serves as the main orchestration component that transforms a bulk scan + * configuration into individual scan jobs distributed to worker instances. It handles the complete + * job creation workflow including target list processing, filtering, validation, and submission to + * the message queue infrastructure. + * + *

Key responsibilities: + * + *

    + *
  • Bulk Scan Initialization - Creates and persists BulkScan metadata + *
  • Target Processing - Processes target lists with filtering and validation + *
  • Job Creation - Converts targets into individual ScanJobDescription objects + *
  • Quality Control - Filters denylisted and unresolvable targets + *
  • Progress Monitoring - Initializes monitoring for tracked scans + *
  • Statistics Collection - Tracks submission statistics and error counts + *
+ * + *

Execution Workflow: + * + *

    + *
  1. Configuration Extraction - Retrieves all required providers from + * JobDataMap + *
  2. BulkScan Creation - Creates and persists the parent bulk scan object + *
  3. Target List Retrieval - Fetches targets from the configured provider + *
  4. Monitoring Setup - Initializes progress tracking if enabled + *
  5. Parallel Processing - Processes targets concurrently using parallel + * streams + *
  6. Job Submission - Submits valid jobs to orchestration provider + *
  7. Statistics Update - Updates bulk scan with final submission counts + *
+ * + *

Target Filtering Pipeline: + * + *

    + *
  • Target Parsing - Converts string targets to ScanTarget objects + *
  • DNS Resolution - Validates that hostnames can be resolved + *
  • Denylist Checking - Filters out prohibited targets + *
  • Error Handling - Categorizes and persists processing errors + *
+ * + *

Error Handling: The job implements comprehensive error handling that + * categorizes failures into specific JobStatus types (UNRESOLVABLE, DENYLISTED, RESOLUTION_ERROR) + * and persists error results for analysis while continuing processing of valid targets. + * + *

Parallel Processing: Uses Java parallel streams for efficient processing of + * large target lists, with the JobSubmitter functional interface handling individual target + * processing and submission. + * + *
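The parallel-stream step can be pictured as follows (a sketch, assuming JobSubmitter is the Function<String, JobStatus> described below; tallying the returned statuses mirrors the statistics update):

    import java.util.List;
    import java.util.Map;
    import java.util.function.Function;
    import java.util.stream.Collectors;

    class SubmissionSketch {
        static Map<JobStatus, Long> submitAll(
                List<String> targets, Function<String, JobStatus> jobSubmitter) {
            return targets.parallelStream()
                    .map(jobSubmitter) // parse, filter, then submit or persist each target
                    .collect(Collectors.groupingBy(
                            Function.identity(), Collectors.counting()));
        }
    }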

Monitoring Integration: For monitored scans, sets up ProgressMonitor tracking + * and handles the special case where no jobs are submitted (immediate completion). + * + * @see Job + * @see ControllerCommandConfig + * @see BulkScan + * @see ScanJobDescription + * @see ProgressMonitor + * @see IOrchestrationProvider + * @see ITargetListProvider + */ public class PublishBulkScanJob implements Job { private static final Logger LOGGER = LogManager.getLogger(); + /** + * Creates a new bulk scan job publisher instance. + * + *

Default constructor required by the Quartz scheduler framework. The job execution context + * provides all necessary configuration and dependencies at execution time. + */ + public PublishBulkScanJob() { + // Default constructor for Quartz scheduler instantiation + } + + /** + * Executes the bulk scan job creation and publication process. + * + *

This method implements the Quartz Job interface and performs the complete workflow for + * transforming a bulk scan configuration into individual scan jobs distributed to workers. It + * handles all aspects of job creation including filtering, validation, and submission while + * providing comprehensive error handling and statistics collection. + * + *

Required JobDataMap Entries: + * + *

    + *
  • config - ControllerCommandConfig with scan parameters + *
  • orchestrationProvider - IOrchestrationProvider for job submission + *
  • persistenceProvider - IPersistenceProvider for data storage + *
  • targetListProvider - ITargetListProvider for target acquisition + *
  • denylistProvider - IDenylistProvider for target filtering + *
  • progressMonitor - ProgressMonitor for tracking (if enabled) + *
+ * + *
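The extraction step presumably looks like this (a sketch only; the bullet names above are taken as the JobDataMap keys, which this excerpt does not confirm):

    import org.quartz.JobDataMap;
    import org.quartz.JobExecutionContext;

    // Inside execute(JobExecutionContext context):
    JobDataMap data = context.getMergedJobDataMap();
    ControllerCommandConfig config = (ControllerCommandConfig) data.get("config");
    IOrchestrationProvider orchestration =
            (IOrchestrationProvider) data.get("orchestrationProvider");
    IPersistenceProvider persistence =
            (IPersistenceProvider) data.get("persistenceProvider");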

Execution Steps: + * + *

    + *
  1. Extract configuration and providers from JobDataMap + *
  2. Create and persist BulkScan object with metadata + *
  3. Retrieve target list from configured provider + *
  4. Initialize progress monitoring if enabled + *
  5. Process targets in parallel using JobSubmitter + *
  6. Collect statistics and update BulkScan + *
  7. Handle edge case of zero submitted jobs + *
+ * + *

Error Handling: Any exception during execution is caught, logged, and + * converted to a JobExecutionException with unscheduleAllTriggers=true to prevent retry + * attempts that would likely fail with the same error. + * + * @param context the Quartz job execution context containing configuration and providers + * @throws JobExecutionException if any error occurs during job execution + */ public void execute(JobExecutionContext context) throws JobExecutionException { try { JobDataMap data = context.getMergedJobDataMap(); @@ -102,6 +211,35 @@ public void execute(JobExecutionContext context) throws JobExecutionException { } } + /** + * Functional interface implementation for processing individual target strings into scan jobs. + * + *

The JobSubmitter class implements the Function interface to enable parallel processing of + * target lists using Java streams. Each instance processes target strings by parsing, + * validating, filtering, and either submitting valid jobs or persisting error results. + * + *

Processing Pipeline: + * + *

    + *
  1. Target Parsing - Converts string to ScanTarget with DNS resolution + *
  2. Denylist Checking - Validates target against configured denylist + *
  3. Job Creation - Creates ScanJobDescription with appropriate status + *
  4. Submission/Persistence - Submits valid jobs or persists error results + *
+ * + *

Status Determination: + * + *

    + *
  • TO_BE_EXECUTED - Valid target, submitted to orchestration provider + *
  • DENYLISTED - Target blocked by denylist configuration + *
  • UNRESOLVABLE - DNS resolution failed for hostname + *
  • RESOLUTION_ERROR - Unexpected error during target processing + *
+ * + *
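A rough skeleton of apply() under these rules (illustrative: how fromTargetString signals DENYLISTED or UNRESOLVABLE is not visible in this excerpt, so a wrapping helper is assumed):

    @Override
    public JobStatus apply(String targetString) {
        try {
            // assumed helper wrapping ScanTarget.fromTargetString(...)
            ScanJobDescription job = toJobDescription(targetString);
            if (job.getStatus() == JobStatus.TO_BE_EXECUTED) {
                orchestrationProvider.submitScanJob(job);
            } else {
                // DENYLISTED / UNRESOLVABLE: persist the reason instead of submitting
                persistenceProvider.insertScanResult(new ScanResult(job, null), job);
            }
            return job.getStatus();
        } catch (Exception e) {
            return JobStatus.RESOLUTION_ERROR; // unexpected parsing failure
        }
    }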

Error Persistence: All error cases result in ScanResult objects being + * persisted to maintain complete audit trails and enable analysis of filtering effectiveness + * and target list quality. + */ private static class JobSubmitter implements Function<String, JobStatus> { private final IOrchestrationProvider orchestrationProvider; private final IPersistenceProvider persistenceProvider; @@ -109,6 +247,15 @@ private static class JobSubmitter implements Function<String, JobStatus> { private final BulkScan bulkScan; private final int defaultPort; + /** + * Creates a new JobSubmitter with the required dependencies for target processing. + * + * @param orchestrationProvider provider for submitting valid scan jobs + * @param persistenceProvider provider for storing error results + * @param denylistProvider provider for target filtering + * @param bulkScan the parent bulk scan for job association + * @param defaultPort the default port to use when not specified in target strings + */ public JobSubmitter( IOrchestrationProvider orchestrationProvider, IPersistenceProvider persistenceProvider, @@ -122,6 +269,29 @@ public JobSubmitter( this.defaultPort = defaultPort; } + /** + * Processes a single target string and returns the resulting job status. + * + *

This method implements the core target processing logic, handling parsing, validation, + * filtering, and job submission or error persistence. It uses the + * ScanTarget.fromTargetString method for DNS resolution and denylist checking. + * + *

Processing Flow: + * + *

    + *
  1. Parse target string using ScanTarget.fromTargetString + *
  2. Create ScanJobDescription with parsed target and determined status + *
  3. For valid targets (TO_BE_EXECUTED): submit to orchestration provider + *
  4. For invalid targets: create and persist ScanResult with error details + *
+ * + *

Error Handling: Exceptions during target parsing are caught and + * result in RESOLUTION_ERROR status with the exception persisted in the ScanResult for + * debugging purposes. + * + * @param targetString the target string to process (e.g., "example.com:443") + * @return the JobStatus indicating how the target was processed + */ @Override public JobStatus apply(String targetString) { ScanJobDescription jobDescription; diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java b/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java index 1e40e41..143bb73 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java @@ -11,34 +11,139 @@ import java.io.Serializable; /** - * Metadata about a bulk scan which is serialized to the workers. This is expected to stay the same - * for the duration of a bulk scan. + * Immutable metadata container for bulk scan information distributed to worker instances. + * + *

The BulkScanInfo class serves as a lightweight, serializable representation of essential bulk + * scan metadata that workers need to execute individual scan jobs correctly. It contains only the + * core information required for job execution while avoiding the overhead of transmitting the + * complete BulkScan object to every worker. + * + *

Key design principles: + * + *

    + *
  • Immutability - All fields are final and cannot be modified after creation + *
  • Serialization Efficiency - Lightweight alternative to full BulkScan + * objects + *
  • Essential Data Only - Contains only the minimum information needed by + * workers + *
  • Type Safety - Provides typed access to scanner-specific configurations + *
+ * + *

Contained Information: + * + *

    + *
  • Bulk Scan ID - Unique identifier for traceability and result correlation + *
  • Scan Configuration - Scanner-specific settings and parameters + *
  • Monitoring Flag - Whether progress monitoring is enabled for this scan + *
+ * + *

Lifecycle and Usage: + * + *

    + *
  • Creation - Extracted from BulkScan objects by controllers + *
  • Distribution - Serialized and included in ScanJobDescription messages + *
  • Worker Usage - Used by workers to configure scan execution + *
  • Result Correlation - Links individual results back to bulk scan + *
+ * + *

Immutability Guarantee: The class is designed to remain unchanged for the + * entire duration of a bulk scan operation, ensuring consistent configuration across all + * distributed workers and preventing configuration drift during long-running scans. + * + *

Serialization: Implements Serializable for efficient transmission via message + * queues between controller and worker instances in the distributed architecture. + * + * @see BulkScan + * @see ScanConfig + * @see ScanJobDescription */ public class BulkScanInfo implements Serializable { + /** Unique identifier for the bulk scan operation. */ private final String bulkScanId; + /** Configuration settings for individual scan jobs within this bulk operation. */ private final ScanConfig scanConfig; + /** Flag indicating whether this bulk scan should be monitored for progress tracking. */ private final boolean isMonitored; + /** + * Creates a new bulk scan info object by extracting essential metadata from a bulk scan. + * + *

This constructor extracts only the core information needed by workers for scan execution, + * creating a lightweight representation that can be efficiently serialized and distributed via + * message queues. + * + *

Extracted Information: + * + *

    + *
  • Bulk Scan ID - For result correlation and traceability + *
  • Scan Configuration - Scanner settings and parameters + *
  • Monitoring Status - Whether progress tracking is enabled + *
+ * + * @param bulkScan the source bulk scan to extract metadata from + */ public BulkScanInfo(BulkScan bulkScan) { this.bulkScanId = bulkScan.get_id(); this.scanConfig = bulkScan.getScanConfig(); this.isMonitored = bulkScan.isMonitored(); } + /** + * Gets the unique identifier of the bulk scan this metadata represents. + * + *

This ID is used for correlating individual scan job results back to their originating bulk + * scan operation and for progress tracking. + * + * @return the bulk scan unique identifier + */ public String getBulkScanId() { return bulkScanId; } + /** + * Gets the scan configuration for this bulk scan operation. + * + *

The scan configuration contains scanner-specific settings and parameters that control how + * individual scan jobs should be executed. + * + * @return the scan configuration object + */ public ScanConfig getScanConfig() { return scanConfig; } + /** + * Gets the scan configuration cast to a specific scanner implementation type. + * + *

This method provides type-safe access to scanner-specific configuration implementations, + * allowing workers to access configuration details specific to their scanner type without + * manual casting. + * + *

Usage Example: + * + *

+     * TlsServerScanConfig tlsConfig = info.getScanConfig(TlsServerScanConfig.class);
+     * 
+ * + * @param <T> the specific scan configuration type + * @param clazz the class object of the desired configuration type + * @return the scan configuration cast to the specified type + * @throws ClassCastException if the configuration is not of the specified type + */ public <T extends ScanConfig> T getScanConfig(Class<T> clazz) { return clazz.cast(scanConfig); } + /** + * Checks if progress monitoring is enabled for this bulk scan. + * + *

When monitoring is enabled, workers send completion notifications that are used for + * progress tracking, performance metrics, and completion callbacks. + * + * @return true if progress monitoring is enabled, false otherwise + */ public boolean isMonitored() { return isMonitored; } diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java b/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java index bfaac3a..fab7020 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java @@ -13,6 +13,57 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; +/** + * Thread-safe job status counters for tracking bulk scan progress and completion statistics. + * + *

The BulkScanJobCounters class provides atomic counting and tracking of scan job completion + * status across all worker threads in a distributed TLS scanning operation. It maintains separate + * counters for each job status type and provides thread-safe access to progress metrics used by the + * monitoring and progress tracking systems. + * + *

Key capabilities: + * + *

    + *
  • Thread Safety - Uses AtomicInteger for concurrent access from multiple + * threads + *
  • Status Categorization - Separate counters for each JobStatus enum value + *
  • Total Tracking - Maintains overall completion count across all statuses + *
  • Progress Monitoring - Provides real-time statistics for ProgressMonitor + *
+ * + *

Atomic Operations: + * + *

    + *
  • Status Increment - Thread-safe increment of specific job status counters + *
  • Total Increment - Synchronized increment of overall completion count + *
  • Snapshot Access - Thread-safe reading of current counter values + *
+ * + *

Status Categories Tracked: + * + *

    + *
  • SUCCESS - Scan completed successfully with results + *
  • EMPTY - Scan completed but produced no results + *
  • ERROR - Scanner-level execution failure + *
  • CANCELLED - Scan timed out and was cancelled + *
  • INTERNAL_ERROR - Worker-level processing failure + *
  • SERIALIZATION_ERROR - Result serialization failure + *
  • CRAWLER_ERROR - Unexpected crawler exception + *
+ * + *
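A minimal, self-contained sketch of this counter layout (with a reduced status enum for brevity):

    import java.util.EnumMap;
    import java.util.Map;
    import java.util.concurrent.atomic.AtomicInteger;

    class CountersSketch {
        enum Status { SUCCESS, EMPTY, ERROR, CANCELLED }

        private final Map<Status, AtomicInteger> perStatus = new EnumMap<>(Status.class);
        private final AtomicInteger total = new AtomicInteger();

        CountersSketch() {
            for (Status s : Status.values()) {
                perStatus.put(s, new AtomicInteger()); // one counter per status
            }
        }

        int done(Status s) {
            perStatus.get(s).incrementAndGet(); // per-status tally
            return total.incrementAndGet();     // overall completion count
        }
    }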

Excluded Status: The TO_BE_EXECUTED status is not tracked as it represents + * jobs that haven't completed yet, and this class only tracks completion statistics. + * + *

Performance Metrics: The counters support real-time calculation of completion + * rates, error rates, and progress percentages for monitoring dashboards and ETA calculations. + * + *

Memory Efficiency: Uses EnumMap for optimal memory usage and access speed + * when dealing with the finite set of JobStatus enum values. + * + * Used by ProgressMonitor for tracking bulk scan completion statistics. + * + * @see BulkScan + * @see JobStatus + * @see AtomicInteger + */ public class BulkScanJobCounters { private final BulkScan bulkScan; @@ -20,6 +71,19 @@ public class BulkScanJobCounters { private final AtomicInteger totalJobDoneCount = new AtomicInteger(0); private final Map<JobStatus, AtomicInteger> jobStatusCounters = new EnumMap<>(JobStatus.class); + /** + * Creates a new job counter tracker for the specified bulk scan. + * + *

This constructor initializes atomic counters for all completion status types, excluding + * TO_BE_EXECUTED which represents jobs that haven't completed yet. Each counter starts at zero + * and is thread-safe for concurrent updates. + * + *

Counter Initialization: Creates AtomicInteger instances for each + * JobStatus enum value except TO_BE_EXECUTED, ensuring thread-safe access from multiple worker + * threads and monitoring components. + * + * @param bulkScan the bulk scan operation to track counters for + */ public BulkScanJobCounters(BulkScan bulkScan) { this.bulkScan = bulkScan; for (JobStatus jobStatus : JobStatus.values()) { @@ -30,10 +94,29 @@ public BulkScanJobCounters(BulkScan bulkScan) { } } + /** + * Gets the bulk scan operation that these counters are tracking. + * + * @return the associated bulk scan object + */ public BulkScan getBulkScan() { return bulkScan; } + /** + * Creates a snapshot copy of all job status counters at the current moment. + * + *

This method provides a thread-safe way to get a point-in-time snapshot of all counter values + * without holding locks. The returned map contains the current count for each job status type + * and can be safely used for reporting or persistence without affecting the ongoing counter + * updates. + * + *

Thread Safety: While individual counter reads are atomic, the overall + * snapshot may not be perfectly consistent if updates occur during iteration. However, this + * provides a reasonable approximation for monitoring purposes. + * + * @return a new EnumMap containing current counter values for all job statuses + */ public Map<JobStatus, Integer> getJobStatusCountersCopy() { EnumMap<JobStatus, Integer> ret = new EnumMap<>(JobStatus.class); for (Map.Entry<JobStatus, AtomicInteger> entry : jobStatusCounters.entrySet()) { @@ -42,10 +125,35 @@ public Map<JobStatus, Integer> getJobStatusCountersCopy() { return ret; } + /** + * Gets the current count for a specific job status type. + * + *

This method provides thread-safe access to individual counter values, returning the + * current count for the specified job status. + * + * @param jobStatus the job status type to get the count for + * @return the current count for the specified job status + * @throws NullPointerException if jobStatus is TO_BE_EXECUTED (not tracked) + */ public int getJobStatusCount(JobStatus jobStatus) { return jobStatusCounters.get(jobStatus).get(); } + /** + * Atomically increments the counter for a specific job status and returns the new total. + * + *

This method performs two atomic operations: incrementing the specific job status counter + * and incrementing the overall completion count. The operations are performed in sequence but + * are individually atomic, ensuring thread safety but not perfect consistency between the two + * counters at any given instant. + * + *

Usage: Called by workers when scan jobs complete with a specific status, + * providing real-time updates for progress monitoring and statistics. + * + * @param jobStatus the job status type to increment + * @return the new total count of completed jobs across all status types + * @throws NullPointerException if jobStatus is TO_BE_EXECUTED (not tracked) + */ public int increaseJobStatusCount(JobStatus jobStatus) { jobStatusCounters.get(jobStatus).incrementAndGet(); return totalJobDoneCount.incrementAndGet(); diff --git a/src/main/java/de/rub/nds/crawler/data/ScanConfig.java b/src/main/java/de/rub/nds/crawler/data/ScanConfig.java index 8f91fc2..6b95cb8 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanConfig.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanConfig.java @@ -12,47 +12,175 @@ import de.rub.nds.scanner.core.config.ScannerDetail; import java.io.Serializable; +/** + * Abstract base configuration class for TLS scanner implementations in distributed scanning. + * + *

The ScanConfig class provides the foundation for scanner-specific configuration in the + * TLS-Crawler distributed architecture. It defines common scanning parameters that apply across + * different TLS scanner implementations while allowing concrete subclasses to add scanner-specific + * configuration options. + * + *

Key responsibilities: + * + *

    + *
  • Common Configuration - Provides scanner detail, timeout, and retry + * settings + *
  • Worker Factory - Abstract factory method for creating scan workers + *
  • Serialization - Supports JSON/BSON serialization for distributed messaging + *
  • Type Safety - Generic typing ensures worker compatibility with + * configuration + *
+ * + *

Configuration Parameters: + * + *

    + *
  • Scanner Detail - Controls depth and comprehensiveness of scanning + *
  • Reexecutions - Number of retry attempts for failed scans + *
  • Timeout - Maximum execution time per scan in milliseconds + *
+ * + *

Factory Pattern: The abstract createWorker() method implements the factory + * pattern, allowing each scanner implementation to create appropriately configured worker instances + * that match the scanner's requirements and capabilities. + * + *

Serialization Support: The class implements Serializable and includes a + * no-argument constructor for compatibility with serialization frameworks used in distributed + * messaging and database persistence. + * + *

Extension Points: Subclasses should: + * + *

    + *
  • Add scanner-specific configuration parameters + *
  • Implement the createWorker() method to return appropriate worker instances + *
  • Ensure proper serialization of additional fields + *
  • Maintain compatibility with the distributed architecture + *
+ * + *
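A hypothetical subclass following these points (ExampleScanConfig and ExampleBulkScanWorker are invented names, and the worker constructor signature is assumed):

    public class ExampleScanConfig extends ScanConfig {

        public ExampleScanConfig(ScannerDetail detail, int reexecutions, int timeout) {
            super(detail, reexecutions, timeout);
        }

        @Override
        public BulkScanWorker<?> createWorker(
                String bulkScanID, int parallelConnectionThreads, int parallelScanThreads) {
            // hand this config to a scanner-specific worker
            return new ExampleBulkScanWorker(
                    bulkScanID, this, parallelConnectionThreads, parallelScanThreads);
        }
    }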

Common Usage Pattern: Configuration instances are created by controllers, + * serialized and distributed to workers via message queues, then used to create scanner-specific + * worker instances that execute the actual TLS scans. + * + * @see BulkScanWorker + * @see ScannerDetail + * @see BulkScan + */ public abstract class ScanConfig implements Serializable { + /** Scanner implementation details and configuration parameters. */ private ScannerDetail scannerDetail; + /** Number of retry attempts for failed scan operations. */ private int reexecutions; + /** Maximum execution time in milliseconds for individual scan operations. */ private int timeout; @SuppressWarnings("unused") private ScanConfig() {} + /** + * Creates a new scan configuration with the specified parameters. + * + *

This protected constructor is intended for use by subclasses to initialize the common + * configuration parameters that apply to all scanner implementations. + * + * @param scannerDetail the scanner detail level controlling scan comprehensiveness + * @param reexecutions the number of retry attempts for failed scans + * @param timeout the maximum execution time per scan in milliseconds + */ protected ScanConfig(ScannerDetail scannerDetail, int reexecutions, int timeout) { this.scannerDetail = scannerDetail; this.reexecutions = reexecutions; this.timeout = timeout; } + /** + * Gets the scanner detail level configuration. + * + *

The scanner detail level controls how comprehensive the TLS scanning should be, affecting + * factors like the number of probes executed, the depth of analysis, and the amount of data + * collected. + * + * @return the scanner detail level + */ public ScannerDetail getScannerDetail() { return this.scannerDetail; } + /** + * Gets the number of reexecution attempts for failed scans. + * + *

When a scan fails due to network issues or other transient problems, the scanner will + * retry the scan up to this many times before marking it as failed. + * + * @return the number of retry attempts (typically 3) + */ public int getReexecutions() { return this.reexecutions; } + /** + * Gets the timeout value for individual scan operations. + * + *

This timeout controls how long the scanner will wait for a single scan to complete before + * considering it failed. The timeout applies to the TLS-Scanner execution, not the overall + * worker timeout. + * + * @return the scan timeout in milliseconds (typically 2000ms) + */ public int getTimeout() { return this.timeout; } + /** + * Sets the scanner detail level configuration. + * + * @param scannerDetail the scanner detail level to use + */ public void setScannerDetail(ScannerDetail scannerDetail) { this.scannerDetail = scannerDetail; } + /** + * Sets the number of reexecution attempts for failed scans. + * + * @param reexecutions the number of retry attempts + */ public void setReexecutions(int reexecutions) { this.reexecutions = reexecutions; } + /** + * Sets the timeout value for individual scan operations. + * + * @param timeout the scan timeout in milliseconds + */ public void setTimeout(int timeout) { this.timeout = timeout; } + /** + * Factory method for creating scanner-specific worker instances. + * + *

This abstract method must be implemented by subclasses to create appropriate + * BulkScanWorker instances that are compatible with their specific scanner implementation. The + * worker will use this configuration to control scanning behavior. + * + *

Worker Creation: The created worker should be properly configured with + * the scanner implementation, threading parameters, and this configuration instance. + * + *

Threading Parameters: + * + *

    + *
  • Connection Threads - Shared pool for parallel network connections + *
  • Scan Threads - Number of concurrent scanner instances + *
+ * + * @param bulkScanID the ID of the bulk scan this worker belongs to + * @param parallelConnectionThreads the number of threads for parallel connections + * @param parallelScanThreads the number of parallel scanner instances + * @return a new BulkScanWorker instance configured for this scanner type + */ public abstract BulkScanWorker<?> createWorker( String bulkScanID, int parallelConnectionThreads, int parallelScanThreads); } diff --git a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java index 841b410..9d1cbf3 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java @@ -13,23 +13,99 @@ import java.io.Serializable; import java.util.Optional; +/** + * Data transfer object representing a single TLS scan job in the distributed scanning architecture. + * + *

The ScanJobDescription serves as the primary communication unit between the controller and + * worker nodes in the TLS-Crawler system. It encapsulates all information necessary for a worker to + * execute a TLS scan and store the results, including the scan target, execution status, database + * storage location, and message queue metadata. + * + *

Key responsibilities: + * + *

    + *
  • Job Definition - Specifies what should be scanned (target host/port) + *
  • Status Tracking - Maintains current execution status throughout lifecycle + *
  • Storage Configuration - Defines where results should be persisted + *
  • Message Queue Integration - Handles RabbitMQ delivery tags for + * acknowledgment + *
  • Bulk Scan Coordination - Links individual jobs to their parent bulk scan + *
+ * + *

Lifecycle Management: + * + *

    + *
  • Creation - Controller creates jobs with TO_BE_EXECUTED status + *
  • Distribution - Jobs are serialized and sent via message queue + *
  • Processing - Workers receive, execute, and update status + *
  • Completion - Final status and results are persisted + *
+ * + *

Message Queue Integration: + * + *

    + *
  • Delivery Tag - RabbitMQ message identifier for acknowledgment + *
  • Transient Field - Delivery tag is not serialized (transport-specific) + *
  • Single Assignment - Delivery tag can only be set once per job + *
  • Deserialization Handling - Custom readObject() ensures proper + * initialization + *
+ * + *
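On the worker side, the tag ultimately feeds an acknowledgment roughly like this (a sketch assuming the RabbitMQ Java client; the crawler's orchestration provider wraps this call):

    import java.io.IOException;
    import com.rabbitmq.client.Channel;

    void acknowledge(Channel channel, ScanJobDescription job) throws IOException {
        // the tag was attached exactly once when the message was delivered
        channel.basicAck(job.getDeliveryTag(), /* multiple = */ false);
    }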

Database Storage: + * + *

    + *
  • Database Name - Target database for result storage + *
  • Collection Name - Specific collection/table for this scan type + *
  • Bulk Scan Traceability - Links results back to originating bulk scan + *
+ * + *

Immutability: Most fields are final to ensure job definitions remain + * consistent throughout processing, with only the status field being mutable to track execution + * progress. + * + *

Serialization: The class supports Java serialization for message queue + * transport while handling the transient delivery tag appropriately during deserialization. + * + * @see ScanTarget + * @see BulkScanInfo + * @see BulkScan + * @see JobStatus + */ public class ScanJobDescription implements Serializable { + /** Target specification containing hostname, IP address, and port information. */ private final ScanTarget scanTarget; // Metadata private transient Optional<Long> deliveryTag = Optional.empty(); + /** Current execution status of this scan job (pending, success, error, etc.). */ private JobStatus status; + /** Metadata about the parent bulk scan operation this job belongs to. */ private final BulkScanInfo bulkScanInfo; // data to write back results + /** Database name where scan results should be stored. */ private final String dbName; + /** Collection name within the database for result storage. */ private final String collectionName; + /** + * Creates a new scan job description with explicit database storage configuration. + * + *

This constructor allows precise control over where scan results will be stored by + * specifying the database name and collection name directly. It's primarily used for advanced + * scenarios where custom storage locations are needed. + * + * @param scanTarget the target host and port to scan + * @param bulkScanInfo metadata about the parent bulk scan operation + * @param dbName the database name where results should be stored + * @param collectionName the collection/table name for result storage + * @param status the initial job status (typically TO_BE_EXECUTED) + */ public ScanJobDescription( ScanTarget scanTarget, BulkScanInfo bulkScanInfo, @@ -43,6 +119,17 @@ public ScanJobDescription( this.status = status; } + /** + * Creates a new scan job description from a bulk scan configuration. + * + *

This convenience constructor extracts storage configuration from the bulk scan object, + * using the bulk scan name as the database name and the bulk scan's collection name for result + * storage. This is the most common way to create scan jobs. + * + * @param scanTarget the target host and port to scan + * @param bulkScan the parent bulk scan containing storage and configuration details + * @param status the initial job status (typically TO_BE_EXECUTED) + */ public ScanJobDescription(ScanTarget scanTarget, BulkScan bulkScan, JobStatus status) { this( scanTarget, @@ -52,6 +139,17 @@ public ScanJobDescription(ScanTarget scanTarget, BulkScan bulkScan, JobStatus st status); } + /** + * Custom deserialization method to properly initialize transient fields. + * + *

This method ensures that the transient deliveryTag field is properly initialized to an + * empty Optional after deserialization. The delivery tag is transport-specific and should not + * be serialized across message boundaries. + * + * @param in the object input stream for deserialization + * @throws IOException if an I/O error occurs during deserialization + * @throws ClassNotFoundException if a class cannot be found during deserialization + */ private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { // handle deserialization, cf. https://stackoverflow.com/a/3960558 @@ -59,30 +157,80 @@ private void readObject(java.io.ObjectInputStream in) deliveryTag = Optional.empty(); } + /** + * Gets the scan target containing the host and port to be scanned. + * + * @return the scan target specifying what should be scanned + */ public ScanTarget getScanTarget() { return scanTarget; } + /** + * Gets the database name where scan results should be stored. + * + * @return the target database name for result persistence + */ public String getDbName() { return dbName; } + /** + * Gets the collection/table name where scan results should be stored. + * + * @return the target collection name for result persistence + */ public String getCollectionName() { return collectionName; } + /** + * Gets the current execution status of this scan job. + * + *

The status tracks the job's progress through its lifecycle from initial creation + * (TO_BE_EXECUTED) through completion (SUCCESS, ERROR, etc.). + * + * @return the current job execution status + */ public JobStatus getStatus() { return status; } + /** + * Updates the execution status of this scan job. + * + *

This method is used to track the job's progress as it moves through the execution + * pipeline, from queued to running to completed states. + * + * @param status the new job execution status + */ public void setStatus(JobStatus status) { this.status = status; } + /** + * Gets the RabbitMQ delivery tag for message acknowledgment. + * + *

The delivery tag is used by workers to acknowledge message processing back to the RabbitMQ + * broker. This ensures reliable message delivery in the distributed system. + * + * @return the RabbitMQ delivery tag + * @throws java.util.NoSuchElementException if no delivery tag has been set + */ public long getDeliveryTag() { return deliveryTag.get(); } + /** + * Sets the RabbitMQ delivery tag for this job message. + * + *

This method is called by the orchestration provider when a job message is received from + * the queue. The delivery tag can only be set once to prevent accidental overwrites that could + * break message acknowledgment. + * + * @param deliveryTag the RabbitMQ delivery tag for message acknowledgment + * @throws IllegalStateException if a delivery tag has already been set + */ public void setDeliveryTag(Long deliveryTag) { if (this.deliveryTag.isPresent()) { throw new IllegalStateException("Delivery tag already set"); @@ -90,6 +238,14 @@ public void setDeliveryTag(Long deliveryTag) { this.deliveryTag = Optional.of(deliveryTag); } + /** + * Gets the bulk scan metadata for this individual job. + * + *

The bulk scan info provides traceability back to the parent bulk scan operation and + * contains configuration details needed for job execution. + * + * @return the bulk scan information object + */ public BulkScanInfo getBulkScanInfo() { return bulkScanInfo; } diff --git a/src/main/java/de/rub/nds/crawler/data/ScanResult.java b/src/main/java/de/rub/nds/crawler/data/ScanResult.java index ebd5de5..70af899 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanResult.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanResult.java @@ -14,16 +14,69 @@ import java.util.UUID; import org.bson.Document; +/** + * Immutable container for TLS scan results and associated metadata. + * + *

The ScanResult class encapsulates the complete outcome of a TLS scan operation, including the + * scan target, execution status, result data, and traceability information. It serves as the + * primary data transfer object between the scanning engine, persistence layer, and monitoring + * systems in the distributed TLS-Crawler architecture. + * + *

Key characteristics: + * + *

    + *
  • Immutability - All fields are final except the database-managed ID + *
  • Traceability - Links results back to their originating bulk scan + *
  • Status Tracking - Maintains job execution status for monitoring + *
  • Error Handling - Supports both successful results and exception storage + *
  • Serialization - Compatible with JSON/BSON for database persistence + *
+ * + *

Construction Patterns: + * + *

    + *
  • Normal Constructor - Creates result from completed ScanJobDescription + *
  • Exception Factory - Creates error result via fromException() method + *
  • Validation - Enforces valid status transitions and error states + *
+ * + *

Data Components: + * + *

    + *
  • Unique ID - UUID for database primary key and result identification + *
  • Bulk Scan ID - Reference to the parent bulk scanning campaign + *
  • Scan Target - The host/port combination that was scanned + *
  • Job Status - Final execution status (SUCCESS, ERROR, TIMEOUT, etc.) + *
  • Result Document - BSON document containing scan findings or error details + *
+ * + *
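A short usage sketch of the error path (hypothetical caller; JobStatus.ERROR stands in for any error status):

    void recordFailure(ScanJobDescription job, Exception cause) {
        job.setStatus(JobStatus.ERROR); // fromException requires an error state
        ScanResult errorResult = ScanResult.fromException(job, cause);
        // errorResult.getResult() now carries the exception under the "exception" key
    }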

Status Validation: The class enforces that results are only created from scan + * jobs that have completed execution (not in TO_BE_EXECUTED state) and that error results have + * appropriate error status codes. + * + *

Database Integration: Uses Jackson annotations for JSON serialization and + * MongoDB integration, with the _id field mapping to the database primary key. + * + * @see ScanJobDescription + * @see ScanTarget + * @see JobStatus + * @see BulkScanInfo + */ public class ScanResult implements Serializable { + /** Unique identifier for this scan result record. */ private String id; + /** Identifier of the bulk scan operation that produced this result. */ private final String bulkScan; + /** Target specification that was scanned to produce this result. */ private final ScanTarget scanTarget; + /** Final execution status indicating success, failure, or error condition. */ private final JobStatus jobStatus; + /** MongoDB document containing the actual scan results or error information. */ private final Document result; private ScanResult( @@ -35,6 +88,25 @@ private ScanResult( this.result = result; } + /** + * Creates a new scan result from a completed scan job description and result document. + * + *

This is the primary constructor for creating scan results from successful or failed scan + * operations. It extracts metadata from the scan job description and associates it with the + * result document from the scanning process. + * + *

Status Validation: The constructor validates that the scan job has + * completed execution by checking that its status is not TO_BE_EXECUTED. This ensures that only + * completed scan jobs are converted to results. + * + *

Metadata Extraction: The constructor extracts key information from the + * scan job description including the bulk scan ID, scan target, and execution status to + * populate the result object. + * + * @param scanJobDescription the completed scan job containing metadata and final status + * @param result the BSON document containing scan results, may be null for empty results + * @throws IllegalArgumentException if the scan job is still in TO_BE_EXECUTED state + */ public ScanResult(ScanJobDescription scanJobDescription, Document result) { this( scanJobDescription.getBulkScanInfo().getBulkScanId(), @@ -47,6 +119,25 @@ public ScanResult(ScanJobDescription scanJobDescription, Document result) { } } + /** + * Factory method for creating scan results from exceptions during scan execution. + * + *

This method provides a standardized way to create scan results when scan operations fail + * with exceptions. It creates a result document containing the exception details and ensures + * the scan job description is in an appropriate error state. + * + *

Error State Validation: The method validates that the scan job + * description has an error status (ERROR, CANCELLED, INTERNAL_ERROR, etc.) before creating the + * error result, ensuring consistency between status and result content. + * + *

Exception Handling: The exception is embedded in a BSON document under + * the "exception" key, allowing for structured storage and later analysis of scan failures. + * + * @param scanJobDescription the scan job in an error state + * @param e the exception that caused the scan to fail + * @return a new ScanResult containing the exception details + * @throws IllegalArgumentException if the scan job is not in an error state + */ public static ScanResult fromException(ScanJobDescription scanJobDescription, Exception e) { if (!scanJobDescription.getStatus().isError()) { throw new IllegalArgumentException("ScanJobDescription must be in an error state"); @@ -56,28 +147,75 @@ public static ScanResult fromException(ScanJobDescription scanJobDescription, Ex return new ScanResult(scanJobDescription, errorDocument); } + /** + * Gets the unique identifier for this scan result. + * + *

The ID is a UUID string that serves as the primary key for database storage and unique + * identification of scan results across the system. + * + * @return the unique ID string for this scan result + */ @JsonProperty("_id") public String getId() { return this.id; } + /** + * Sets the unique identifier for this scan result. + * + *

This method is primarily used by serialization frameworks and database drivers to set the + * ID when loading results from persistent storage. + * + * @param id the unique ID string to assign to this scan result + */ @JsonProperty("_id") public void setId(String id) { this.id = id; } + /** + * Gets the bulk scan ID that this result belongs to. + * + *

This provides traceability back to the bulk scanning campaign that generated this + * individual scan result. + * + * @return the bulk scan ID string + */ public String getBulkScan() { return this.bulkScan; } + /** + * Gets the scan target (host and port) that was scanned. + * + * @return the scan target containing hostname and port information + */ public ScanTarget getScanTarget() { return this.scanTarget; } + /** + * Gets the result document containing scan findings or error details. + * + *

For successful scans, this contains the TLS scanner output in BSON format. For failed + * scans created via fromException(), this contains exception details. May be null for scans + * that completed but produced no results. + * + * @return the BSON document containing scan results or error information, may be null + */ public Document getResult() { return this.result; } + /** + * Gets the final execution status of the scan job. + * + *

This status indicates how the scan completed, including success, various error conditions, + * timeouts, and cancellations. + * + * @return the final job status for this scan result + * @see JobStatus + */ public JobStatus getResultStatus() { return jobStatus; } diff --git a/src/main/java/de/rub/nds/crawler/denylist/DenylistFileProvider.java b/src/main/java/de/rub/nds/crawler/denylist/DenylistFileProvider.java index b480d2f..4ad9e22 100644 --- a/src/main/java/de/rub/nds/crawler/denylist/DenylistFileProvider.java +++ b/src/main/java/de/rub/nds/crawler/denylist/DenylistFileProvider.java @@ -26,8 +26,74 @@ import org.apache.logging.log4j.Logger; /** - * Reads the specified denylist file. Supports hostnames, ips and complete subnets as denylist - * entries. + * File-based denylist provider supporting hostnames, IP addresses, and CIDR subnet filtering. + * + *

The DenylistFileProvider implements IDenylistProvider by reading filtering rules from a local + * text file. It supports multiple entry types to provide comprehensive target filtering + * capabilities for compliance, security, and resource management requirements. + * + *

Key features: + * + *

    + *
  • Multiple Formats - Hostnames, individual IPs, and CIDR subnet blocks + *
  • Automatic Classification - Validates and categorizes entries by type + *
  • Performance Optimized - Uses appropriate data structures for fast lookups + *
  • Thread-Safe - Synchronized access for concurrent worker operations + *
+ * + *

Supported Entry Types: + * + *

    + *
  • Domain Names - Exact hostname matching (e.g., "example.com") + *
  • IP Addresses - Individual IPv4/IPv6 addresses (e.g., "192.168.1.1") + *
  • CIDR Blocks - Subnet ranges (e.g., "192.168.0.0/16", "10.0.0.0/8") + *
+ * + *

File Format: Plain text file with one entry per line. Invalid entries are + * silently ignored; comment lines (e.g. lines starting with "#") and empty lines simply fail + * validation and are skipped in the same way. + * + *

Example Denylist File: + * + *

+ * # Private networks
+ * 192.168.0.0/16
+ * 10.0.0.0/8
+ * 172.16.0.0/12
+ *
+ * # Specific domains
+ * government.gov
+ * sensitive.internal
+ *
+ * # Individual IPs
+ * 203.0.113.1
+ * 2001:db8::1
+ * 
+ * + *

Validation and Processing: + * + *

    + *
  • Domain Validation - Uses Apache Commons validator for RFC compliance + *
  • IP Validation - Supports both IPv4 and IPv6 address formats + *
  • CIDR Validation - Validates subnet notation and creates SubnetUtils + * objects + *
  • Error Handling - Invalid entries are logged and ignored + *
+ * + *

Performance Characteristics: + * + *

    + *
  • Domain Lookup - O(1) HashSet lookup for exact hostname matches + *
  • IP Lookup - O(1) HashSet lookup for individual IP addresses + *
  • Subnet Lookup - O(n) linear search through CIDR blocks + *
  • Memory Usage - Efficient storage with type-specific collections + *
+ * + *
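Entry classification along these lines can be sketched with the validators named above (field names are illustrative, and SubnetUtils only covers IPv4 CIDR notation):

    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;
    import org.apache.commons.net.util.SubnetUtils;
    import org.apache.commons.validator.routines.DomainValidator;
    import org.apache.commons.validator.routines.InetAddressValidator;

    class DenylistSketch {
        private final Set<String> ips = new HashSet<>();
        private final Set<String> domains = new HashSet<>();
        private final List<SubnetUtils> subnets = new ArrayList<>();

        void classify(String entry) {
            if (InetAddressValidator.getInstance().isValid(entry)) {
                ips.add(entry);                      // single IPv4/IPv6 address
            } else if (DomainValidator.getInstance().isValid(entry)) {
                domains.add(entry);                  // exact hostname match
            } else if (entry.contains("/")) {
                try {
                    subnets.add(new SubnetUtils(entry)); // CIDR block
                } catch (IllegalArgumentException ignored) {
                    // invalid entries (comments, blanks, garbage) are skipped
                }
            }
        }
    }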

Thread Safety: The isDenylisted method is synchronized to ensure thread-safe + * access during concurrent scanning operations. + * + * @see IDenylistProvider + * @see ScanTarget + * @see SubnetUtils */ public class DenylistFileProvider implements IDenylistProvider { @@ -37,6 +103,15 @@ public class DenylistFileProvider implements IDenylistProvider { private final List<SubnetUtils> cidrDenylist = new ArrayList<>(); private final Set<String> domainDenylistSet = new HashSet<>(); + /** + * Creates a new file-based denylist provider from the specified file. + * + *

The constructor reads and parses the denylist file, categorizing entries by type (domain, + * IP, CIDR) and storing them in optimized data structures for fast lookup. File access errors + * are logged but do not prevent provider creation. + * + * @param denylistFilename the path to the denylist file to read + */ public DenylistFileProvider(String denylistFilename) { List<String> denylist = List.of(); try (Stream<String> lines = Files.lines(Paths.get(denylistFilename))) { diff --git a/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java b/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java index ed1e4c5..1ff0eb6 100644 --- a/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java +++ b/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java @@ -10,7 +10,95 @@ import de.rub.nds.crawler.data.ScanTarget; +/** + * Denylist provider interface for filtering prohibited scan targets in TLS-Crawler operations. + * + *

The IDenylistProvider defines the contract for target filtering and access control in the + * TLS-Crawler system. It enables implementations to block specific hosts, IP ranges, or domains + * from being scanned, supporting compliance requirements, ethical scanning practices, and resource + * management policies. + * + *

Key responsibilities: + * + *

    + *
  • Target Filtering - Determines if scan targets should be excluded + *
  • Policy Enforcement - Implements organizational scanning policies + *
  • Compliance Support - Ensures adherence to legal and ethical guidelines + *
  • Resource Protection - Prevents scanning of sensitive or protected systems + *
+ * + *

Filtering Criteria: + * + *

    + *
  • Hostname Patterns - Exact matches, wildcards, or domain suffixes + *
  • IP Address Ranges - CIDR blocks, subnet ranges, or individual IPs + *
  • Port Restrictions - Specific ports or port ranges to avoid + *
  • Protocol Considerations - Protocol-specific filtering rules + *
+ * + *

Common Use Cases: + * + *

    + *
  • Internal Networks - Block private IP ranges (RFC 1918) + *
  • Government Domains - Exclude .gov, .mil, or country-specific domains + *
  • Critical Infrastructure - Protect essential services and utilities + *
  • Legal Compliance - Honor legal restrictions and opt-out requests + *
+ * + *

Implementation Guidelines: + * + *

    + *
  • Performance - Optimize for fast lookups with large denylists + *
  • Memory Efficiency - Use appropriate data structures for scale + *
  • Thread Safety - Support concurrent access from multiple workers + *
  • Dynamic Updates - Consider support for runtime denylist updates + *
+ * + *

Common Implementations: + * + *

    + *
  • DenylistFileProvider - File-based denylist with various formats + *
  • CIDR Block Providers - IP range filtering with subnet support + *
  • Domain Pattern Providers - Regex or wildcard domain matching + *
  • Composite Providers - Multiple filtering criteria combined + *
+ * + *

Integration Points: Denylist providers are typically used during target + * processing in PublishBulkScanJob and ScanTarget.fromTargetString() to filter targets before scan + * job creation. + * + * @see ScanTarget + * @see ScanTarget#fromTargetString(String, int, IDenylistProvider) + * @see DenylistFileProvider + */ public interface IDenylistProvider { + /** + * Determines if a scan target should be excluded from scanning based on denylist rules. + * + *

This method evaluates the provided scan target against the configured denylist criteria + * and returns true if the target should be blocked from scanning. The implementation should + * consider all relevant target attributes including hostname, IP address, and port when making + * the determination. + * + *

Evaluation Criteria: + * + *

    + *
  • Hostname Matching - Check hostname against domain patterns + *
  • IP Address Filtering - Evaluate IP against CIDR blocks or ranges + *
  • Port Restrictions - Consider port-specific filtering rules + *
  • Combined Rules - Apply multiple criteria as configured + *
+ * + *

Performance Considerations: This method may be called frequently during + * target processing, so implementations should optimize for fast evaluation, especially with + * large denylists. + * + *

Thread Safety: This method must be thread-safe as it will be called + * concurrently during parallel target processing. + * + * @param target the scan target to evaluate against denylist rules + * @return true if the target is denylisted and should not be scanned, false otherwise + */ boolean isDenylisted(ScanTarget target); } diff --git a/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java b/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java index 9af1769..f157aa2 100644 --- a/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java +++ b/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java @@ -10,8 +10,97 @@ import de.rub.nds.crawler.data.ScanJobDescription; +/** + * Functional interface for consuming scan job completion notifications in distributed TLS scanning. + * + *

The DoneNotificationConsumer defines the contract for controllers and monitoring systems to + * receive notifications when scan jobs complete processing. It enables real-time progress tracking, + * statistics collection, and completion event handling in the TLS-Crawler distributed architecture. + * + *

Key characteristics: + * + *

    + *
  • Functional Interface - Single method interface suitable for lambda + * expressions + *
  • Event-Driven - Called asynchronously when scan jobs complete + *
  • Progress Monitoring - Primary mechanism for tracking bulk scan progress + *
  • Statistics Collection - Enables real-time performance and completion + * metrics + *
+ * + *

Usage Scenarios: + * + *

    + *
  • Progress Tracking - ProgressMonitor uses this to track scan completion + *
  • Statistics Updates - Update completion counters and performance metrics + *
  • ETA Calculation - Calculate estimated time to completion + *
  • Completion Detection - Detect when bulk scans finish + *
+ * + *

Implementation Pattern: + * + *

    + *
  1. Notification Reception - Receive completion event from orchestration + * provider + *
  2. Status Processing - Extract and categorize job completion status + *
  3. Statistics Update - Update counters and performance metrics + *
  4. Progress Logging - Log progress information and ETAs + *
+ * + *

Thread Safety: Implementations must be thread-safe as they may be called + * concurrently by multiple message handling threads from the orchestration provider. + * + *

Consumer Tag Usage: The consumer tag parameter identifies the specific + * message queue consumer that delivered the notification, useful for debugging and routing. + * + *

Typical Usage: + * + *

{@code
+ * // Lambda implementation
+ * DoneNotificationConsumer consumer = (tag, job) -> {
+ *     updateProgress(job.getStatus());
+ *     logCompletion(job);
+ * };
+ *
+ * // Method reference
+ * DoneNotificationConsumer consumer = this::handleCompletion;
+ *
+ * // Registration with orchestration provider
+ * orchestrationProvider.registerDoneNotificationConsumer(bulkScan, consumer);
+ * }
+ * + * @see ScanJobDescription + * @see IOrchestrationProvider#registerDoneNotificationConsumer(de.rub.nds.crawler.data.BulkScan, + * DoneNotificationConsumer) Typically implemented by + * ProgressMonitor.BulkscanMonitor.consumeDoneNotification method. + */ @FunctionalInterface public interface DoneNotificationConsumer { + /** + * Processes a scan job completion notification from the orchestration provider. + * + *

This method is called asynchronously by the orchestration provider when a scan job + * completes processing. The implementation should update progress tracking, statistics, and any + * monitoring systems based on the completed job information. + * + *

Processing Responsibilities: + * + *

    + *
  • Status Tracking - Record job completion status (SUCCESS, ERROR, etc.) + *
  • Progress Updates - Update completion counters and percentages + *
  • Performance Metrics - Calculate timing and throughput statistics + *
  • Completion Detection - Detect when bulk scan operations finish + *
+ * + *

Thread Safety: This method may be called concurrently from multiple + * threads, so implementations must handle synchronization appropriately. + * + *

Exception Handling: Implementations should catch all exceptions + * internally to prevent disruption of the notification delivery system. + * + * @param consumerTag the message queue consumer tag that delivered this notification + * @param scanJobDescription the completed scan job with final status and metadata + */ void consumeDoneNotification(String consumerTag, ScanJobDescription scanJobDescription); } diff --git a/src/main/java/de/rub/nds/crawler/orchestration/IOrchestrationProvider.java b/src/main/java/de/rub/nds/crawler/orchestration/IOrchestrationProvider.java index c39f41b..e92ae6e 100644 --- a/src/main/java/de/rub/nds/crawler/orchestration/IOrchestrationProvider.java +++ b/src/main/java/de/rub/nds/crawler/orchestration/IOrchestrationProvider.java @@ -12,45 +12,177 @@ import de.rub.nds.crawler.data.ScanJobDescription; /** - * Interface for the orchestration provider. Its job is to accept jobs from the controller and to - * submit them to the worker. The provider may open a connection in its constructor, which must be - * closed in {@link #closeConnection()}. + * Orchestration provider interface for distributed job coordination in TLS-Crawler. + * + *

The IOrchestrationProvider defines the contract for coordinating scan job distribution between + * controllers and workers in the TLS-Crawler distributed architecture. It abstracts the underlying + * message queue implementation (RabbitMQ, etc.) and provides a reliable communication mechanism for + * job submission, consumption, and completion notifications. + * + *

Key responsibilities: + * + *

    + *
  • Job Distribution - Delivers scan jobs from controllers to available + * workers + *
  • Load Balancing - Distributes work across multiple worker instances + *
  • Reliable Messaging - Ensures job delivery with acknowledgment mechanisms + *
  • Progress Monitoring - Provides completion notifications for tracking + *
  • Resource Management - Manages connections and cleanup for long-running + * operations + *
+ * + *

Message Flow Architecture: + * + *

    + *
  1. Job Submission - Controllers submit jobs via submitScanJob() + *
  2. Job Distribution - Provider routes jobs to registered consumers + *
  3. Job Processing - Workers receive jobs through registered consumers + *
  4. Completion Notification - Workers notify completion via + * notifyOfDoneScanJob() + *
  5. Progress Tracking - Completion events are forwarded to monitoring systems + *
+ * + *

Consumer Registration: + * + *

    + *
  • Scan Job Consumers - Workers register to receive scan jobs + *
  • Done Notification Consumers - Controllers register for completion events + *
  • Prefetch Control - Configurable flow control for consumer capacity + *
+ * + *

Reliability Features: + * + *

    + *
  • Acknowledgment - Jobs must be explicitly acknowledged after processing + *
  • Delivery Guarantees - Ensures jobs are not lost during processing + *
  • Error Handling - Supports requeue and retry mechanisms + *
  • Connection Recovery - Resilient to network interruptions + *
+ * + *

Implementation Notes: + * + *

    + *
  • Connection Management - Providers may establish connections in constructor + *
  • Resource Cleanup - Must implement closeConnection() for proper cleanup + *
  • Thread Safety - Should support concurrent access from multiple threads + *
  • Configuration - Should support flexible connection and routing + * configuration + *
+ * + *

Common Implementations: + * + *

    + *
  • RabbitMqOrchestrationProvider - RabbitMQ-based message queue orchestration + *
  • Local Providers - In-memory implementations for testing and development + *
  • Cloud Providers - Integration with cloud messaging services + *
+ * + * @see ScanJobDescription + * @see ScanJobConsumer + * @see DoneNotificationConsumer + * @see BulkScan + * @see RabbitMqOrchestrationProvider */ public interface IOrchestrationProvider { /** - * Submit a scan job to the orchestration provider. + * Submits a scan job for distribution to available worker instances. + * + *

This method queues a scan job for processing by worker nodes, using the underlying message + * queue system to ensure reliable delivery. The job will be routed to an available worker based + * on the provider's load balancing strategy. + * + *
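For illustration, a controller-side submission loop might look like this sketch (the + * {@code targets} list and the {@code toJobDescription} helper are assumed, not part of this + * interface): + * + *
{@code
+ * for (ScanTarget target : targets) {
+ *     ScanJobDescription job = toJobDescription(target, bulkScan); // hypothetical helper
+ *     orchestrationProvider.submitScanJob(job);
+ * }
+ * }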

Delivery Behavior: The implementation should ensure that jobs are + * persistently queued and will be delivered even if no workers are currently available, + * supporting fault-tolerant distributed processing. * - * @param scanJobDescription The scan job to be submitted. + * @param scanJobDescription the scan job to submit for processing + * @throws RuntimeException if the job cannot be submitted (implementation-specific) */ void submitScanJob(ScanJobDescription scanJobDescription); /** - * Register a scan job consumer. It has to confirm that the job is done using {@link - * #notifyOfDoneScanJob(ScanJobDescription)}. + * Registers a scan job consumer to receive jobs from the orchestration provider. * - * @param scanJobConsumer The scan job consumer to be registered. - * @param prefetchCount Number of unacknowledged jobs that may be sent to the consumer. + *

This method registers a worker to receive scan jobs from the message queue. The consumer + * will be called for each available job, and must acknowledge completion using {@link + * #notifyOfDoneScanJob(ScanJobDescription)} to ensure reliable processing. + * + *

Flow Control: The prefetchCount parameter controls how many + * unacknowledged jobs can be delivered to this consumer simultaneously, enabling back-pressure + * management and preventing worker overload. + * + *
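A sketch of a worker registering itself with a bounded prefetch window (the {@code worker} + * instance and the count of 10 are illustrative): + * + *
{@code
+ * // at most 10 unacknowledged jobs are in flight to this worker at once
+ * orchestrationProvider.registerScanJobConsumer(worker::handleScanJob, 10);
+ * }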

Consumer Lifecycle: The consumer remains active until the connection is + * closed or the application terminates. Implementations should handle consumer failures + * gracefully and support reregistration. + * + * @param scanJobConsumer the functional interface to handle incoming scan jobs + * @param prefetchCount maximum number of unacknowledged jobs to deliver simultaneously + * @throws RuntimeException if the consumer cannot be registered (implementation-specific) */ void registerScanJobConsumer(ScanJobConsumer scanJobConsumer, int prefetchCount); /** - * Register a done notification consumer. It is called when a scan job is done. + * Registers a completion notification consumer for a specific bulk scan operation. + * + *

This method enables controllers to receive notifications when individual scan jobs within + * a bulk scan complete. The consumer will be called for each job completion, enabling real-time + * progress tracking and statistics collection. * - * @param bulkScan The bulk scan for which the consumer accepts notifications. - * @param doneNotificationConsumer The done notification consumer to be registered. + *

Bulk Scan Scope: The consumer is registered specifically for the provided + * bulk scan and will only receive notifications for jobs belonging to that bulk scan operation. + * + *
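A sketch of a controller wiring a progress monitor to a single bulk scan (the + * {@code progressMonitor} object is assumed): + * + *
{@code
+ * // only completions belonging to this bulk scan reach the consumer
+ * orchestrationProvider.registerDoneNotificationConsumer(
+ *         bulkScan, progressMonitor::consumeDoneNotification);
+ * }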

Monitoring Integration: This mechanism is typically used by + * ProgressMonitor instances to track scan progress and calculate completion statistics. + * + * @param bulkScan the bulk scan operation to monitor for completion notifications + * @param doneNotificationConsumer the consumer to handle job completion events + * @throws RuntimeException if the consumer cannot be registered (implementation-specific) */ void registerDoneNotificationConsumer( BulkScan bulkScan, DoneNotificationConsumer doneNotificationConsumer); /** - * Send an acknowledgment that a scan job received by a scan consumer is finished. + * Acknowledges completion of a scan job and triggers completion notifications. + * + *

This method performs dual functions: it acknowledges successful processing of a scan job + * to the message queue system, and it publishes completion notifications to registered done + * notification consumers for progress monitoring. + * + *

Acknowledgment Behavior: The method confirms to the message queue that + * the job has been successfully processed and can be removed from the queue, preventing + * redelivery to other workers. * - * @param scanJobDescription The scan job that is finished. Its status should reflect the status - * of the results. + *

Notification Publishing: Simultaneously publishes the completion event to + * any registered done notification consumers, enabling real-time progress tracking and + * statistics updates. + * + *
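A sketch of the worker-side acknowledgment path (the status setter is assumed to exist on + * the job description): + * + *
{@code
+ * scanJobDescription.setStatus(JobStatus.SUCCESS); // reflect the real outcome first
+ * orchestrationProvider.notifyOfDoneScanJob(scanJobDescription);
+ * }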

Status Consistency: The scan job description's status field should + * accurately reflect the final processing outcome before calling this method. + * + * @param scanJobDescription the completed scan job with final status information + * @throws RuntimeException if acknowledgment or notification fails (implementation-specific) */ void notifyOfDoneScanJob(ScanJobDescription scanJobDescription); - /** Close any connection to the orchestration provider, freeing resources. */ + /** + * Closes connections and releases resources used by the orchestration provider. + * + *

This method performs cleanup of all resources including message queue connections, thread + * pools, and any other resources allocated during provider operation. It should be called when + * the application is shutting down or when the provider is no longer needed. + * + *
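A typical shutdown sequence, as a sketch ({@code runBulkScan} is a hypothetical + * application method): + * + *
{@code
+ * try {
+ *     runBulkScan();
+ * } finally {
+ *     orchestrationProvider.closeConnection(); // always release queue resources
+ * }
+ * }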

Cleanup Responsibilities: + * + *

    + *
  • Connection Closure - Close message queue connections gracefully + *
  • Consumer Cleanup - Unregister all active consumers + *
  • Resource Release - Free any allocated resources (threads, memory) + *
  • State Cleanup - Clear any internal state or caches + *
+ * + *

Thread Safety: This method should be safe to call from any thread and + * should handle concurrent calls gracefully. + */ void closeConnection(); } diff --git a/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java b/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java index 628b0ee..85c511a 100644 --- a/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java +++ b/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java @@ -10,8 +10,88 @@ import de.rub.nds.crawler.data.ScanJobDescription; +/** + * Functional interface for consuming scan jobs from the orchestration provider in distributed TLS + * scanning. + * + *

The ScanJobConsumer defines the contract for worker instances to receive and process scan jobs + * from the message queue system. It serves as the callback mechanism that enables asynchronous job + * processing in the TLS-Crawler distributed architecture. + * + *

Key characteristics: + * + *

    + *
  • Functional Interface - Single method interface suitable for lambda + * expressions + *
  • Asynchronous Processing - Called by orchestration provider when jobs + * arrive + *
  • Worker Integration - Typically implemented by Worker class instances + *
  • Acknowledgment Responsibility - Must ensure job completion is acknowledged + *
+ * + *

Implementation Pattern: + * + *

    + *
  1. Job Reception - Receive ScanJobDescription from orchestration provider + *
  2. Processing - Execute the TLS scan based on job configuration + *
  3. Result Handling - Store results and handle any errors + *
  4. Acknowledgment - Notify orchestration provider of completion + *
+ * + *

Thread Safety: Implementations must be thread-safe as they may be called + * concurrently by the orchestration provider's message handling threads. + * + *

Error Handling: Implementations should handle all exceptions internally and + * ensure proper acknowledgment even in error scenarios to prevent message redelivery issues. + * + *

Typical Usage: + * + *

{@code
+ * // Lambda implementation
+ * ScanJobConsumer consumer = jobDescription -> {
+ *     // Process the scan job
+ *     processJob(jobDescription);
+ * };
+ *
+ * // Method reference
+ * ScanJobConsumer consumer = this::handleScanJob;
+ *
+ * // Registration with orchestration provider
+ * orchestrationProvider.registerScanJobConsumer(consumer, prefetchCount);
+ * }
+ * + * @see ScanJobDescription + * @see IOrchestrationProvider#registerScanJobConsumer(ScanJobConsumer, int) Typically implemented + * by Worker.handleScanJob(ScanJobDescription) method. + */ @FunctionalInterface public interface ScanJobConsumer { + /** + * Processes a scan job received from the orchestration provider. + * + *

This method is called asynchronously by the orchestration provider when a scan job becomes + * available for processing. The implementation must handle the complete job lifecycle including + * execution, result storage, and acknowledgment. + * + *

Processing Responsibilities: + * + *

    + *
  • Job Execution - Perform the TLS scan based on job configuration + *
  • Result Storage - Persist scan results to the configured database + *
  • Error Handling - Handle and categorize any processing errors + *
  • Acknowledgment - Notify completion via orchestration provider + *
+ * + *

Thread Safety: This method may be called concurrently from multiple + * threads, so implementations must be thread-safe or handle synchronization appropriately. + * + *
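An exception-safe sketch (the {@code scan}, {@code persist}, and {@code ack} helpers are + * illustrative, not part of this interface): + * + *
{@code
+ * public void consumeScanJob(ScanJobDescription job) {
+ *     try {
+ *         persist(scan(job));             // run the scan and store the result
+ *     } catch (Exception e) {
+ *         LOGGER.error("Scan failed", e); // never let exceptions escape
+ *     } finally {
+ *         ack(job);                       // always acknowledge completion
+ *     }
+ * }
+ * }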

Exception Handling: Implementations should catch all exceptions + * internally and not allow them to propagate, as uncaught exceptions may disrupt the message + * queue processing loop. + * + * @param scanJobDescription the scan job to process, containing target and configuration + * details + */ void consumeScanJob(ScanJobDescription scanJobDescription); } diff --git a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java index 50e3626..b7c9bef 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java @@ -13,31 +13,140 @@ import de.rub.nds.crawler.data.ScanResult; /** - * Persistence provider interface. Exposes methods to write out the different stages of a task to a - * file/database/api. + * Persistence provider interface for database operations in the TLS-Crawler distributed + * architecture. + * + *

The IPersistenceProvider defines the contract for storing and retrieving scan data throughout + * the TLS-Crawler workflow. It abstracts the underlying storage implementation (MongoDB, file + * system, etc.) and provides a consistent interface for controllers and workers to persist scan + * metadata, results, and progress information. + * + *
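As a rough sketch of the storage workflow described below (the final bookkeeping call is + * hypothetical): + * + *
{@code
+ * persistenceProvider.insertBulkScan(bulkScan);      // assigns the bulk scan ID
+ * // ... workers execute jobs ...
+ * persistenceProvider.insertScanResult(result, job); // once per finished job
+ * bulkScan.setFinished(true);                        // stand-in for final bookkeeping
+ * persistenceProvider.updateBulkScan(bulkScan);      // write final statistics
+ * }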

Key responsibilities: + * + *

    + *
  • Scan Result Storage - Persists individual scan results with metadata + *
  • Bulk Scan Management - Handles bulk scan lifecycle (create, update) + *
  • Data Consistency - Ensures reliable storage across distributed operations + *
  • Storage Abstraction - Provides database-agnostic persistence interface + *
+ * + *

Implementation Requirements: + * + *

    + *
  • Thread Safety - Must support concurrent access from multiple worker + * threads + *
  • Error Handling - Should handle storage failures gracefully with + * appropriate exceptions + *
  • ID Generation - Must assign unique IDs to BulkScan objects during + * insertion + *
  • Data Integrity - Ensure scan results are correctly associated with their + * bulk scans + *
+ * + *

Storage Workflow: + * + *

    + *
  1. Bulk Scan Creation - Controller creates bulk scan with insertBulkScan() + *
  2. Job Processing - Workers store individual results with insertScanResult() + *
  3. Progress Updates - Controller updates bulk scan metadata with + * updateBulkScan() + *
  4. Completion - Final statistics and status updates via updateBulkScan() + *
+ * + *

Data Relationships: + * + *

    + *
  • BulkScan - Parent container with metadata and aggregate statistics + *
  • ScanResult - Individual scan outcomes linked to bulk scan via ID + *
  • ScanJobDescription - Job metadata for result correlation and debugging + *
+ * + *

Common Implementations: + * + *

    + *
  • MongoPersistenceProvider - MongoDB-based storage with JSON serialization + *
  • File-based Providers - Local file system storage for development/testing + *
  • API Providers - REST API integration for external systems + *
+ * + * @see BulkScan + * @see ScanResult + * @see ScanJobDescription + * @see MongoPersistenceProvider */ public interface IPersistenceProvider { /** - * Insert a scan result into the database. + * Persists a scan result and its associated job metadata to the database. * - * @param scanResult The scan result to insert. - * @param job The job that was used to create the scan result. + *

This method stores the complete outcome of a scan job execution, including the scan + * findings, execution status, and metadata for traceability. The implementation must ensure the + * result is correctly linked to its parent bulk scan. + * + *

Storage Requirements: + * + *

    + *
  • Result Data - Store the complete scan result document + *
  • Job Metadata - Include job description for debugging and audit + *
  • Bulk Scan Link - Maintain relationship to parent bulk scan + *
  • Timestamp - Record insertion time for analysis + *
+ * + *

Thread Safety: This method must be thread-safe as it will be called + * concurrently by multiple worker threads processing scan jobs. + * + * @param scanResult the scan result containing findings and execution status + * @param job the job description containing metadata and configuration details + * @throws RuntimeException if the result cannot be persisted (implementation-specific) */ void insertScanResult(ScanResult scanResult, ScanJobDescription job); /** - * Insert a bulk scan into the database. This is used to store metadata about the bulk scan. - * This adds an ID to the bulk scan. + * Creates a new bulk scan record in the database and assigns a unique identifier. + * + *

This method initializes a bulk scan operation by persisting its configuration and metadata + * to the database. The implementation must generate and assign a unique ID to the bulk scan + * object, which will be used to correlate individual scan results. + * + *

Initialization Responsibilities: * - * @param bulkScan The bulk scan to insert. + *

    + *
  • ID Assignment - Generate and set unique bulk scan identifier + *
  • Metadata Storage - Persist scan configuration and parameters + *
  • Timestamp Recording - Set creation timestamp for tracking + *
  • Initial Status - Establish starting state for monitoring + *
+ * + *

ID Generation: The implementation must ensure the generated ID is unique + * across all bulk scans and suitable for use as a foreign key reference in scan result records. + * + * @param bulkScan the bulk scan object to persist (ID will be assigned) + * @throws RuntimeException if the bulk scan cannot be created (implementation-specific) */ void insertBulkScan(BulkScan bulkScan); /** - * Update a bulk scan in the database. This updated the whole bulk scan. + * Updates an existing bulk scan record with current progress and statistics. + * + *

This method replaces the existing bulk scan record with updated information, typically + * called to record progress updates, final statistics, or completion status. The bulk scan ID + * must remain unchanged during updates. + * + *

Update Scenarios: + * + *

    + *
  • Progress Updates - Job submission counts and statistics + *
  • Status Changes - Monitoring state and completion flags + *
  • Final Statistics - Success/error counts and performance metrics + *
  • Completion - End timestamp and notification status + *
+ * + *

Consistency Requirements: The implementation should ensure that updates + * are atomic and maintain data consistency, especially when called concurrently with scan + * result insertions. * - * @param bulkScan The bulk scan to update. + * @param bulkScan the bulk scan object with updated information + * @throws RuntimeException if the bulk scan cannot be updated (implementation-specific) */ void updateBulkScan(BulkScan bulkScan); } diff --git a/src/main/java/de/rub/nds/crawler/targetlist/CruxListProvider.java b/src/main/java/de/rub/nds/crawler/targetlist/CruxListProvider.java index b979ae8..79e7747 100644 --- a/src/main/java/de/rub/nds/crawler/targetlist/CruxListProvider.java +++ b/src/main/java/de/rub/nds/crawler/targetlist/CruxListProvider.java @@ -14,8 +14,51 @@ import java.util.stream.Stream; /** - * Target list provider that downloads the most recent crux list (...) and extracts the top x hosts from it. + * Chrome UX Report (CrUX) target list provider for distributed TLS scanning operations. + * + *

The CruxListProvider downloads and processes the most recent Chrome User Experience Report + * data to extract popular website targets for TLS security scanning. It provides access to + * real-world web traffic patterns based on actual Chrome browser usage statistics. + * + *

Key features: + * + *

    + *
  • Real User Data - Based on actual Chrome browser navigation patterns + *
  • Current Rankings - Downloads the most recent CrUX data available + *
  • Configurable Size - Supports various list sizes from 1K to 1M targets + *
  • HTTPS Focus - Filters for HTTPS-enabled websites only + *
+ * + *

Data Source: The provider downloads compressed CSV data from the official + * CrUX Top Lists repository maintained by zakird on GitHub. This data is updated regularly to + * reflect current web usage patterns. + * + *

Processing Pipeline: + * + *

    + *
  1. Download - Fetch current.csv.gz from GitHub repository + *
  2. Extract - Decompress GZIP data to CSV format + *
  3. Filter - Select only HTTPS websites within rank threshold + *
  4. Transform - Extract hostnames by removing protocol prefixes + *
+ * + *

CSV Format: Each line contains "protocol://domain, crux_rank" where the rank + * indicates popularity based on Chrome usage statistics. + * + *

Target Selection: Only HTTPS websites whose rank is less than or equal to the + * configured number are included, ensuring TLS-capable targets for security scanning. + * + *
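A sketch of how such lines could be filtered and transformed (simplified; the actual + * implementation is not shown in this patch): + * + *
{@code
+ * // "https://example.com,1234" is kept as "example.com" when 1234 <= number
+ * lines.filter(line -> line.startsWith("https://"))
+ *      .filter(line -> Integer.parseInt(line.substring(line.lastIndexOf(',') + 1).trim()) <= number)
+ *      .map(line -> line.substring("https://".length(), line.lastIndexOf(',')))
+ *      .collect(Collectors.toList());
+ * }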

Usage Example: + * + *

{@code
+ * CruxListProvider provider = new CruxListProvider(CruxListNumber.TOP_10K);
+ * List targets = provider.getTargetList();
+ * // Returns up to 10,000 popular HTTPS-enabled hostnames
+ * }
+ * + * @see ZipFileProvider + * @see CruxListNumber + * @see ITargetListProvider */ public class CruxListProvider extends ZipFileProvider { @@ -24,6 +67,15 @@ public class CruxListProvider extends ZipFileProvider { private static final String ZIP_FILENAME = "current.csv.gz"; private static final String FILENAME = "current.csv"; + /** + * Creates a new CrUX list provider for the specified target list size. + * + *

The constructor configures the provider to download and process the current CrUX data, + * extracting up to the specified number of top-ranked HTTPS websites for TLS scanning + * operations. + * + * @param cruxListNumber the desired list size determining maximum number of targets + */ public CruxListProvider(CruxListNumber cruxListNumber) { super(cruxListNumber.getNumber(), SOURCE, ZIP_FILENAME, FILENAME, "Crux"); } diff --git a/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java b/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java index 5e4662f..311b428 100644 --- a/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java +++ b/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java @@ -10,7 +10,81 @@ import java.util.List; +/** + * Target list provider interface for supplying scan targets to TLS-Crawler operations. + * + *

The ITargetListProvider defines the contract for obtaining lists of scan targets from various + * sources including files, web services, databases, and curated lists. It abstracts the target + * acquisition mechanism and provides a consistent interface for controllers to obtain targets for + * bulk scanning operations. + * + *

Key responsibilities: + * + *

    + *
  • Target Acquisition - Retrieves targets from the configured source + *
  • Format Standardization - Provides targets in consistent string format + *
  • Source Abstraction - Hides implementation details of target sources + *
  • Error Handling - Manages source-specific failures gracefully + *
+ * + *

Target Format: + * + *

    + *
  • Hostname Only - "example.com" (uses default port) + *
  • Hostname with Port - "example.com:443" (explicit port) + *
  • IP Address - "192.168.1.1" or "192.168.1.1:8443" + *
  • IPv6 Address - "[::1]" or "[::1]:443" + *
+ * + *

Common Implementations: + * + *

    + *
  • TargetFileProvider - Reads targets from local files + *
  • TrancoListProvider - Fetches targets from Tranco web ranking + *
  • CruxListProvider - Uses Google Chrome UX Report data + *
  • TrancoEmailListProvider - Extracts MX records from Tranco data + *
  • ZipFileProvider - Reads from compressed archive files + *
+ * + *

Implementation Guidelines: + * + *

    + *
  • Error Resilience - Should handle network failures and missing sources + *
  • Performance - Consider caching for expensive operations + *
  • Memory Efficiency - Stream large lists when possible + *
  • Format Validation - Ensure returned targets are well-formed + *
+ * + *

Usage Pattern: Target list providers are typically configured based on + * command-line arguments and used by controllers during bulk scan initialization to obtain the + * complete list of targets for processing. + * + * @see TargetFileProvider + * @see TrancoListProvider + * @see CruxListProvider Configured via ControllerCommandConfig.getTargetListProvider() method. + */ public interface ITargetListProvider { + /** + * Retrieves the complete list of scan targets from the configured source. + * + *

This method fetches all available targets from the provider's source and returns them as a + * list of string representations. The implementation should handle any necessary data + * retrieval, parsing, and formatting to produce valid target strings. + * + *

Target Format: Each string should represent a valid scan target in + * hostname[:port] format, suitable for parsing by ScanTarget.fromTargetString(). + * + *
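Because this is a single-method interface, a minimal conforming implementation can be a + * lambda, for illustration: + * + *
{@code
+ * ITargetListProvider fixedList = () -> List.of("example.com:443", "192.168.1.1");
+ * }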

Error Handling: Implementations should handle source-specific errors + * (network failures, file not found, etc.) and either throw appropriate exceptions or return + * empty lists based on the error recovery strategy. + * + *

Performance Considerations: This method may perform expensive operations + * like network requests or large file parsing. Consider implementing caching or streaming + * strategies for large target lists. + * + * @return a list of target strings in hostname[:port] format + * @throws RuntimeException if targets cannot be retrieved (implementation-specific) + */ List getTargetList(); } diff --git a/src/main/java/de/rub/nds/crawler/targetlist/TargetFileProvider.java b/src/main/java/de/rub/nds/crawler/targetlist/TargetFileProvider.java index 0bffaa7..51036f2 100644 --- a/src/main/java/de/rub/nds/crawler/targetlist/TargetFileProvider.java +++ b/src/main/java/de/rub/nds/crawler/targetlist/TargetFileProvider.java @@ -17,16 +17,113 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +/** + * File-based target list provider for reading scan targets from local text files. + * + *

The TargetFileProvider implements ITargetListProvider to supply scan targets by reading from a + * local text file. It supports common file formats with comment filtering and empty line handling, + * making it suitable for managing static target lists in development, testing, and production + * environments. + * + *

Key features: + * + *

    + *
  • File-Based Storage - Reads targets from local filesystem files + *
  • Comment Support - Filters out lines starting with '#' character + *
  • Empty Line Handling - Automatically removes empty lines + *
  • Stream Processing - Uses Java streams for efficient file processing + *
+ * + *

File Format: + * + *

    + *
  • One Target Per Line - Each line contains a single target specification + *
  • Comment Lines - Lines starting with '#' are ignored + *
  • Empty Lines - Blank lines are automatically filtered out + *
  • Target Format - hostname[:port] format (e.g., "example.com:443") + *
+ * + *

Example File Content: + * + *

+ * # TLS Crawler Target List
+ * # Production servers
+ * example.com:443
+ * api.example.com
+ * secure.example.org:8443
+ *
+ * # Test servers
+ * test.example.com:443
+ * 
+ * + *

Error Handling: File access errors (file not found, permission denied, I/O + * errors) are wrapped in RuntimeException with descriptive messages for troubleshooting. + * + *

Performance Characteristics: + * + *

    + *
  • Memory Efficient - Uses streams to process large files + *
  • Fast Processing - Efficient filtering and collection operations + *
  • One-Time Read - File is read completely on each getTargetList() call + *
+ * + *

Usage Example: + * + *

{@code
+ * TargetFileProvider provider = new TargetFileProvider("/path/to/targets.txt");
+ * List targets = provider.getTargetList();
+ * }
+ * + * @see ITargetListProvider Configured via ControllerCommandConfig.getTargetListProvider() method. + */ public class TargetFileProvider implements ITargetListProvider { private static final Logger LOGGER = LogManager.getLogger(); private String filename; + /** + * Creates a new target file provider for the specified file path. + * + *

The constructor stores the file path for later use when getTargetList() is called. The + * file is not validated or accessed during construction, allowing for flexible deployment + * scenarios where the file may be created after the provider is instantiated. + * + * @param filename the path to the target list file to read + */ public TargetFileProvider(String filename) { this.filename = filename; } + /** + * Reads and returns the complete list of scan targets from the configured file. + * + *

This method opens the file, reads all lines, and filters out comments (lines starting with + * '#') and empty lines. The remaining lines are returned as scan targets in the order they + * appear in the file. + * + *
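The described behavior corresponds to a stream pipeline along these lines (a sketch, not + * the verbatim implementation): + * + *
{@code
+ * try (Stream<String> lines = Files.lines(Paths.get(filename))) {
+ *     return lines.filter(line -> !line.startsWith("#"))
+ *                 .filter(line -> !line.isEmpty())
+ *                 .collect(Collectors.toList());
+ * }
+ * }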

Processing Steps: + * + *

    + *
  1. Open file using Java NIO Files.lines() for stream processing + *
  2. Filter out comment lines (starting with '#') + *
  3. Filter out empty lines + *
  4. Collect remaining lines into a list + *
  5. Log the number of targets read + *
+ * + *

File Format Requirements: + * + *

    + *
  • One target per line in hostname[:port] format + *
  • Comment lines start with '#' character + *
  • Empty lines are automatically ignored + *
  • No additional whitespace trimming is performed + *
+ * + * @return a list of target strings read from the file + * @throws RuntimeException if the file cannot be read (file not found, I/O error, etc.) + */ @Override public List getTargetList() { LOGGER.info("Reading hostName list"); diff --git a/src/main/java/de/rub/nds/crawler/targetlist/TrancoEmailListProvider.java b/src/main/java/de/rub/nds/crawler/targetlist/TrancoEmailListProvider.java index 81a03f0..d7a4e52 100644 --- a/src/main/java/de/rub/nds/crawler/targetlist/TrancoEmailListProvider.java +++ b/src/main/java/de/rub/nds/crawler/targetlist/TrancoEmailListProvider.java @@ -19,9 +19,64 @@ import org.apache.logging.log4j.Logger; /** - * Target list provider that downloads the most recent tranco list (https://tranco-list.eu/) and - * extracts the top x hosts from it and then searches for mail servers in the dns mx records of the - * hosts and returns these as targets. + * Email server target list provider that extracts mail servers from popular domain rankings. + * + *

The TrancoEmailListProvider builds upon existing target list providers (typically Tranco + * rankings) to discover and extract mail server hostnames through DNS MX record resolution. This + * enables TLS scanning of email infrastructure associated with popular websites. + * + *

Key capabilities: + * + *

    + *
  • MX Record Resolution - Queries DNS for mail exchange records + *
  • Mail Server Discovery - Identifies email infrastructure for popular + * domains + *
  • Duplicate Removal - Returns unique mail server hostnames only + *
  • Provider Agnostic - Works with any ITargetListProvider implementation + *
+ * + *

Processing Pipeline: + * + *

    + *
  1. Domain Acquisition - Obtain domain list from configured provider + *
  2. Hostname Extraction - Parse domains from provider-specific format + *
  3. MX Query - Perform DNS MX record lookups for each domain + *
  4. Mail Server Extraction - Extract mail server hostnames from MX records + *
  5. Deduplication - Return unique mail server list + *
+ * + *

DNS Resolution: Uses Java's InitialDirContext to perform DNS queries for MX + * records. Failed lookups are logged but don't prevent processing of other domains. + * + *
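A sketch of the JNDI lookup involved (error handling omitted; each MX value has the form + * "preference hostname"): + * + *
{@code
+ * DirContext ctx = new InitialDirContext();
+ * Attribute mx = ctx.getAttributes("dns:/" + hostname, new String[] {"MX"}).get("MX");
+ * String mailServer = mx.get().toString().split(" ")[1]; // e.g. "10 mail.example.com."
+ * }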

Error Handling: + * + *

    + *
  • Missing MX Records - Domains without mail servers are silently skipped + *
  • DNS Failures - Individual lookup failures are logged and ignored + *
  • Malformed Records - Invalid MX records are handled gracefully + *
+ * + *

Use Cases: + * + *

    + *
  • Email Security Studies - TLS adoption in email infrastructure + *
  • Mail Server Surveys - Protocol support across popular email services + *
  • Vulnerability Research - Security assessment of email systems + *
  • Performance Analysis - Email protocol performance evaluation + *
+ * + *

Usage Example: + * + *

{@code
+ * TrancoListProvider domains = new TrancoListProvider(10000);
+ * TrancoEmailListProvider emailProvider = new TrancoEmailListProvider(domains);
+ * List mailServers = emailProvider.getTargetList();
+ * // Returns mail servers for top 10,000 Tranco domains
+ * }
+ * + * @see ITargetListProvider + * @see TrancoListProvider + * @see CruxListProvider */ public class TrancoEmailListProvider implements ITargetListProvider { @@ -29,6 +84,15 @@ public class TrancoEmailListProvider implements ITargetListProvider { private final ITargetListProvider trancoList; + /** + * Creates a new email list provider using the specified domain list provider. + * + *

The constructor configures the provider to use any ITargetListProvider implementation as + * the source for domain names, which will be queried for MX records to discover associated mail + * servers. + * + * @param trancoList the target list provider to obtain domains from for MX record lookup + */ public TrancoEmailListProvider(ITargetListProvider trancoList) { this.trancoList = trancoList; } diff --git a/src/main/java/de/rub/nds/crawler/targetlist/TrancoListProvider.java b/src/main/java/de/rub/nds/crawler/targetlist/TrancoListProvider.java index 47d8784..483e175 100644 --- a/src/main/java/de/rub/nds/crawler/targetlist/TrancoListProvider.java +++ b/src/main/java/de/rub/nds/crawler/targetlist/TrancoListProvider.java @@ -13,8 +13,54 @@ import java.util.stream.Stream; /** - * Target list provider that downloads the most recent tranco list (...) and extracts the top x hosts from it. + * Tranco ranking target list provider for research-grade TLS scanning operations. + * + *

The TrancoListProvider downloads and processes the most recent Tranco ranking data to extract + * popular website targets for TLS security scanning. Tranco provides a research-oriented + * alternative to commercial rankings, designed specifically for security and privacy studies. + * + *

Key advantages: + * + *

    + *
  • Research Focus - Designed for academic and security research + *
  • Stable Rankings - Aggregates multiple sources for stability + *
  • Manipulation Resistant - Protected against gaming and artificial inflation + *
  • Regular Updates - Daily updated rankings reflecting current web usage + *
+ * + *

Data Source: Downloads the top 1 million domain ranking from tranco-list.eu, + * which aggregates data from multiple sources including Alexa, Umbrella, Majestic, and Quantcast to + * provide robust and manipulation-resistant rankings. + * + *

Processing Characteristics: + * + *

    + *
  • Simple Format - CSV format with rank,domain structure + *
  • Direct Extraction - Domains are ready for scanning without preprocessing + *
  • Configurable Limit - Supports any number up to 1 million targets + *
  • Sequential Order - Maintains ranking order for top-N selection + *
+ * + *

Usage Scenarios: + * + *

    + *
  • Academic Research - Security studies requiring stable rankings + *
  • TLS Surveys - Large-scale protocol analysis and evaluation + *
  • Vulnerability Research - Scanning popular sites for security issues + *
  • Performance Studies - Protocol performance across diverse targets + *
+ * + *

Usage Example: + * + *

{@code
+ * TrancoListProvider provider = new TrancoListProvider(10000);
+ * List targets = provider.getTargetList();
+ * // Returns top 10,000 domains from current Tranco ranking
+ * }
+ * + * @see ZipFileProvider + * @see ITargetListProvider + * @see Tranco Ranking Project */ public class TrancoListProvider extends ZipFileProvider { @@ -22,6 +68,14 @@ public class TrancoListProvider extends ZipFileProvider { private static final String ZIP_FILENAME = "tranco-1m.csv.zip"; private static final String FILENAME = "tranco-1m.csv"; + /** + * Creates a new Tranco list provider for the specified number of top-ranked domains. + * + *

The constructor configures the provider to download the current Tranco top 1 million + * ranking and extract the specified number of highest-ranked domains for scanning. + * + * @param number the maximum number of domains to extract from the ranking (1 to 1,000,000) + */ public TrancoListProvider(int number) { super(number, SOURCE, ZIP_FILENAME, FILENAME, "Tranco"); } diff --git a/src/main/java/de/rub/nds/crawler/targetlist/ZipFileProvider.java b/src/main/java/de/rub/nds/crawler/targetlist/ZipFileProvider.java index ee1419d..053df02 100644 --- a/src/main/java/de/rub/nds/crawler/targetlist/ZipFileProvider.java +++ b/src/main/java/de/rub/nds/crawler/targetlist/ZipFileProvider.java @@ -23,15 +23,94 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +/** + * Abstract base class for target list providers that download and extract targets from compressed + * archives. + * + *

The ZipFileProvider provides a foundation for implementing target list providers that obtain + * scan targets from remote compressed files (ZIP, GZIP). It handles the complete workflow of + * downloading, extracting, parsing, and cleaning up temporary files, allowing subclasses to focus + * on the specific target extraction logic. + * + *

Key capabilities: + * + *

    + *
  • Remote Download - Downloads compressed files from HTTP/HTTPS URLs + *
  • Format Support - Handles ZIP and GZIP compressed formats + *
  • Stream Processing - Efficient processing of large target lists + *
  • Automatic Cleanup - Removes temporary files after processing + *
+ * + *

Processing Workflow: + * + *

    + *
  1. Download - Fetch compressed file from remote URL + *
  2. Extract - Decompress file to temporary local storage + *
  3. Parse - Process extracted content via subclass implementation + *
  4. Cleanup - Remove temporary files to free disk space + *
+ * + *

Supported Formats: + * + *

    + *
  • ZIP Files - Standard ZIP compression with single entry support + *
  • GZIP Files - GNU ZIP compression for single file archives + *
  • Format Detection - Automatic format detection based on filename + *
+ * + *

Error Handling: + * + *

    + *
  • Download Failures - Logged but processing continues with cached data + *
  • Extraction Errors - Logged and may cause runtime exceptions + *
  • Cleanup Failures - Logged but don't prevent target list return + *
+ * + *

Performance Considerations: + * + *

    + *
  • Temporary Storage - Requires disk space for compressed and extracted files + *
  • Network I/O - Download time depends on file size and connection speed + *
  • Memory Usage - Uses streaming for processing large target lists + *
+ * + *

Implementation Requirements: Subclasses must implement + * getTargetListFromLines() to define how targets are extracted from the decompressed file content. + * + *

Common Subclasses: + * + *

    + *
  • TrancoListProvider - Processes Tranco web ranking data + *
  • CruxListProvider - Handles Chrome UX Report target lists + *
  • Custom Providers - Domain-specific compressed target sources + *
+ * + * @see ITargetListProvider + * @see TrancoListProvider + * @see CruxListProvider + */ public abstract class ZipFileProvider implements ITargetListProvider { + /** Logger instance for tracking download and extraction operations. */ protected static final Logger LOGGER = LogManager.getLogger(); + + /** Maximum number of targets to extract from the target list. */ protected final int number; + private final String sourceUrl; private final String zipFilename; private final String outputFile; private final String listName; + /** + * Creates a new ZIP file provider with the specified configuration. + * + * @param number the maximum number of targets to extract from the list + * @param sourceUrl the URL to download the compressed file from + * @param zipFilename the local filename for the downloaded compressed file + * @param outputFile the local filename for the extracted content + * @param listName the human-readable name of the list for logging + */ protected ZipFileProvider( int number, String sourceUrl, String zipFilename, String outputFile, String listName) { this.number = number; @@ -41,6 +120,29 @@ protected ZipFileProvider( this.listName = listName; } + /** + * Downloads, extracts, and processes the compressed target list file. + * + *

This method implements the complete workflow for obtaining targets from a remote + * compressed file. It downloads the file, extracts the content, processes it through the + * subclass implementation, and cleans up temporary files. + * + *

Processing Steps: + * + *

    + *
  1. Download compressed file from sourceUrl to zipFilename + *
  2. Extract compressed content to outputFile + *
  3. Process extracted content via getTargetListFromLines() + *
  4. Delete temporary files (compressed and extracted) + *
+ * + *

Error Recovery: Download and extraction errors are logged but don't + * prevent processing from continuing. Cleanup errors are logged but don't affect the returned + * target list. + * + * @return a list of target strings extracted from the compressed file + * @throws RuntimeException if the extracted file cannot be read + */ public List getTargetList() { List targetList; try { @@ -91,6 +193,16 @@ public List getTargetList() { return targetList; } + /** + * Creates an appropriate input stream for the compressed file based on filename. + * + *

This method automatically detects the compression format based on the filename and returns + * the appropriate decompression stream. It supports GZIP and ZIP formats. + * + * @param filename the name of the compressed file to open + * @return an InflaterInputStream for reading decompressed content + * @throws IOException if the file cannot be opened + */ private InflaterInputStream getZipInputStream(String filename) throws IOException { if (filename.contains(".gz")) { return new GZIPInputStream(new FileInputStream(filename)); @@ -99,5 +211,24 @@ private InflaterInputStream getZipInputStream(String filename) throws IOExceptio } } + /** + * Extracts scan targets from the decompressed file content. + * + *

This abstract method must be implemented by subclasses to define how targets are extracted + * from the decompressed file lines. Different target list formats require different parsing + * logic. + * + *
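For example, a "rank,domain" CSV could be handled by a subclass along these lines (a + * sketch): + * + *
{@code
+ * @Override
+ * protected List<String> getTargetListFromLines(Stream<String> lines) {
+ *     return lines.limit(number)                   // top-N entries only
+ *                 .map(line -> line.split(",")[1]) // keep the domain column
+ *                 .collect(Collectors.toList());
+ * }
+ * }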

Implementation Guidelines: + * + *

    + *
  • Process the stream efficiently using stream operations + *
  • Limit results to the configured number of targets + *
  • Filter and format targets appropriately for scanning + *
  • Handle any format-specific parsing requirements + *
+ * + * @param lines a stream of lines from the extracted file + * @return a list of target strings formatted for scanning + */ protected abstract List getTargetListFromLines(Stream lines); } diff --git a/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java b/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java index f4d14fd..5c446d2 100644 --- a/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java +++ b/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java @@ -10,7 +10,50 @@ import java.util.concurrent.*; +/** + * Thread pool executor that creates futures with result preservation after cancellation. + * + *

The CanceallableThreadPoolExecutor extends ThreadPoolExecutor to use CancellableFuture + * instances instead of standard FutureTask objects. This enables tasks to preserve their results + * even after being cancelled, which is valuable for timeout scenarios and graceful degradation in + * distributed scanning operations. + * + *

Key features: + * + *

    + *
  • Result Preservation - Tasks retain results after cancellation + *
  • Standard Interface - Drop-in replacement for ThreadPoolExecutor + *
  • Timeout Handling - Better handling of scan timeouts with partial results + *
  • Resource Management - Improved resource cleanup with preserved data + *
+ * + *

Use Cases: + * + *

    + *
  • TLS Scanning - Preserve partial scan results when connections timeout + *
  • Long-Running Tasks - Cancel tasks while keeping intermediate results + *
  • Resource Constraints - Manage memory/CPU while preserving valuable data + *
  • Progress Tracking - Access results from cancelled operations + *
+ * + *

Behavior: All submitted tasks are wrapped in CancellableFuture instances, + * which provide the enhanced cancellation behavior. The executor maintains standard + * ThreadPoolExecutor semantics for all other operations. + * + * @see CancellableFuture + * @see ThreadPoolExecutor + */ public class CanceallableThreadPoolExecutor extends ThreadPoolExecutor { + /** + * Creates a new cancellable thread pool executor with basic configuration. + * + * @param corePoolSize the number of threads to keep in the pool + * @param maximumPoolSize the maximum number of threads to allow in the pool + * @param keepAliveTime when the number of threads is greater than the core, this is the maximum + * time that excess idle threads will wait for new tasks before terminating + * @param unit the time unit for the keepAliveTime argument + * @param workQueue the queue to use for holding tasks before they are executed + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -20,6 +63,17 @@ public CanceallableThreadPoolExecutor( super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue); } + /** + * Creates a new cancellable thread pool executor with custom thread factory. + * + * @param corePoolSize the number of threads to keep in the pool + * @param maximumPoolSize the maximum number of threads to allow in the pool + * @param keepAliveTime when the number of threads is greater than the core, this is the maximum + * time that excess idle threads will wait for new tasks before terminating + * @param unit the time unit for the keepAliveTime argument + * @param workQueue the queue to use for holding tasks before they are executed + * @param threadFactory the factory to use when the executor creates a new thread + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -30,6 +84,18 @@ public CanceallableThreadPoolExecutor( super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory); } + /** + * Creates a new cancellable thread pool executor with custom rejection handler. + * + * @param corePoolSize the number of threads to keep in the pool + * @param maximumPoolSize the maximum number of threads to allow in the pool + * @param keepAliveTime when the number of threads is greater than the core, this is the maximum + * time that excess idle threads will wait for new tasks before terminating + * @param unit the time unit for the keepAliveTime argument + * @param workQueue the queue to use for holding tasks before they are executed + * @param handler the handler to use when execution is blocked because the thread bounds and + * queue capacities are reached + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -40,6 +106,19 @@ public CanceallableThreadPoolExecutor( super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, handler); } + /** + * Creates a new cancellable thread pool executor with full configuration options. 
+ * + * @param corePoolSize the number of threads to keep in the pool + * @param maximumPoolSize the maximum number of threads to allow in the pool + * @param keepAliveTime when the number of threads is greater than the core, this is the maximum + * time that excess idle threads will wait for new tasks before terminating + * @param unit the time unit for the keepAliveTime argument + * @param workQueue the queue to use for holding tasks before they are executed + * @param threadFactory the factory to use when the executor creates a new thread + * @param handler the handler to use when execution is blocked because the thread bounds and + * queue capacities are reached + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, diff --git a/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java b/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java index d7706b1..25f9317 100644 --- a/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java +++ b/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java @@ -12,12 +12,61 @@ import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicReference; +/** + * Enhanced Future implementation that preserves results even after cancellation. + * + *

The CancellableFuture provides a specialized Future implementation that allows retrieval of + * results even after the future has been cancelled. This is particularly useful in scenarios where + * partial results are valuable and should not be lost due to timeout or cancellation. + * + *

Key features: + * + *

    + *
  • Result Preservation - Results remain accessible after cancellation + *
  • Thread-Safe Access - Uses atomic references and semaphores for + * synchronization + *
  • Timeout Support - Supports both blocking and timed result retrieval + *
  • Standard Interface - Implements RunnableFuture for executor compatibility + *
+ * + *

Cancellation Behavior: Unlike standard FutureTask, this implementation allows + * access to the computed result even after the future is cancelled. The result is captured + * atomically before the cancellation takes effect. + * + *
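A usage sketch, assuming tasks submitted to CanceallableThreadPoolExecutor are wrapped in + * this class and the value captured before cancellation remains readable afterwards: + * + *
{@code
+ * Future<ScanResult> future = executor.submit(scanTask); // CanceallableThreadPoolExecutor
+ * future.cancel(true);                                   // e.g. after a timeout check
+ * ScanResult partial = future.get();                     // may still yield the captured result
+ * }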

Synchronization Mechanism: Uses a Semaphore to coordinate access to results + * after cancellation, ensuring thread-safe retrieval without blocking indefinitely. + * + *

Use Cases: + * + *

    + *
  • Timeout Scenarios - Preserve partial scan results when operations timeout + *
  • Resource Management - Cancel long-running tasks while keeping results + *
  • Progress Tracking - Access intermediate results during cancellation + *
  • Graceful Degradation - Use partial results when full completion fails + *
+ * + *

Thread Safety: All operations are thread-safe through atomic references and + * semaphore synchronization. Multiple threads can safely access the future concurrently. + * + * @param the type of result produced by this future + * @see RunnableFuture + * @see FutureTask + * @see CanceallableThreadPoolExecutor + */ public class CancellableFuture implements RunnableFuture { private final AtomicReference result = new AtomicReference<>(); private final RunnableFuture innerFuture; private final Semaphore resultWritten = new Semaphore(0); + /** + * Creates a new cancellable future for the specified callable task. + * + *

The future wraps the callable in a FutureTask that captures the result atomically and + * signals completion via semaphore release, enabling result access even after cancellation. + * + * @param callable the task to execute that produces a result + */ public CancellableFuture(Callable callable) { innerFuture = new FutureTask<>( @@ -29,6 +78,15 @@ public CancellableFuture(Callable callable) { }); } + /** + * Creates a new cancellable future for the specified runnable task with a fixed result. + * + *

The future wraps the runnable in a FutureTask that executes the task and returns the + * provided result value, with atomic result capture for post-cancellation access. + * + * @param runnable the task to execute + * @param res the result value to return upon successful completion + */ public CancellableFuture(Runnable runnable, V res) { innerFuture = new FutureTask<>( From 9aa9d06dfb4674224ec2ac2609d1676394c8f886 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 21:35:35 +0400 Subject: [PATCH 20/24] Fix IPv6 parsing in ScanTarget - Enhanced target string parsing to properly handle IPv6 addresses with ports - Added support for bracket notation: [2001:db8::1]:8080 - Preserved existing IPv4 and hostname parsing functionality - Improved port validation with proper range checking (1-65535) - Added comprehensive test suite covering IPv6, IPv4, and hostname parsing scenarios - Removed debug System.out.println statement - Updated JavaDoc to reflect full IPv6 support The fix resolves the FIXME comment by implementing RFC-compliant IPv6 address parsing that distinguishes between colons in IPv6 addresses and port separators using bracket notation, while maintaining backward compatibility with existing formats. Closes #10 --- .../de/rub/nds/crawler/data/ScanTarget.java | 48 +++-- .../rub/nds/crawler/data/ScanTargetTest.java | 194 ++++++++++++++++++ 2 files changed, 230 insertions(+), 12 deletions(-) create mode 100644 src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java diff --git a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java index c40f33b..9b96a70 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java @@ -30,15 +30,17 @@ * *

    *
  *   • example.com - hostname only
- *   • 192.168.1.1 - IP address only
+ *   • 192.168.1.1 - IPv4 address only
+ *   • 2001:db8::1 - IPv6 address only
  *   • example.com:8080 - hostname with port
- *   • 192.168.1.1:443 - IP address with port
+ *   • 192.168.1.1:443 - IPv4 address with port
+ *   • [2001:db8::1]:8080 - IPv6 address with port (bracket notation)
  *   • 1,example.com - Tranco rank with hostname
  *   • //example.com - hostname with URL prefix
* *

The class performs hostname resolution and denylist checking during target creation. IPv6 - * addresses are currently not fully supported due to port parsing limitations. + * addresses are fully supported with proper bracket notation for port specification. * * @see JobStatus * @see IDenylistProvider @@ -68,7 +70,6 @@ public class ScanTarget implements Serializable { *
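For illustration, the bracket notation accepted after this change ({@code denylistProvider} + * is assumed to be available): + * + *
{@code
+ * // port 8443 is parsed; brackets are stripped from the address
+ * ScanTarget.fromTargetString("[2001:db8::1]:8443", 443, denylistProvider);
+ * }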

Known limitations: * *

    - *
  • IPv6 addresses with ports are not correctly parsed due to colon conflicts *
  • Only the first resolved IP address is used for multi-homed hosts *
* @@ -102,18 +103,41 @@ public static Pair fromTargetString( } if (targetString.startsWith("\"") && targetString.endsWith("\"")) { targetString = targetString.replace("\"", ""); - System.out.println(targetString); } - // check if targetString contains port (e.g. "www.example.com:8080") - // FIXME I guess this breaks any IPv6 parsing - if (targetString.contains(":")) { - int port = Integer.parseInt(targetString.split(":")[1]); - targetString = targetString.split(":")[0]; - if (port > 1 && port < 65535) { - target.setPort(port); + // Parse port from target string, handling IPv6 addresses properly + if (targetString.startsWith("[") && targetString.contains("]:")) { + // IPv6 address with port: [2001:db8::1]:8080 + int bracketEnd = targetString.indexOf("]:") + 1; + String portPart = targetString.substring(bracketEnd + 1); + targetString = targetString.substring(1, bracketEnd - 1); // Remove brackets + try { + int port = Integer.parseInt(portPart); + if (port > 0 && port <= 65535) { + target.setPort(port); + } else { + target.setPort(defaultPort); + } + } catch (NumberFormatException e) { + target.setPort(defaultPort); + } + } else if (targetString.contains(":") + && !InetAddressValidator.getInstance().isValidInet6Address(targetString)) { + // IPv4 address or hostname with port: www.example.com:8080 or 192.168.1.1:443 + String[] parts = targetString.split(":", 2); + targetString = parts[0]; + try { + int port = Integer.parseInt(parts[1]); + if (port > 0 && port <= 65535) { + target.setPort(port); + } else { + target.setPort(defaultPort); + } + } catch (NumberFormatException e) { + target.setPort(defaultPort); } } else { + // No port specified or IPv6 address without port target.setPort(defaultPort); } diff --git a/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java b/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java new file mode 100644 index 0000000..1807d26 --- /dev/null +++ b/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java @@ -0,0 +1,194 @@ +/* + * TLS-Crawler - A TLS scanning tool to perform large scale scans with the TLS-Scanner + * + * Copyright 2018-2023 Ruhr University Bochum, Paderborn University, and Hackmanit GmbH + * + * Licensed under Apache License, Version 2.0 + * http://www.apache.org/licenses/LICENSE-2.0.txt + */ +package de.rub.nds.crawler.data; + +import static org.junit.jupiter.api.Assertions.*; + +import de.rub.nds.crawler.constant.JobStatus; +import org.apache.commons.lang3.tuple.Pair; +import org.junit.jupiter.api.Test; + +/** Tests for ScanTarget parsing functionality, particularly IPv6 address handling. 
*/ +class ScanTargetTest { + + private static final int DEFAULT_PORT = 443; + + @Test + void testIPv4AddressWithPort() { + Pair result = + ScanTarget.fromTargetString("192.168.1.1:8080", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("192.168.1.1", target.getIp()); + assertNull(target.getHostname()); + assertEquals(8080, target.getPort()); + } + + @Test + void testIPv4AddressWithoutPort() { + Pair result = + ScanTarget.fromTargetString("192.168.1.1", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("192.168.1.1", target.getIp()); + assertNull(target.getHostname()); + assertEquals(DEFAULT_PORT, target.getPort()); + } + + @Test + void testIPv6AddressWithPort() { + Pair result = + ScanTarget.fromTargetString("[2001:db8::1]:8080", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("2001:db8::1", target.getIp()); + assertNull(target.getHostname()); + assertEquals(8080, target.getPort()); + } + + @Test + void testIPv6AddressWithoutPort() { + Pair result = + ScanTarget.fromTargetString("2001:db8::1", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("2001:db8::1", target.getIp()); + assertNull(target.getHostname()); + assertEquals(DEFAULT_PORT, target.getPort()); + } + + @Test + void testIPv6AddressWithPortAndDefaultPort() { + Pair result = + ScanTarget.fromTargetString("[::1]:443", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("::1", target.getIp()); + assertNull(target.getHostname()); + assertEquals(443, target.getPort()); + } + + @Test + void testHostnameWithPort() { + Pair result = + ScanTarget.fromTargetString("example.com:8080", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(8080, target.getPort()); + // IP will be resolved, so we just check it's not null + assertNotNull(target.getIp()); + } + + @Test + void testHostnameWithoutPort() { + Pair result = + ScanTarget.fromTargetString("example.com", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(DEFAULT_PORT, target.getPort()); + // IP will be resolved, so we just check it's not null + assertNotNull(target.getIp()); + } + + @Test + void testTrancoRankWithHostname() { + Pair result = + ScanTarget.fromTargetString("1,example.com", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(1, target.getTrancoRank()); + assertEquals(DEFAULT_PORT, target.getPort()); + assertNotNull(target.getIp()); + } + + @Test + void testUrlPrefixRemoval() { + Pair result = + ScanTarget.fromTargetString("//example.com", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(DEFAULT_PORT, target.getPort()); + assertNotNull(target.getIp()); + } + + @Test + void 
testQuotedHostname() { + Pair result = + ScanTarget.fromTargetString("\"example.com\"", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(DEFAULT_PORT, target.getPort()); + assertNotNull(target.getIp()); + } + + @Test + void testInvalidPortHandling() { + // Port out of range should default to defaultPort + Pair result = + ScanTarget.fromTargetString("[2001:db8::1]:99999", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("2001:db8::1", target.getIp()); + assertEquals(DEFAULT_PORT, target.getPort()); // Should use default port for invalid port + } + + @Test + void testMalformedPortHandling() { + // Non-numeric port should default to defaultPort + Pair result = + ScanTarget.fromTargetString("[2001:db8::1]:abc", DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("2001:db8::1", target.getIp()); + assertEquals(DEFAULT_PORT, target.getPort()); // Should use default port for invalid port + } + + @Test + void testComplexIPv6Addresses() { + // Test various IPv6 address formats + String[] ipv6Addresses = { + "2001:0db8:85a3:0000:0000:8a2e:0370:7334", + "2001:db8:85a3::8a2e:370:7334", + "::1", + "::", + "2001:db8::8a2e:370:7334" + }; + + for (String ipv6 : ipv6Addresses) { + String targetString = "[" + ipv6 + "]:8080"; + Pair result = + ScanTarget.fromTargetString(targetString, DEFAULT_PORT, null); + + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals(ipv6, target.getIp()); + assertEquals(8080, target.getPort()); + } + } + + // Note: Testing unresolvable hostnames is environment-dependent and not reliable + // for CI/CD environments, so we skip this test +} From ac7c438734dd2dca9b567bb3f9316a155c6b4c60 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 21:49:06 +0400 Subject: [PATCH 21/24] Support multiple IPs per hostname in ScanTarget Resolves #11 - Modified ScanTarget.fromTargetString() to return List> - Implemented InetAddress.getAllByName() to resolve all IP addresses for hostnames - Updated JobSubmitter to handle multiple targets per hostname string - Added comprehensive test for multiple IP resolution - Updated ControllerTest to accommodate variable job counts based on DNS resolution - Each resolved IP creates separate ScanTarget for complete multi-homed host coverage - Preserves hostname, port, and Tranco rank across all IP instances - Maintains backward compatibility for direct IP address inputs Features: - Support for IPv4 and IPv6 multi-homed hosts - Individual denylist checking per resolved IP - Debug logging for hostname resolution counts - Efficient parallel processing of resolved targets --- .../crawler/core/jobs/PublishBulkScanJob.java | 58 ++++--- .../de/rub/nds/crawler/data/ScanTarget.java | 70 ++++++--- .../rub/nds/crawler/core/ControllerTest.java | 6 +- .../rub/nds/crawler/data/ScanTargetTest.java | 142 +++++++++++++----- 4 files changed, 192 insertions(+), 84 deletions(-) diff --git a/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java b/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java index e0c44a5..15366b1 100644 --- a/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java +++ 
b/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java @@ -276,11 +276,16 @@ public JobSubmitter( * filtering, and job submission or error persistence. It uses the * ScanTarget.fromTargetString method for DNS resolution and denylist checking. * + *

Multi-target Support: For hostnames that resolve to multiple IP
+     * addresses, multiple ScanJobDescription objects are created and processed. The returned
+     * JobStatus represents the primary outcome, with TO_BE_EXECUTED taking precedence if any
+     * targets were successfully submitted.
+     *
      * Processing Flow:
      *
-     *   1. Parse target string using ScanTarget.fromTargetString
-     *   2. Create ScanJobDescription with parsed target and determined status
+     *   1. Parse target string using ScanTarget.fromTargetString (may return multiple targets)
+     *   2. Create ScanJobDescription for each parsed target with appropriate status
      *   3. For valid targets (TO_BE_EXECUTED): submit to orchestration provider
      *   4. For invalid targets: create and persist ScanResult with error details
      *
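A minimal sketch of the precedence rule described above (illustrative only, not part of the
patch; "targets" stands in for the parsed target list):

    JobStatus primaryStatus = JobStatus.RESOLUTION_ERROR;
    boolean hasSuccessfulSubmission = false;
    for (Pair<ScanTarget, JobStatus> targetInfo : targets) {
        if (targetInfo.getRight() == JobStatus.TO_BE_EXECUTED) {
            hasSuccessfulSubmission = true;
            primaryStatus = JobStatus.TO_BE_EXECUTED; // any successful submission wins
        } else if (!hasSuccessfulSubmission) {
            primaryStatus = targetInfo.getRight(); // otherwise keep the latest error status
        }
    }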
@@ -290,36 +295,49 @@ public JobSubmitter( * debugging purposes. * * @param targetString the target string to process (e.g., "example.com:443") - * @return the JobStatus indicating how the target was processed + * @return the JobStatus indicating how the target was processed (TO_BE_EXECUTED if any + * targets were submitted successfully, otherwise the error status) */ @Override public JobStatus apply(String targetString) { - ScanJobDescription jobDescription; - ScanResult errorResult = null; try { - var targetInfo = + var targetInfoList = ScanTarget.fromTargetString(targetString, defaultPort, denylistProvider); - jobDescription = - new ScanJobDescription( - targetInfo.getLeft(), bulkScan, targetInfo.getRight()); + + boolean hasSuccessfulSubmission = false; + JobStatus primaryStatus = JobStatus.RESOLUTION_ERROR; + + for (var targetInfo : targetInfoList) { + ScanJobDescription jobDescription = + new ScanJobDescription( + targetInfo.getLeft(), bulkScan, targetInfo.getRight()); + + if (jobDescription.getStatus() == JobStatus.TO_BE_EXECUTED) { + orchestrationProvider.submitScanJob(jobDescription); + hasSuccessfulSubmission = true; + primaryStatus = JobStatus.TO_BE_EXECUTED; + } else { + ScanResult errorResult = new ScanResult(jobDescription, null); + persistenceProvider.insertScanResult(errorResult, jobDescription); + + // Update primary status if we haven't had a successful submission + if (!hasSuccessfulSubmission) { + primaryStatus = jobDescription.getStatus(); + } + } + } + + return primaryStatus; } catch (Exception e) { - jobDescription = + ScanJobDescription jobDescription = new ScanJobDescription( new ScanTarget(), bulkScan, JobStatus.RESOLUTION_ERROR); - errorResult = ScanResult.fromException(jobDescription, e); + ScanResult errorResult = ScanResult.fromException(jobDescription, e); LOGGER.error( "Error while creating ScanJobDescription for target '{}'", targetString, e); - } - - if (jobDescription.getStatus() == JobStatus.TO_BE_EXECUTED) { - orchestrationProvider.submitScanJob(jobDescription); - } else { - if (errorResult == null) { - errorResult = new ScanResult(jobDescription, null); - } persistenceProvider.insertScanResult(errorResult, jobDescription); + return JobStatus.RESOLUTION_ERROR; } - return jobDescription.getStatus(); } } } diff --git a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java index 9b96a70..4957f91 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java @@ -13,6 +13,8 @@ import java.io.Serializable; import java.net.InetAddress; import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.List; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.validator.routines.InetAddressValidator; import org.apache.logging.log4j.LogManager; @@ -67,23 +69,24 @@ public class ScanTarget implements Serializable { *
  • Check against denylist if provider is available
     *
-     * Known limitations:
-     *
-     *   • Only the first resolved IP address is used for multi-homed hosts
+     *
    Multi-homed host support: For hostnames that resolve to multiple IP + * addresses, this method will create separate ScanTarget instances for each resolved IP + * address. This enables comprehensive scanning of domains with both IPv4 and IPv6 addresses or + * multiple A/AAAA records. * * @param targetString the string to parse (supports various formats as documented in class * description) * @param defaultPort the port to use when none is specified in the target string * @param denylistProvider optional provider for checking if targets are denylisted (may be * null) - * @return a pair containing the created ScanTarget and its status (TO_BE_EXECUTED, - * UNRESOLVABLE, or DENYLISTED) + * @return a list of pairs, each containing a ScanTarget and its status (TO_BE_EXECUTED, + * UNRESOLVABLE, or DENYLISTED). For hostnames resolving to multiple IPs, multiple pairs are + * returned. For IP addresses or single-resolution hostnames, a single-element list is + * returned. * @throws NumberFormatException if port or rank parsing fails * @see JobStatus */ - public static Pair fromTargetString( + public static List> fromTargetString( String targetString, int defaultPort, IDenylistProvider denylistProvider) { ScanTarget target = new ScanTarget(); @@ -141,30 +144,53 @@ public static Pair fromTargetString( target.setPort(defaultPort); } + List> results = new ArrayList<>(); + if (InetAddressValidator.getInstance().isValid(targetString)) { + // Direct IP address - create single target target.setIp(targetString); + + if (denylistProvider != null && denylistProvider.isDenylisted(target)) { + LOGGER.error("IP {} is denylisted and will not be scanned.", targetString); + results.add(Pair.of(target, JobStatus.DENYLISTED)); + } else { + results.add(Pair.of(target, JobStatus.TO_BE_EXECUTED)); + } } else { + // Hostname - resolve to potentially multiple IPs target.setHostname(targetString); try { - // TODO this only allows one IP per hostname; it may be interesting to scan all IPs - // for a domain, or at least one v4 and one v6 - target.setIp(InetAddress.getByName(targetString).getHostAddress()); + InetAddress[] addresses = InetAddress.getAllByName(targetString); + LOGGER.debug( + "Resolved hostname {} to {} IP address(es)", + targetString, + addresses.length); + + for (InetAddress address : addresses) { + ScanTarget ipTarget = new ScanTarget(); + ipTarget.setHostname(targetString); + ipTarget.setIp(address.getHostAddress()); + ipTarget.setPort(target.getPort()); + ipTarget.setTrancoRank(target.getTrancoRank()); + + if (denylistProvider != null && denylistProvider.isDenylisted(ipTarget)) { + LOGGER.error( + "IP {} for hostname {} is denylisted and will not be scanned.", + address.getHostAddress(), + targetString); + results.add(Pair.of(ipTarget, JobStatus.DENYLISTED)); + } else { + results.add(Pair.of(ipTarget, JobStatus.TO_BE_EXECUTED)); + } + } } catch (UnknownHostException e) { LOGGER.error( "Host {} is unknown or can not be reached with error {}.", targetString, e); - // TODO in the current design we discard the exception info; maybe we want to keep - // this in the future - return Pair.of(target, JobStatus.UNRESOLVABLE); + results.add(Pair.of(target, JobStatus.UNRESOLVABLE)); } } - if (denylistProvider != null && denylistProvider.isDenylisted(target)) { - LOGGER.error("Host {} is denylisted and will not be scanned.", targetString); - // TODO similar to the unknownHostException, we do not keep any information as to why - // the target is blocklisted it may be nice to distinguish cases where the domain is - // 
blocked or where the IP is blocked - return Pair.of(target, JobStatus.DENYLISTED); - } - return Pair.of(target, JobStatus.TO_BE_EXECUTED); + + return results; } /** The resolved IP address of the target host. */ diff --git a/src/test/java/de/rub/nds/crawler/core/ControllerTest.java b/src/test/java/de/rub/nds/crawler/core/ControllerTest.java index afddf0f..6f49eda 100644 --- a/src/test/java/de/rub/nds/crawler/core/ControllerTest.java +++ b/src/test/java/de/rub/nds/crawler/core/ControllerTest.java @@ -40,7 +40,11 @@ void submitting() throws IOException, InterruptedException { Thread.sleep(1000); - Assertions.assertEquals(2, orchestrationProvider.jobQueue.size()); + // With multi-IP hostname support, we expect at least 2 jobs (one per hostname) + // but may get more if hostnames resolve to multiple IPs + Assertions.assertTrue( + orchestrationProvider.jobQueue.size() >= 2, + "Expected at least 2 jobs but got " + orchestrationProvider.jobQueue.size()); Assertions.assertEquals(0, orchestrationProvider.unackedJobs.size()); } } diff --git a/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java b/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java index 1807d26..b31c39c 100644 --- a/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java +++ b/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java @@ -11,6 +11,7 @@ import static org.junit.jupiter.api.Assertions.*; import de.rub.nds.crawler.constant.JobStatus; +import java.util.List; import org.apache.commons.lang3.tuple.Pair; import org.junit.jupiter.api.Test; @@ -21,9 +22,11 @@ class ScanTargetTest { @Test void testIPv4AddressWithPort() { - Pair result = + List> results = ScanTarget.fromTargetString("192.168.1.1:8080", DEFAULT_PORT, null); + assertEquals(1, results.size()); + Pair result = results.get(0); assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); ScanTarget target = result.getLeft(); assertEquals("192.168.1.1", target.getIp()); @@ -33,9 +36,11 @@ void testIPv4AddressWithPort() { @Test void testIPv4AddressWithoutPort() { - Pair result = + List> results = ScanTarget.fromTargetString("192.168.1.1", DEFAULT_PORT, null); + assertEquals(1, results.size()); + Pair result = results.get(0); assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); ScanTarget target = result.getLeft(); assertEquals("192.168.1.1", target.getIp()); @@ -45,9 +50,11 @@ void testIPv4AddressWithoutPort() { @Test void testIPv6AddressWithPort() { - Pair result = + List> results = ScanTarget.fromTargetString("[2001:db8::1]:8080", DEFAULT_PORT, null); + assertEquals(1, results.size()); + Pair result = results.get(0); assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); ScanTarget target = result.getLeft(); assertEquals("2001:db8::1", target.getIp()); @@ -57,9 +64,11 @@ void testIPv6AddressWithPort() { @Test void testIPv6AddressWithoutPort() { - Pair result = + List> results = ScanTarget.fromTargetString("2001:db8::1", DEFAULT_PORT, null); + assertEquals(1, results.size()); + Pair result = results.get(0); assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); ScanTarget target = result.getLeft(); assertEquals("2001:db8::1", target.getIp()); @@ -69,9 +78,11 @@ void testIPv6AddressWithoutPort() { @Test void testIPv6AddressWithPortAndDefaultPort() { - Pair result = + List> results = ScanTarget.fromTargetString("[::1]:443", DEFAULT_PORT, null); + assertEquals(1, results.size()); + Pair result = results.get(0); assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); ScanTarget target = result.getLeft(); assertEquals("::1", target.getIp()); @@ 
-81,73 +92,93 @@ void testIPv6AddressWithPortAndDefaultPort() { @Test void testHostnameWithPort() { - Pair result = + List> results = ScanTarget.fromTargetString("example.com:8080", DEFAULT_PORT, null); - assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); - ScanTarget target = result.getLeft(); - assertEquals("example.com", target.getHostname()); - assertEquals(8080, target.getPort()); - // IP will be resolved, so we just check it's not null - assertNotNull(target.getIp()); + assertFalse(results.isEmpty()); + // Should have at least one result for example.com + for (Pair result : results) { + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(8080, target.getPort()); + assertNotNull(target.getIp()); + } } @Test void testHostnameWithoutPort() { - Pair result = + List> results = ScanTarget.fromTargetString("example.com", DEFAULT_PORT, null); - assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); - ScanTarget target = result.getLeft(); - assertEquals("example.com", target.getHostname()); - assertEquals(DEFAULT_PORT, target.getPort()); - // IP will be resolved, so we just check it's not null - assertNotNull(target.getIp()); + assertFalse(results.isEmpty()); + // Should have at least one result for example.com + for (Pair result : results) { + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(DEFAULT_PORT, target.getPort()); + assertNotNull(target.getIp()); + } } @Test void testTrancoRankWithHostname() { - Pair result = + List> results = ScanTarget.fromTargetString("1,example.com", DEFAULT_PORT, null); - assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); - ScanTarget target = result.getLeft(); - assertEquals("example.com", target.getHostname()); - assertEquals(1, target.getTrancoRank()); - assertEquals(DEFAULT_PORT, target.getPort()); - assertNotNull(target.getIp()); + assertFalse(results.isEmpty()); + // Should have at least one result for example.com + for (Pair result : results) { + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(1, target.getTrancoRank()); + assertEquals(DEFAULT_PORT, target.getPort()); + assertNotNull(target.getIp()); + } } @Test void testUrlPrefixRemoval() { - Pair result = + List> results = ScanTarget.fromTargetString("//example.com", DEFAULT_PORT, null); - assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); - ScanTarget target = result.getLeft(); - assertEquals("example.com", target.getHostname()); - assertEquals(DEFAULT_PORT, target.getPort()); - assertNotNull(target.getIp()); + assertFalse(results.isEmpty()); + // Should have at least one result for example.com + for (Pair result : results) { + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(DEFAULT_PORT, target.getPort()); + assertNotNull(target.getIp()); + } } @Test void testQuotedHostname() { - Pair result = + List> results = ScanTarget.fromTargetString("\"example.com\"", DEFAULT_PORT, null); - assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); - ScanTarget target = result.getLeft(); - assertEquals("example.com", target.getHostname()); - assertEquals(DEFAULT_PORT, target.getPort()); - assertNotNull(target.getIp()); + 
assertFalse(results.isEmpty()); + // Should have at least one result for example.com + for (Pair result : results) { + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("example.com", target.getHostname()); + assertEquals(DEFAULT_PORT, target.getPort()); + assertNotNull(target.getIp()); + } } @Test void testInvalidPortHandling() { // Port out of range should default to defaultPort - Pair result = + List> results = ScanTarget.fromTargetString("[2001:db8::1]:99999", DEFAULT_PORT, null); + assertEquals(1, results.size()); + Pair result = results.get(0); assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); ScanTarget target = result.getLeft(); assertEquals("2001:db8::1", target.getIp()); @@ -157,9 +188,11 @@ void testInvalidPortHandling() { @Test void testMalformedPortHandling() { // Non-numeric port should default to defaultPort - Pair result = + List> results = ScanTarget.fromTargetString("[2001:db8::1]:abc", DEFAULT_PORT, null); + assertEquals(1, results.size()); + Pair result = results.get(0); assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); ScanTarget target = result.getLeft(); assertEquals("2001:db8::1", target.getIp()); @@ -179,9 +212,11 @@ void testComplexIPv6Addresses() { for (String ipv6 : ipv6Addresses) { String targetString = "[" + ipv6 + "]:8080"; - Pair result = + List> results = ScanTarget.fromTargetString(targetString, DEFAULT_PORT, null); + assertEquals(1, results.size()); + Pair result = results.get(0); assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); ScanTarget target = result.getLeft(); assertEquals(ipv6, target.getIp()); @@ -189,6 +224,31 @@ void testComplexIPv6Addresses() { } } + @Test + void testMultipleIPResolution() { + // Test with google.com which typically has multiple A records + List> results = + ScanTarget.fromTargetString("google.com", DEFAULT_PORT, null); + + assertFalse(results.isEmpty()); + // All results should be successful + for (Pair result : results) { + assertEquals(JobStatus.TO_BE_EXECUTED, result.getRight()); + ScanTarget target = result.getLeft(); + assertEquals("google.com", target.getHostname()); + assertEquals(DEFAULT_PORT, target.getPort()); + assertNotNull(target.getIp()); + // Verify it's a valid IP address format + assertTrue( + target.getIp() + .matches( + "^([0-9]{1,3}\\.){3}[0-9]{1,3}$|^([0-9a-fA-F]*:)+[0-9a-fA-F]*$")); + } + + // Log the number of IPs found for debugging + System.out.println("google.com resolved to " + results.size() + " IP address(es)"); + } + // Note: Testing unresolvable hostnames is environment-dependent and not reliable // for CI/CD environments, so we skip this test } From ecf6b57375cff13fabe24f0fab0298a27a03f454 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Wed, 11 Jun 2025 22:02:37 +0400 Subject: [PATCH 22/24] Improve exception handling information in ScanTarget Resolves #12 Enhanced ScanResult.fromException to store structured exception data: - Exception type, message, cause, and timestamp - Target context (hostname, IP, port) for debugging - Added overloaded method with additional error context parameter - Eliminated serialization issues with raw exception objects Added error tracking to ScanTarget: - New errorMessage and errorType fields with getters/setters - Detailed error information for DNS resolution failures - Specific denylist rejection context with IP and hostname details - Enhanced error propagation from target parsing to scan results Created ErrorContext utility class: - Standardized error context formatting 
- Support for DNS, denylist, parsing, and processing failures - Consistent error message patterns for analysis Improved JobSubmitter exception handling: - Enhanced error context for target parsing failures - Better integration with structured error reporting Features: - Structured error documents with separable fields - Comprehensive debugging information preservation - Backward compatible with existing error handling - Detailed failure analysis for large-scale scanning operations --- .../crawler/core/jobs/PublishBulkScanJob.java | 3 +- .../de/rub/nds/crawler/data/ErrorContext.java | 108 ++++++++++++++++++ .../de/rub/nds/crawler/data/ScanResult.java | 76 +++++++++++- .../de/rub/nds/crawler/data/ScanTarget.java | 67 +++++++++++ .../rub/nds/crawler/data/ScanTargetTest.java | 21 ++++ 5 files changed, 271 insertions(+), 4 deletions(-) create mode 100644 src/main/java/de/rub/nds/crawler/data/ErrorContext.java diff --git a/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java b/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java index 15366b1..ebc0b7e 100644 --- a/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java +++ b/src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java @@ -332,7 +332,8 @@ public JobStatus apply(String targetString) { ScanJobDescription jobDescription = new ScanJobDescription( new ScanTarget(), bulkScan, JobStatus.RESOLUTION_ERROR); - ScanResult errorResult = ScanResult.fromException(jobDescription, e); + String errorContext = "Failed to parse target string: '" + targetString + "'"; + ScanResult errorResult = ScanResult.fromException(jobDescription, e, errorContext); LOGGER.error( "Error while creating ScanJobDescription for target '{}'", targetString, e); persistenceProvider.insertScanResult(errorResult, jobDescription); diff --git a/src/main/java/de/rub/nds/crawler/data/ErrorContext.java b/src/main/java/de/rub/nds/crawler/data/ErrorContext.java new file mode 100644 index 0000000..28f64a0 --- /dev/null +++ b/src/main/java/de/rub/nds/crawler/data/ErrorContext.java @@ -0,0 +1,108 @@ +/* + * TLS-Crawler - A TLS scanning tool to perform large scale scans with the TLS-Scanner + * + * Copyright 2018-2023 Ruhr University Bochum, Paderborn University, and Hackmanit GmbH + * + * Licensed under Apache License, Version 2.0 + * http://www.apache.org/licenses/LICENSE-2.0.txt + */ +package de.rub.nds.crawler.data; + +/** + * Utility class for creating structured error context information in scan results. + * + *

This class provides static methods to generate standardized error context strings that can be
+ * used with {@link ScanResult#fromException(ScanJobDescription, Exception, String)} to provide
+ * detailed debugging information for scan failures.
+ *
+ * The error context strings follow a consistent format to facilitate parsing and analysis of
+ * error patterns across large-scale scan operations. Each context type includes relevant
+ * operational details and failure specifics.
+ *
+ * Context Categories:
+ *
+ *   • DNS Resolution Failures - Hostname resolution errors with target details
+ *   • Denylist Rejections - Blocking reasons with target and rule information
+ *   • Target Parsing Failures - Input format issues with problematic strings
+ *   • Network Connectivity - Connection and timeout failures
+ *
+ * Usage Example:
+ *
+ * {@code
+ * try {
+ *     // Perform hostname resolution
+ *     InetAddress.getAllByName(hostname);
+ * } catch (UnknownHostException e) {
+ *     String context = ErrorContext.dnsResolutionFailure(hostname, "A record lookup failed");
+ *     ScanResult errorResult = ScanResult.fromException(jobDescription, e, context);
+ * }
+ * }
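For reference, the factory methods in this class produce strings like the following
(format strings as defined below; values invented for illustration):

    ErrorContext.dnsResolutionFailure("example.com", "A record lookup failed")
        // -> "DNS resolution failed for hostname 'example.com': A record lookup failed"
    ErrorContext.denylistRejection("192.0.2.1", "IP")
        // -> "Target '192.0.2.1' rejected by IP denylist rule"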
    + * + * @see ScanResult#fromException(ScanJobDescription, Exception, String) + * @see de.rub.nds.crawler.constant.JobStatus + */ +public final class ErrorContext { + + private ErrorContext() { + // Utility class - prevent instantiation + } + + /** + * Creates error context for DNS resolution failures. + * + * @param hostname the hostname that failed to resolve + * @param reason the specific DNS failure reason + * @return formatted error context string + */ + public static String dnsResolutionFailure(String hostname, String reason) { + return String.format("DNS resolution failed for hostname '%s': %s", hostname, reason); + } + + /** + * Creates error context for denylist rejections. + * + * @param target the target that was rejected + * @param ruleType the type of denylist rule that triggered (IP, domain, etc.) + * @return formatted error context string + */ + public static String denylistRejection(String target, String ruleType) { + return String.format("Target '%s' rejected by %s denylist rule", target, ruleType); + } + + /** + * Creates error context for target string parsing failures. + * + * @param targetString the unparseable target string + * @param parseStage the parsing stage where failure occurred + * @return formatted error context string + */ + public static String targetParsingFailure(String targetString, String parseStage) { + return String.format( + "Failed to parse target string '%s' during %s", targetString, parseStage); + } + + /** + * Creates error context for port parsing failures. + * + * @param portString the invalid port string + * @param targetString the full target string for context + * @return formatted error context string + */ + public static String portParsingFailure(String portString, String targetString) { + return String.format("Invalid port '%s' in target string '%s'", portString, targetString); + } + + /** + * Creates error context for general target processing failures. + * + * @param targetString the target string being processed + * @param operation the operation that failed + * @return formatted error context string + */ + public static String targetProcessingFailure(String targetString, String operation) { + return String.format( + "Target processing failed for '%s' during %s", targetString, operation); + } +} diff --git a/src/main/java/de/rub/nds/crawler/data/ScanResult.java b/src/main/java/de/rub/nds/crawler/data/ScanResult.java index 70af899..6901ce2 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanResult.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanResult.java @@ -130,8 +130,10 @@ public ScanResult(ScanJobDescription scanJobDescription, Document result) { * description has an error status (ERROR, CANCELLED, INTERNAL_ERROR, etc.) before creating the * error result, ensuring consistency between status and result content. * - *

Exception Handling: The exception is embedded in a BSON document under
- * the "exception" key, allowing for structured storage and later analysis of scan failures.
+ *
    Exception Handling: The exception information is stored in a structured + * format with separate fields for type, message, cause, and timestamp, enabling detailed + * analysis and debugging of scan failures while avoiding serialization issues with raw + * exception objects. * * @param scanJobDescription the scan job in an error state * @param e the exception that caused the scan to fail @@ -143,7 +145,75 @@ public static ScanResult fromException(ScanJobDescription scanJobDescription, Ex throw new IllegalArgumentException("ScanJobDescription must be in an error state"); } Document errorDocument = new Document(); - errorDocument.put("exception", e); + + // Store structured exception information for better analysis and debugging + errorDocument.put("exceptionType", e.getClass().getSimpleName()); + errorDocument.put("exceptionMessage", e.getMessage()); + errorDocument.put("exceptionCause", e.getCause() != null ? e.getCause().toString() : null); + errorDocument.put("timestamp", System.currentTimeMillis()); + + // Include target information if available for context + ScanTarget target = scanJobDescription.getScanTarget(); + if (target != null) { + errorDocument.put("targetHostname", target.getHostname()); + errorDocument.put("targetIp", target.getIp()); + errorDocument.put("targetPort", target.getPort()); + + // Include additional error context from the target if available + if (target.getErrorMessage() != null) { + errorDocument.put("targetErrorMessage", target.getErrorMessage()); + } + if (target.getErrorType() != null) { + errorDocument.put("targetErrorType", target.getErrorType()); + } + } + + return new ScanResult(scanJobDescription, errorDocument); + } + + /** + * Creates a scan result from an exception with additional error context. + * + *

    This overloaded method extends the basic exception handling by allowing additional + * contextual information to be included in the error document. This is particularly useful for + * providing specific failure reasons, debugging hints, or operational details. + * + * @param scanJobDescription the scan job in an error state + * @param e the exception that caused the scan to fail + * @param errorContext additional error context as key-value pairs + * @return a new ScanResult containing the exception details and additional context + * @throws IllegalArgumentException if the scan job is not in an error state + */ + public static ScanResult fromException( + ScanJobDescription scanJobDescription, Exception e, String errorContext) { + if (!scanJobDescription.getStatus().isError()) { + throw new IllegalArgumentException("ScanJobDescription must be in an error state"); + } + Document errorDocument = new Document(); + + // Store structured exception information + errorDocument.put("exceptionType", e.getClass().getSimpleName()); + errorDocument.put("exceptionMessage", e.getMessage()); + errorDocument.put("exceptionCause", e.getCause() != null ? e.getCause().toString() : null); + errorDocument.put("timestamp", System.currentTimeMillis()); + errorDocument.put("errorContext", errorContext); + + // Include target information if available for context + ScanTarget target = scanJobDescription.getScanTarget(); + if (target != null) { + errorDocument.put("targetHostname", target.getHostname()); + errorDocument.put("targetIp", target.getIp()); + errorDocument.put("targetPort", target.getPort()); + + // Include additional error context from the target if available + if (target.getErrorMessage() != null) { + errorDocument.put("targetErrorMessage", target.getErrorMessage()); + } + if (target.getErrorType() != null) { + errorDocument.put("targetErrorType", target.getErrorType()); + } + } + return new ScanResult(scanJobDescription, errorDocument); } diff --git a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java index 4957f91..f0131fc 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java @@ -152,6 +152,11 @@ public static List> fromTargetString( if (denylistProvider != null && denylistProvider.isDenylisted(target)) { LOGGER.error("IP {} is denylisted and will not be scanned.", targetString); + + // Store denylist rejection information + target.setErrorMessage("Target blocked by denylist: IP address " + targetString); + target.setErrorType("DenylistRejection"); + results.add(Pair.of(target, JobStatus.DENYLISTED)); } else { results.add(Pair.of(target, JobStatus.TO_BE_EXECUTED)); @@ -178,6 +183,15 @@ public static List> fromTargetString( "IP {} for hostname {} is denylisted and will not be scanned.", address.getHostAddress(), targetString); + + // Store detailed denylist rejection information + ipTarget.setErrorMessage( + "Target blocked by denylist: IP " + + address.getHostAddress() + + " for hostname " + + targetString); + ipTarget.setErrorType("DenylistRejection"); + results.add(Pair.of(ipTarget, JobStatus.DENYLISTED)); } else { results.add(Pair.of(ipTarget, JobStatus.TO_BE_EXECUTED)); @@ -186,6 +200,11 @@ public static List> fromTargetString( } catch (UnknownHostException e) { LOGGER.error( "Host {} is unknown or can not be reached with error {}.", targetString, e); + + // Store detailed error information for debugging and analysis + target.setErrorMessage("DNS resolution failed: 
" + e.getMessage()); + target.setErrorType("UnknownHostException"); + results.add(Pair.of(target, JobStatus.UNRESOLVABLE)); } } @@ -205,6 +224,12 @@ public static List> fromTargetString( /** The Tranco ranking of the target (0 if not available or not specified). */ private int trancoRank; + /** Error message for debugging when target processing fails (may be null). */ + private String errorMessage; + + /** Error type classification for debugging (may be null). */ + private String errorType; + /** * Creates an empty ScanTarget. * @@ -298,4 +323,46 @@ public void setPort(int port) { public void setTrancoRank(int trancoRank) { this.trancoRank = trancoRank; } + + /** + * Gets the error message associated with this target. + * + *

    The error message provides detailed information about why target processing failed, + * including specific exception messages, DNS resolution failures, or parsing errors. + * + * @return the error message, or null if no error occurred + */ + public String getErrorMessage() { + return this.errorMessage; + } + + /** + * Sets the error message for this target. + * + * @param errorMessage the error message describing the failure + */ + public void setErrorMessage(String errorMessage) { + this.errorMessage = errorMessage; + } + + /** + * Gets the error type classification for this target. + * + *

    The error type provides a high-level classification of the failure type, such as + * "UnknownHostException", "NumberFormatException", or "DenylistRejection". + * + * @return the error type, or null if no error occurred + */ + public String getErrorType() { + return this.errorType; + } + + /** + * Sets the error type classification for this target. + * + * @param errorType the error type classification + */ + public void setErrorType(String errorType) { + this.errorType = errorType; + } } diff --git a/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java b/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java index b31c39c..3c8d3f2 100644 --- a/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java +++ b/src/test/java/de/rub/nds/crawler/data/ScanTargetTest.java @@ -243,12 +243,33 @@ void testMultipleIPResolution() { target.getIp() .matches( "^([0-9]{1,3}\\.){3}[0-9]{1,3}$|^([0-9a-fA-F]*:)+[0-9a-fA-F]*$")); + // Error fields should be null for successful resolution + assertNull(target.getErrorMessage()); + assertNull(target.getErrorType()); } // Log the number of IPs found for debugging System.out.println("google.com resolved to " + results.size() + " IP address(es)"); } + @Test + void testErrorInformationPreservation() { + // Test that error information fields are properly initialized and preserved + ScanTarget target = new ScanTarget(); + + // Initially error fields should be null + assertNull(target.getErrorMessage()); + assertNull(target.getErrorType()); + + // Set error information + target.setErrorMessage("Test error message"); + target.setErrorType("TestException"); + + // Verify error information is preserved + assertEquals("Test error message", target.getErrorMessage()); + assertEquals("TestException", target.getErrorType()); + } + // Note: Testing unresolvable hostnames is environment-dependent and not reliable // for CI/CD environments, so we skip this test } From 426dd699f5cbca4d7fa17bd0f472d778e276d72b Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Tue, 17 Jun 2025 09:11:21 +0400 Subject: [PATCH 23/24] fixed versions in pom --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index f08ddf1..dfb71bd 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ crawler-core - 1.2.1-json + 1.2.1-SNAPSHOT Crawler-Core https://github.com/tls-attacker/TLS-Crawler @@ -125,7 +125,7 @@ de.rub.nds scanner-core - 6.1.3-json + 6.2.0 org.apache.commons From daf27f7a3284007d7e2b32714e04c71bb4abaee8 Mon Sep 17 00:00:00 2001 From: Robert Merget Date: Tue, 17 Jun 2025 09:25:55 +0400 Subject: [PATCH 24/24] compressed javadoc --- .../de/rub/nds/crawler/core/Controller.java | 45 +---------- .../de/rub/nds/crawler/data/BulkScan.java | 32 +------- .../de/rub/nds/crawler/data/ScanResult.java | 75 +++---------------- .../de/rub/nds/crawler/data/ScanTarget.java | 61 +++------------ .../RabbitMqOrchestrationProvider.java | 43 +---------- .../persistence/MongoPersistenceProvider.java | 59 ++------------- 6 files changed, 36 insertions(+), 279 deletions(-) diff --git a/src/main/java/de/rub/nds/crawler/core/Controller.java b/src/main/java/de/rub/nds/crawler/core/Controller.java index 7bbf3e3..42384e1 100644 --- a/src/main/java/de/rub/nds/crawler/core/Controller.java +++ b/src/main/java/de/rub/nds/crawler/core/Controller.java @@ -26,48 +26,9 @@ /** * Controller that orchestrates and schedules bulk scanning operations. * - *

The Controller is the central coordination component of the TLS-Crawler system, responsible
- * for managing the lifecycle of large-scale TLS scanning campaigns. It integrates with multiple
- * subsystems to provide comprehensive scan orchestration.
- *
- * Core responsibilities:
- *
- *   • Schedule Management - Uses Quartz scheduler for flexible scan timing
- *   • Job Publishing - Coordinates with orchestration providers to distribute scan jobs
- *   • Progress Monitoring - Optional real-time monitoring and notification system
- *   • Resource Integration - Manages target lists, denylists, and persistence layers
- *
- * Architecture Integration:
- *
- *   • {@link IOrchestrationProvider} - Distributes scan jobs to worker instances
- *   • {@link IPersistenceProvider} - Handles scan result storage and retrieval
- *   • {@link ITargetListProvider} - Sources scan targets from various providers
- *   • {@link IDenylistProvider} - Filters prohibited targets
- *   • {@link ProgressMonitor} - Tracks scan progress and sends notifications
- *
- * Scheduling Options:
- *
- *   • One-time execution - Immediate scan job publishing
- *   • Cron-based scheduling - Recurring scans with flexible timing
- *   • Simple scheduling - Basic interval-based execution
- *
- * Lifecycle:
- *
- *   1. Controller initialization with configuration and providers
- *   2. Optional denylist and progress monitoring setup
- *   3. Quartz scheduler configuration and job registration
- *   4. Automatic shutdown when all scheduled jobs complete
+ *
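To illustrate the scheduling options listed above, a minimal Quartz sketch (assuming
PublishBulkScanJob is the registered org.quartz.Job; the cron expression is an example):

    JobDetail job = JobBuilder.newJob(PublishBulkScanJob.class).build();
    Trigger trigger = TriggerBuilder.newTrigger()
            .withSchedule(CronScheduleBuilder.cronSchedule("0 0 3 * * ?")) // daily at 03:00
            .build();
    scheduler.scheduleJob(job, trigger);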

    Central coordination component managing TLS scanning campaigns. Uses Quartz scheduler for + * timing, integrates with orchestration providers for job distribution, and supports progress + * monitoring. * * @see ControllerCommandConfig * @see PublishBulkScanJob diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScan.java b/src/main/java/de/rub/nds/crawler/data/BulkScan.java index bce0245..8b80366 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScan.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScan.java @@ -18,36 +18,10 @@ import javax.persistence.Id; /** - * Represents a bulk scanning operation with its configuration, progress tracking, and metadata. + * Represents a bulk scanning operation with configuration, progress tracking, and metadata. * - *

A BulkScan encapsulates all information about a large-scale TLS scanning operation, including
- * the scan configuration, target statistics, job status tracking, and version information. This
- * class serves as the primary coordination entity for distributed scanning operations.
- *
- * The bulk scan lifecycle typically follows this pattern:
- *
- *   1. Creation with scan configuration and target list
- *   2. Target processing and job publishing to worker queues
- *   3. Progress monitoring through job status counters
- *   4. Completion marking and result aggregation
- *
- * Key features:
- *
- *   • Distributed coordination - Tracks jobs across multiple worker instances
- *   • Progress monitoring - Real-time status counters for different job states
- *   • Version tracking - Records scanner and crawler versions for reproducibility
- *   • Time tracking - Start and end time recording for performance analysis
- *   • Collection management - Automatic database collection naming with timestamps
- *
- * Persistence: This class is designed for MongoDB persistence with JPA
- * annotations. Method naming follows serialization conventions and should not be changed without
- * considering backward compatibility.
+ *

    Encapsulates large-scale TLS scanning operations with distributed coordination, progress + * monitoring, version tracking, and time recording. Designed for MongoDB persistence. * * @see ScanConfig * @see JobStatus diff --git a/src/main/java/de/rub/nds/crawler/data/ScanResult.java b/src/main/java/de/rub/nds/crawler/data/ScanResult.java index 6901ce2..d3a797d 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanResult.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanResult.java @@ -15,47 +15,10 @@ import org.bson.Document; /** - * Immutable container for TLS scan results and associated metadata. + * Immutable container for TLS scan results and metadata. * - *

The ScanResult class encapsulates the complete outcome of a TLS scan operation, including the
- * scan target, execution status, result data, and traceability information. It serves as the
- * primary data transfer object between the scanning engine, persistence layer, and monitoring
- * systems in the distributed TLS-Crawler architecture.
- *
- * Key characteristics:
- *
- *   • Immutability - All fields are final except the database-managed ID
- *   • Traceability - Links results back to their originating bulk scan
- *   • Status Tracking - Maintains job execution status for monitoring
- *   • Error Handling - Supports both successful results and exception storage
- *   • Serialization - Compatible with JSON/BSON for database persistence
- *
- * Construction Patterns:
- *
- *   • Normal Constructor - Creates result from completed ScanJobDescription
- *   • Exception Factory - Creates error result via fromException() method
- *   • Validation - Enforces valid status transitions and error states
- *
- * Data Components:
- *
- *   • Unique ID - UUID for database primary key and result identification
- *   • Bulk Scan ID - Reference to the parent bulk scanning campaign
- *   • Scan Target - The host/port combination that was scanned
- *   • Job Status - Final execution status (SUCCESS, ERROR, TIMEOUT, etc.)
- *   • Result Document - BSON document containing scan findings or error details
- *
- * Status Validation: The class enforces that results are only created from scan
- * jobs that have completed execution (not in TO_BE_EXECUTED state) and that error results have
- * appropriate error status codes.
- *
- * Database Integration: Uses Jackson annotations for JSON serialization and
- * MongoDB integration, with the _id field mapping to the database primary key.
+ *
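For illustration, an error-result document assembled by fromException might be stored as
follows (field names as written by patch 22 in this series; values invented):

    {
      "exceptionType": "UnknownHostException",
      "exceptionMessage": "no-such-host.example",
      "exceptionCause": null,
      "timestamp": 1718600000000,
      "targetHostname": "no-such-host.example",
      "targetIp": null,
      "targetPort": 443
    }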

    Encapsulates scan outcome including target, status, result data, and traceability. Supports + * both successful results and error conditions. Uses Jackson/BSON for persistence. * * @see ScanJobDescription * @see ScanTarget @@ -120,25 +83,15 @@ public ScanResult(ScanJobDescription scanJobDescription, Document result) { } /** - * Factory method for creating scan results from exceptions during scan execution. - * - *

This method provides a standardized way to create scan results when scan operations fail
- * with exceptions. It creates a result document containing the exception details and ensures
- * the scan job description is in an appropriate error state.
- *
- * Error State Validation: The method validates that the scan job
- * description has an error status (ERROR, CANCELLED, INTERNAL_ERROR, etc.) before creating the
- * error result, ensuring consistency between status and result content.
- *
- * Exception Handling: The exception information is stored in a structured
- * format with separate fields for type, message, cause, and timestamp, enabling detailed
- * analysis and debugging of scan failures while avoiding serialization issues with raw
- * exception objects.
+ *
    Creates structured error document with exception details. Validates scan job is in error + * state. * - * @param scanJobDescription the scan job in an error state - * @param e the exception that caused the scan to fail - * @return a new ScanResult containing the exception details - * @throws IllegalArgumentException if the scan job is not in an error state + * @param scanJobDescription scan job in error state + * @param e exception that caused scan failure + * @return ScanResult containing exception details + * @throws IllegalArgumentException if scan job not in error state */ public static ScanResult fromException(ScanJobDescription scanJobDescription, Exception e) { if (!scanJobDescription.getStatus().isError()) { @@ -172,13 +125,9 @@ public static ScanResult fromException(ScanJobDescription scanJobDescription, Ex } /** - * Creates a scan result from an exception with additional error context. - * - *

    This overloaded method extends the basic exception handling by allowing additional - * contextual information to be included in the error document. This is particularly useful for - * providing specific failure reasons, debugging hints, or operational details. + * Creates scan result from exception with additional error context. * - * @param scanJobDescription the scan job in an error state + * @param scanJobDescription scan job in error state * @param e the exception that caused the scan to fail * @param errorContext additional error context as key-value pairs * @return a new ScanResult containing the exception details and additional context diff --git a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java index f0131fc..5d5e836 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java @@ -23,26 +23,9 @@ /** * Represents a target for TLS scanning operations. * - *

A scan target encapsulates the network location (hostname/IP address and port) and optional
- * metadata (such as Tranco ranking) for a host to be scanned. This class provides parsing
- * functionality to extract target information from various string formats commonly found in target
- * lists and rankings.
- *
- * Supported target string formats:
- *
- *   • example.com - hostname only
- *   • 192.168.1.1 - IPv4 address only
- *   • 2001:db8::1 - IPv6 address only
- *   • example.com:8080 - hostname with port
- *   • 192.168.1.1:443 - IPv4 address with port
- *   • [2001:db8::1]:8080 - IPv6 address with port (bracket notation)
- *   • 1,example.com - Tranco rank with hostname
- *   • //example.com - hostname with URL prefix
- *
- * The class performs hostname resolution and denylist checking during target creation. IPv6
- * addresses are fully supported with proper bracket notation for port specification.
+ *

    Encapsulates network location (hostname/IP and port) and optional metadata (Tranco ranking). + * Supports parsing various string formats: hostnames, IPs (IPv4/IPv6), ports, ranks, and URL + * prefixes. Performs hostname resolution and denylist checking. * * @see JobStatus * @see IDenylistProvider @@ -51,40 +34,16 @@ public class ScanTarget implements Serializable { private static final Logger LOGGER = LogManager.getLogger(); /** - * Creates a ScanTarget from a target string with comprehensive parsing and validation. - * - *

This method parses various target string formats, performs hostname resolution, and checks
- * against denylists. The parsing handles multiple formats including Tranco-ranked entries,
- * URLs, and port specifications.
- *
- * Parsing logic:
- *
- *   1. Extract Tranco rank if present (format: "rank,hostname")
- *   2. Remove URL prefixes ("//hostname")
- *   3. Remove quotes around hostnames
- *   4. Extract port number if specified ("hostname:port")
- *   5. Determine if target is IP address or hostname
- *   6. Resolve hostname to IP address if needed
- *   7. Check against denylist if provider is available
- *
- * Multi-homed host support: For hostnames that resolve to multiple IP
- * addresses, this method will create separate ScanTarget instances for each resolved IP
- * address. This enables comprehensive scanning of domains with both IPv4 and IPv6 addresses or
- * multiple A/AAAA records.
+ *
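A usage sketch for the parsing described above (types as introduced in patch 21 of this
series; the resolved addresses depend on the environment):

    // "1,example.com" carries a Tranco rank; a null provider disables denylist checks.
    List<Pair<ScanTarget, JobStatus>> results =
            ScanTarget.fromTargetString("1,example.com", 443, null);
    // One pair per resolved address, e.g. one A and one AAAA record; each target keeps
    // hostname "example.com", rank 1, and port 443, but carries a distinct IP.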

    Parses various formats (rank,hostname, URLs, ports), performs hostname resolution, and + * checks denylists. Creates separate targets for multi-homed hosts. * - * @param targetString the string to parse (supports various formats as documented in class - * description) - * @param defaultPort the port to use when none is specified in the target string - * @param denylistProvider optional provider for checking if targets are denylisted (may be - * null) - * @return a list of pairs, each containing a ScanTarget and its status (TO_BE_EXECUTED, - * UNRESOLVABLE, or DENYLISTED). For hostnames resolving to multiple IPs, multiple pairs are - * returned. For IP addresses or single-resolution hostnames, a single-element list is - * returned. + * @param targetString string to parse (hostname, IP, with optional rank/port) + * @param defaultPort port to use when none specified + * @param denylistProvider optional denylist checker (may be null) + * @return list of (ScanTarget, JobStatus) pairs - multiple for multi-homed hosts * @throws NumberFormatException if port or rank parsing fails - * @see JobStatus */ public static List> fromTargetString( String targetString, int defaultPort, IDenylistProvider denylistProvider) { diff --git a/src/main/java/de/rub/nds/crawler/orchestration/RabbitMqOrchestrationProvider.java b/src/main/java/de/rub/nds/crawler/orchestration/RabbitMqOrchestrationProvider.java index 64271ec..5dd5fcf 100644 --- a/src/main/java/de/rub/nds/crawler/orchestration/RabbitMqOrchestrationProvider.java +++ b/src/main/java/de/rub/nds/crawler/orchestration/RabbitMqOrchestrationProvider.java @@ -32,47 +32,10 @@ import org.apache.logging.log4j.Logger; /** - * RabbitMQ-based implementation of the orchestration provider for TLS-Crawler. + * RabbitMQ-based orchestration provider for TLS-Crawler. * - *

This class implements a distributed messaging system using RabbitMQ for coordinating
- * large-scale TLS scanning operations between controllers and workers. It handles job distribution,
- * progress monitoring, and completion notifications across multiple worker instances.
- *
- * Key features:
- *
- *   • Job Distribution - Publishes scan jobs to worker instances via queues
- *   • Load Balancing - Uses RabbitMQ's round-robin job distribution
- *   • Progress Monitoring - Optional completion notifications for tracking
- *   • Connection Management - Handles RabbitMQ connections with TLS support
- *   • Error Recovery - Graceful handling of serialization and network errors
- *
- * Queue Architecture:
- *
- *   • scan-job-queue - Main queue for distributing scan jobs to workers
- *   • done-notify-queue_* - Per-scan completion notification queues
- *   • TTL Management - Automatic cleanup of unused notification queues
- *
- * Connection Features:
- *
- *   • TLS/SSL support for secure communication
- *   • Authentication with username/password or password files
- *   • Configurable connection parameters (host, port, credentials)
- *   • Named thread factory for proper thread management
- *
- * Message Handling:
- *
- *   • Java object serialization for scan job descriptions
- *   • Message acknowledgment for reliable delivery
- *   • Prefetch control for optimal worker performance
- *   • Error handling with message rejection for invalid data
+ *
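To illustrate the acknowledgment and prefetch behavior listed above, a minimal consumer
sketch using the standard com.rabbitmq.client API (deserialize/handle are hypothetical
helpers; connection and channel setup omitted):

    channel.basicQos(1); // prefetch control: at most one unacknowledged job per worker
    channel.basicConsume("scan-job-queue", false, (consumerTag, delivery) -> {
        ScanJobDescription job = deserialize(delivery.getBody()); // plain Java serialization
        handle(job);
        channel.basicAck(delivery.getEnvelope().getDeliveryTag(), false);
    }, consumerTag -> {});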

    Implements distributed messaging for scan coordination using RabbitMQ. Handles job + * distribution, load balancing, progress monitoring, and TLS connections. * * @see IOrchestrationProvider * @see RabbitMqDelegate diff --git a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java index d56324c..ee28515 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java @@ -45,55 +45,10 @@ import org.mongojack.JacksonMongoCollection; /** - * MongoDB implementation of the persistence provider for TLS-Crawler scan data. + * MongoDB persistence provider for TLS-Crawler scan data. * - *

This class provides a comprehensive MongoDB-based persistence layer that handles storage and
- * retrieval of bulk scan metadata and individual scan results. It implements sophisticated caching
- * mechanisms and provides flexible JSON serialization support.
- *
- * Key features:
- *
- *   • Dual Storage Model - Separate handling for bulk scan metadata and scan results
- *   • Database per Scan - Each bulk scan uses its own MongoDB database
- *   • Collection Caching - Guava cache for database and collection instances
- *   • Custom Serialization - Extensible Jackson mapper with custom serializers
- *   • Automatic Indexing - Performance-optimized indexes on scan target fields
- *   • Error Recovery - Graceful handling of serialization errors
- *
- * Storage Architecture:
- *
- *   • Bulk Scans - Stored in a dedicated "bulkScans" collection within each scan database
- *   • Scan Results - Stored in dynamically named collections based on scan configuration
- *   • Database Naming - Each bulk scan creates a database named after the scan
- *   • Index Strategy - Automatic indexing on IP, hostname, Tranco rank, and result status
- *
- * Caching Strategy:
- *
- *   • Database connections cached for 10 minutes after last access
- *   • Collection instances cached for 10 minutes after last access
- *   • Automatic cleanup of unused connections to prevent resource leaks
- *
- * Serialization Support:
- *
- *   • Custom JsonSerializer registration for complex types
- *   • Jackson module support for extended functionality
- *   • BigDecimal serialization as strings for precision
- *   • Java Time API support through JavaTimeModule
- *
- * Error Handling: Implements sophisticated error recovery for serialization
- * failures, creating error records instead of losing scan results.
+ *
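A sketch of the 10-minute cache policy described above, using Guava (the loader body is
hypothetical; the real provider caches databases and collections with the same pattern):

    LoadingCache<Pair<String, String>, JacksonMongoCollection<ScanResult>> resultCollectionCache =
            CacheBuilder.newBuilder()
                    .expireAfterAccess(10, TimeUnit.MINUTES)
                    .build(CacheLoader.from(key -> createCollection(key.getLeft(), key.getRight())));

    // Lookup mirrors the usage seen elsewhere in this series:
    resultCollectionCache.getUnchecked(Pair.of(dbName, collectionName));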

    Provides MongoDB-based storage with separate databases per scan, collection caching, custom + * serialization, automatic indexing, and error recovery. * * @see IPersistenceProvider * @see MongoDbDelegate @@ -110,13 +65,9 @@ public class MongoPersistenceProvider implements IPersistenceProvider { private static final Set modules = new HashSet<>(); /** - * Registers a custom JSON serializer for use in MongoDB document serialization. - * - *

This method allows registration of custom Jackson serializers that will be applied during
- * JSON serialization of scan results before storing them in MongoDB. Serializers must be
- * registered before the first MongoPersistenceProvider instance is created.
+ * Registers custom JSON serializer for MongoDB document serialization.
  *
- * Registration Lifecycle:
+ *
+ * Must be registered before first MongoPersistenceProvider instance is created.
  *
  *   • Serializers can only be registered before initialization