Skip to content

Commit 0f590af

Browse files
authored
[Centipede] Perform corpus minimization during a fuzzing session (#4706)
This change calls Centipede's minimize_corpus method on each fuzzing round. This allows us to distill the corpus and add only useful units. In a local run, this reduced the number of units added from 2000+ to 5 with the same coverage. The CL also extracts some common functionality into engine_common.
1 parent 0cd0f6e commit 0f590af

File tree

10 files changed

+227
-147
lines changed

10 files changed

+227
-147
lines changed

src/clusterfuzz/_internal/bot/fuzzers/centipede/constants.py

+4
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@
3939
EXTRA_BINARIES_FLAGNAME = 'extra_binaries'
4040
EXIT_ON_CRASH_FLAGNAME = 'exit_on_crash'
4141

42+
MAX_LEN_FLAGNAME = 'max_len'
43+
NUM_RUNS_FLAGNAME = 'num_runs'
44+
BATCH_SIZE_FLAGNAME = 'batch_size'
45+
4246
NUM_RUNS_PER_MINIMIZATION = 100000
4347

4448

src/clusterfuzz/_internal/bot/fuzzers/centipede/engine.py

+110-26
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from clusterfuzz._internal.metrics import logs
3333
from clusterfuzz._internal.system import environment
3434
from clusterfuzz._internal.system import new_process
35+
from clusterfuzz._internal.system import shell
3536
from clusterfuzz.fuzz import engine
3637
from clusterfuzz.stacktraces import constants as stacktraces_constants
3738

@@ -46,6 +47,17 @@ class CentipedeError(Exception):
4647
"""Base exception class."""
4748

4849

50+
class CentipedeOptions(engine.FuzzOptions):
  """Fuzz options for Centipede that also carry its working directories."""

  def __init__(self, corpus_dir, arguments, strategies, workdir,
               new_corpus_dir):
    """Initializes the base options and records the extra directories.

    Args:
      corpus_dir: Main corpus directory, forwarded to the base class.
      arguments: Engine arguments, forwarded to the base class.
      strategies: Fuzzing strategies, forwarded to the base class.
      workdir: Working directory to remember for this session.
      new_corpus_dir: Directory to add new units.
    """
    super().__init__(corpus_dir, arguments, strategies)
    self.workdir = workdir
    # Directory to add new units
    self.new_corpus_dir = new_corpus_dir
59+
60+
4961
def _get_runner(target_path):
5062
"""Gets the Centipede runner."""
5163
centipede_path = pathlib.Path(target_path).parent / 'centipede'
@@ -198,11 +210,13 @@ def prepare(self, corpus_dir, target_path, build_dir):
198210
# 1. Centipede-readable corpus file;
199211
# 2. Centipede-readable feature file;
200212
# 3. Crash reproducing inputs.
201-
workdir = self._create_temp_dir('workdir')
213+
workdir = engine_common.create_temp_fuzzing_dir('workdir')
202214
arguments[constants.WORKDIR_FLAGNAME] = str(workdir)
203215

204-
# Directory corpus_dir saves the corpus files required by ClusterFuzz.
205-
arguments[constants.CORPUS_DIR_FLAGNAME] = corpus_dir
216+
# Directory to place new units. While fuzzing, the new corpus
217+
# elements are written to the first dir in the list of corpus directories.
218+
new_corpus_dir = engine_common.create_temp_fuzzing_dir('new')
219+
arguments[constants.CORPUS_DIR_FLAGNAME] = f'{new_corpus_dir},{corpus_dir}'
206220

207221
target_binaries = self._get_binary_paths(target_path)
208222
if target_binaries.unsanitized is None:
@@ -214,7 +228,8 @@ def prepare(self, corpus_dir, target_path, build_dir):
214228
arguments[constants.EXTRA_BINARIES_FLAGNAME] = str(
215229
target_binaries.sanitized)
216230

217-
return engine.FuzzOptions(corpus_dir, arguments.list(), {})
231+
return CentipedeOptions(corpus_dir, arguments.list(), {}, workdir,
232+
new_corpus_dir)
218233

219234
def _get_binary_paths(self, target_path):
220235
"""Gets the paths to the main and auxiliary binaries based on |target_path|
@@ -284,11 +299,42 @@ def fuzz(self, target_path, options, reproducers_dir, max_time): # pylint: disa
284299
runner = _get_runner(target_path)
285300
_set_sanitizer_options(target_path)
286301
timeout = max_time + _CLEAN_EXIT_SECS
302+
303+
old_corpus_len = shell.get_directory_file_count(options.corpus_dir)
304+
logs.info(f'Corpus length before fuzzing: {old_corpus_len}')
305+
287306
fuzz_result = runner.run_and_wait(
288307
additional_args=options.arguments, timeout=timeout)
289308
log_lines = fuzz_result.output.splitlines()
290309
fuzz_result.output = Engine.trim_logs(fuzz_result.output)
291310

311+
workdir = options.workdir
312+
313+
try:
314+
time_for_minimize = timeout - fuzz_result.time_executed
315+
316+
self.minimize_corpus(
317+
target_path=target_path,
318+
arguments=[],
319+
# New units, in addition to the main corpus units,
320+
# are placed in new_corpus_dir. Minimize and merge back
321+
# to the main corpus_dir.
322+
input_dirs=[options.new_corpus_dir],
323+
output_dir=options.corpus_dir,
324+
reproducers_dir=reproducers_dir,
325+
max_time=time_for_minimize,
326+
# Use the same workdir that was used for fuzzing.
327+
# This allows us to skip rerunning the fuzzing inputs.
328+
workdir=workdir)
329+
except:
330+
# TODO(alhijazi): Convert to a warning if this becomes a problem
331+
# caused by user code rather than by ClusterFuzz or Centipede.
332+
logs.error('Corpus minimization failed.')
333+
# If we fail to minimize, fall back to moving the new units
334+
# from the new corpus_dir to the main corpus_dir.
335+
engine_common.move_mergeable_units(options.new_corpus_dir,
336+
options.corpus_dir)
337+
292338
reproducer_path = _get_reproducer_path(fuzz_result.output, reproducers_dir)
293339
crashes = []
294340
if reproducer_path:
@@ -298,11 +344,7 @@ def fuzz(self, target_path, options, reproducers_dir, max_time): # pylint: disa
298344
int(fuzz_result.time_executed)))
299345

300346
stats_filename = f'fuzzing-stats-{os.path.basename(target_path)}.000000.csv'
301-
args = fuzzer_options.FuzzerArguments.from_list(options.arguments)
302-
assert args is not None
303-
assert constants.WORKDIR_FLAGNAME in args
304347

305-
workdir = args[constants.WORKDIR_FLAGNAME]
306348
stats_file = os.path.join(workdir, stats_filename)
307349
stats = _parse_centipede_stats(stats_file)
308350
if not stats:
@@ -321,6 +363,11 @@ def fuzz(self, target_path, options, reproducers_dir, max_time): # pylint: disa
321363
num_execs_avg = stats.get('NumExecs_Avg', 0.0)
322364
stats['average_exec_per_sec'] = num_execs_avg / fuzz_time_secs_avg
323365
stats.update(_parse_centipede_logs(log_lines))
366+
367+
new_corpus_len = shell.get_directory_file_count(options.corpus_dir)
368+
logs.info(f'Corpus length after fuzzing: {new_corpus_len}')
369+
new_units_added = new_corpus_len - old_corpus_len
370+
stats['new_units_added'] = new_units_added
324371
return engine.FuzzResult(fuzz_result.output, fuzz_result.command, crashes,
325372
stats, fuzz_result.time_executed)
326373

@@ -379,14 +426,28 @@ def reproduce(self, target_path, input_path, arguments, max_time): # pylint: di
379426
return engine.ReproduceResult(result.command, result.return_code,
380427
result.time_executed, result.output)
381428

382-
def _create_temp_dir(self, name):
383-
"""Creates temporary directory for fuzzing."""
384-
new_directory = pathlib.Path(fuzzer_utils.get_temp_dir(), name)
385-
engine_common.recreate_directory(new_directory)
386-
return new_directory
429+
def _strip_fuzzing_arguments(self, arguments):
  """Drops the flags that are only needed while fuzzing.

  |arguments| is modified in place and is also returned, so callers may use
  either the return value or the object they passed in.
  """
  fuzzing_only_flags = (
      constants.FORK_SERVER_FLAGNAME,
      constants.MAX_LEN_FLAGNAME,
      constants.NUM_RUNS_FLAGNAME,
      constants.EXIT_ON_CRASH_FLAGNAME,
      constants.BATCH_SIZE_FLAGNAME,
  )

  for flag_name in fuzzing_only_flags:
    if flag_name not in arguments:
      continue
    del arguments[flag_name]

  return arguments
387442

388-
def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
389-
reproducers_dir, max_time):
443+
def minimize_corpus(self,
444+
target_path,
445+
arguments,
446+
input_dirs,
447+
output_dir,
448+
reproducers_dir,
449+
max_time,
450+
workdir=None):
390451
"""Runs corpus minimization.
391452
Args:
392453
target_path: Path to the target.
@@ -401,16 +462,29 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
401462
A FuzzResult object.
402463
"""
403464
runner = _get_runner(target_path)
465+
_set_sanitizer_options(target_path)
466+
467+
minimize_arguments = self._get_arguments(target_path)
468+
self._strip_fuzzing_arguments(minimize_arguments)
469+
environment.set_value('ASAN_OPTIONS', 'detect_odr_violation=0')
404470

405471
# Step 1: Generate corpus file for Centipede.
406-
full_corpus_workdir = self._create_temp_dir('full_corpus_workdir')
472+
# When calling this during a fuzzing session, use the existing workdir.
473+
# This avoids us having to re-run inputs and waste time unnecessarily.
474+
# This saves a lot of time when the input corpus contains thousands
475+
# of files.
476+
full_corpus_workdir = workdir
477+
if not full_corpus_workdir:
478+
full_corpus_workdir = engine_common.create_temp_fuzzing_dir(
479+
'full_corpus_workdir')
407480
input_dirs_param = ','.join(str(dir) for dir in input_dirs)
408-
args = [
481+
args = minimize_arguments.list() + [
409482
f'--workdir={full_corpus_workdir}',
410483
f'--binary={target_path}',
411484
f'--corpus_dir={input_dirs_param}',
412485
'--num_runs=0',
413486
]
487+
logs.info(f'Running Generate Corpus file for Centipede with args: {args}')
414488
result = runner.run_and_wait(additional_args=args, timeout=max_time)
415489
max_time -= result.time_executed
416490

@@ -422,11 +496,12 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
422496
raise TimeoutError('Minimization timed out.')
423497

424498
# Step 2: Distill.
425-
args = [
499+
args = minimize_arguments.list() + [
426500
f'--workdir={full_corpus_workdir}',
427501
f'--binary={target_path}',
428-
'--distill',
502+
'--distill=true',
429503
]
504+
logs.info(f'Running Corpus Distillation with args: {args}')
430505
result = runner.run_and_wait(additional_args=args, timeout=max_time)
431506
max_time -= result.time_executed
432507

@@ -438,17 +513,21 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
438513

439514
# Step 3: Generate corpus files for output_dir.
440515
os.makedirs(output_dir, exist_ok=True)
441-
minimized_corpus_workdir = self._create_temp_dir('minimized_corpus_workdir')
516+
minimized_corpus_workdir = engine_common.create_temp_fuzzing_dir(
517+
'minimized_corpus_workdir')
518+
logs.info(f'Created a temporary minimized corpus '
519+
f'workdir {minimized_corpus_workdir}')
442520
distilled_file = os.path.join(
443521
full_corpus_workdir,
444522
f'distilled-{os.path.basename(target_path)}.000000')
445523
corpus_file = os.path.join(minimized_corpus_workdir, 'corpus.000000')
446524
shutil.copyfile(distilled_file, corpus_file)
447525

448-
args = [
526+
args = minimize_arguments.list() + [
449527
f'--workdir={minimized_corpus_workdir}',
450528
f'--corpus_to_files={output_dir}',
451529
]
530+
logs.info(f'Converting corpus to files with the following args: {args}')
452531
result = runner.run_and_wait(additional_args=args, timeout=max_time)
453532

454533
if result.timed_out or max_time < 0:
@@ -461,11 +540,16 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
461540
# Step 4: Copy reproducers from full_corpus_workdir.
462541
os.makedirs(reproducers_dir, exist_ok=True)
463542
crashes_dir = os.path.join(full_corpus_workdir, 'crashes')
464-
for file in os.listdir(crashes_dir):
465-
crasher_path = os.path.join(crashes_dir, file)
466-
shutil.copy(crasher_path, reproducers_dir)
467-
shutil.rmtree(full_corpus_workdir)
543+
544+
if os.path.exists(crashes_dir):
545+
for file in os.listdir(crashes_dir):
546+
crasher_path = os.path.join(crashes_dir, file)
547+
shutil.copy(crasher_path, reproducers_dir)
548+
468549
shutil.rmtree(minimized_corpus_workdir)
550+
if not workdir:
551+
# Only remove this directory if it was created in this method.
552+
shutil.rmtree(full_corpus_workdir)
469553

470554
return engine.ReproduceResult(result.command, result.return_code,
471555
result.time_executed, result.output)
@@ -507,7 +591,7 @@ def minimize_testcase(self, target_path, arguments, input_path, output_path,
507591
TimeoutError: If the testcase minimization exceeds max_time.
508592
"""
509593
runner = _get_runner(target_path)
510-
workdir = self._create_temp_dir('workdir')
594+
workdir = engine_common.create_temp_fuzzing_dir('workdir')
511595
args = [
512596
f'--binary={target_path}',
513597
f'--workdir={workdir}',

src/clusterfuzz/_internal/bot/fuzzers/engine_common.py

+33
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import re
2222
import shlex
2323
import shutil
24+
import string
2425
import sys
2526
import time
2627

@@ -71,6 +72,8 @@
7172
# Maximum number of seconds to run radamsa for.
7273
RADAMSA_TIMEOUT = 3
7374

75+
HEXDIGITS_SET = set(string.hexdigits)
76+
7477

7578
class Generator:
7679
"""Generators we can use."""
@@ -656,3 +659,33 @@ def get_log_header(command, time_executed):
656659
"""Get the log header."""
657660
quoted_command = get_command_quoted(command)
658661
return f'Command: {quoted_command}\nTime ran: {time_executed}\n'
662+
663+
664+
def is_sha1_hash(possible_hash):
  """Returns True if |possible_hash| looks like a valid sha1 hash.

  "Looks like" means exactly 40 characters, every one of them a member of
  HEXDIGITS_SET.
  """
  has_sha1_length = len(possible_hash) == 40
  return has_sha1_length and HEXDIGITS_SET.issuperset(possible_hash)
670+
671+
672+
def move_mergeable_units(merge_directory, corpus_directory):
  """Move new units in |merge_directory| into |corpus_directory|.

  A unit is skipped only when its file name is sha1-like and a file of the
  same name is already listed in |corpus_directory|. Every other unit is moved
  to |corpus_directory| under its own base name.
  """
  existing_unit_names = set(
      map(os.path.basename, shell.get_files_list(corpus_directory)))

  for source_path in shell.get_files_list(merge_directory):
    name = os.path.basename(source_path)
    already_present = name in existing_unit_names and is_sha1_hash(name)
    if not already_present:
      shell.move(source_path, os.path.join(corpus_directory, name))
685+
686+
687+
def create_temp_fuzzing_dir(name):
  """Create a temporary directory for fuzzing.

  The directory is |name| joined onto fuzzer_utils.get_temp_dir(); it is passed
  through recreate_directory() and its path is returned.
  """
  temp_root = fuzzer_utils.get_temp_dir()
  fuzzing_dir = os.path.join(temp_root, name)
  recreate_directory(fuzzing_dir)
  return fuzzing_dir

src/clusterfuzz/_internal/bot/fuzzers/honggfuzz/engine.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from clusterfuzz._internal.base import utils
2222
from clusterfuzz._internal.bot.fuzzers import dictionary_manager
2323
from clusterfuzz._internal.bot.fuzzers import engine_common
24-
from clusterfuzz._internal.bot.fuzzers import utils as fuzzer_utils
2524
from clusterfuzz._internal.metrics import logs
2625
from clusterfuzz._internal.system import environment
2726
from clusterfuzz._internal.system import new_process
@@ -219,12 +218,6 @@ def reproduce(self, target_path, input_path, arguments, max_time): # pylint: di
219218
return engine.ReproduceResult(result.command, result.return_code,
220219
result.time_executed, result.output)
221220

222-
def _create_temp_corpus_dir(self, name):
223-
"""Creates temporary corpus directory."""
224-
new_corpus_directory = os.path.join(fuzzer_utils.get_temp_dir(), name)
225-
engine_common.recreate_directory(new_corpus_directory)
226-
return new_corpus_directory
227-
228221
def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
229222
reproducers_dir, max_time):
230223
"""Optional (but recommended): run corpus minimization.
@@ -244,7 +237,8 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
244237
del reproducers_dir
245238

246239
runner = _get_runner()
247-
combined_corpus_dir = self._create_temp_corpus_dir('minimize-workdir')
240+
combined_corpus_dir = engine_common.create_temp_fuzzing_dir(
241+
'minimize-workdir')
248242

249243
# Copy all of the seeds into corpus.
250244
idx = 0

0 commit comments

Comments
 (0)