diff --git a/bap-superset-disasm.opam b/bap-superset-disasm.opam new file mode 100644 index 000000000..d227ffb05 --- /dev/null +++ b/bap-superset-disasm.opam @@ -0,0 +1,38 @@ +# This file is generated by dune, edit dune-project instead +opam-version: "2.0" +version: "dev" +synopsis: "A bap disassembler that converges on a minimal superset." +maintainer: ["Ivan Gotovchits "] +authors: ["The BAP Team"] +license: "MIT" +homepage: "https://github.com/BinaryAnalysisPlatform/bap" +bug-reports: "https://github.com/BinaryAnalysisPlatform/bap/issues" +depends: [ + "dune" {>= "3.1"} + "core_kernel" + "bap" + "ppx_inline_test" + "graphlib" + "landmarks" + "bap-future" + "zmq" + "gnuplot" + "odoc" {with-doc} +] +build: [ + ["dune" "subst"] {dev} + [ + "dune" + "build" + "-p" + name + "-j" + jobs + "--promote-install-files=false" + "@install" + "@runtest" {with-test} + "@doc" {with-doc} + ] + ["dune" "install" "-p" name "--create-install-files" name] +] +dev-repo: "git+https://github.com/BinaryAnalysisPlatform/bap.git" diff --git a/dune-project b/dune-project index 52e68ced4..99b2141b2 100644 --- a/dune-project +++ b/dune-project @@ -143,6 +143,24 @@ (ocaml (> 4.08.0)) (stdio (and (>= v0.14) (< v0.16))))) +(package + (name bap-superset-disasm) + (synopsis "A bap disassembler that converges on a minimal superset.") + (sites + (lib plugins) + ) + (depends + core_kernel + bap + ppx_inline_test + graphlib + landmarks + bap-future + zmq + gnuplot) +) + + (package (name bap-primus-support) (synopsis "Provides essential Primus components") diff --git a/lib/bap_superset_disasm/bap_superset_disasm.ml b/lib/bap_superset_disasm/bap_superset_disasm.ml new file mode 100644 index 000000000..ca399c832 --- /dev/null +++ b/lib/bap_superset_disasm/bap_superset_disasm.ml @@ -0,0 +1,9 @@ +module Superset_impl = Superset_impl +module Superset = Superset +module Heuristics = Heuristics +module Invariants = Invariants +module Metrics = Metrics +module Report = Report +module Traverse = Traverse +module Trim = Trim + diff --git a/lib/bap_superset_disasm/dune b/lib/bap_superset_disasm/dune new file mode 100644 index 000000000..f41d3bec3 --- /dev/null +++ b/lib/bap_superset_disasm/dune @@ -0,0 +1,18 @@ +(library + (name bap_superset_disasm) + (public_name bap-superset-disasm) + (wrapped false) + (preprocess (pps ppx_bap)) + (libraries + bap + bap-core-theory + bap-future + bap-knowledge + graphlib + ppx_inline_test + landmarks + zmq + gnuplot + ) +) + diff --git a/lib/bap_superset_disasm/fixpoint.ml b/lib/bap_superset_disasm/fixpoint.ml new file mode 100644 index 000000000..2d33a46d7 --- /dev/null +++ b/lib/bap_superset_disasm/fixpoint.ml @@ -0,0 +1,27 @@ +open Core +open Bap.Std + +let iterate rounds f superset = + let (superset) = f superset in + let rec do_analysis round superset = + if round = rounds then superset else + let (superset) = f superset in + do_analysis (round+1) superset in + do_analysis 1 superset + +let protect superset f = + let visited = Addr.Hash_set.create () in + let callsites = Heuristics.get_callsites ~threshold:0 superset in + let superset = Heuristics.tag_callsites visited ~callsites superset in + let superset = f superset in + Superset.Core.clear_each superset visited; + Trim.run superset + +let converge superset heuristics feature_pmap = + let superset = Trim.run superset in + let cache = Addr.Hash_set.create () in + List.iter Map.(keys feature_pmap) ~f:(fun addr -> + Traverse.mark_descendent_bodies_at superset ~visited:cache addr + ); + superset + diff --git a/lib/bap_superset_disasm/fixpoint.mli b/lib/bap_superset_disasm/fixpoint.mli new file mode 100644 index 000000000..6113d21c1 --- /dev/null +++ b/lib/bap_superset_disasm/fixpoint.mli @@ -0,0 +1,6 @@ +open Bap.Std + + +val iterate : int -> ('a -> 'a) -> 'a -> 'a +val protect : Superset_impl.t -> (Superset_impl.t -> Superset_impl.t) -> Superset_impl.t +val converge : Superset_impl.t -> 'a -> 'b Addr.Map.t -> Superset_impl.t diff --git a/lib/bap_superset_disasm/grammar.ml b/lib/bap_superset_disasm/grammar.ml new file mode 100644 index 000000000..4ce8511d6 --- /dev/null +++ b/lib/bap_superset_disasm/grammar.ml @@ -0,0 +1,118 @@ +open Core +open Bap.Std + +(** The objective here is to tag grammar structures while traversing + topologically in such a manner that we can converge the + probability of recognizing an intended sequence by the + compiler. After we've hit some recognition threshold, we begin + traversing forward from some activation point whereby we trim + occlusive instructions. To recognize grammars, we have several + means: one, loops are strongly connected components, and if + sequences must branch at some point only to reify at a common + point, expressing a path by which they can finally rejoin. *) +let tag_by_traversal ?(threshold=8) superset = + let visited = Addr.Hash_set.create () in + (*let callsites = Superset.get_callsites ~threshold:6 superset in + let superset = tag_callsites visited ~callsites superset in + let superset = Invariants.tag_layer_violations superset in + let superset = Invariants.tag_branch_violations superset in*) + let entries = Superset.entries_of_isg superset in + let branches = Superset.get_branches superset in + (*let branches = identify_branches superset in*) + (*let branches = linear_branch_sweep superset entries in*) + let cur_total = ref 0 in + let positives = ref [] in + let entry = ref None in + let tps = Addr.Hash_set.create () in + (* In the case that our current starting point, entry, is none, set *) + (* it to being the address of the lambda parameter, addr. Then, set *) + (* the current total number of recognized grammar items to zero, *) + (* as well as the positives since we're starting over *) + let pre addr = + if Option.is_none !entry then ( + entry := Some(addr); + cur_total := 0; + positives := []; + ); + if Hash_set.mem branches addr then ( + cur_total := !cur_total + 1; + positives := addr :: !positives; + if !cur_total >= threshold then ( + let open Option in + ignore (List.nth !positives threshold >>| + (fun convergent_point -> + Hash_set.add tps convergent_point)); + ) + ) in + let post addr = + entry := Option.value_map !entry ~default:!entry + ~f:(fun e -> if Addr.(e = addr) then None else Some(e)); + if Hash_set.mem branches addr then ( + cur_total := !cur_total - 1; + match !positives with + | _ :: remaining -> positives := remaining + | [] -> (); + ) in + Traverse.visit ~visited + ~pre ~post superset entries; + Hash_set.iter tps ~f:(fun tp -> + if not (Hash_set.mem visited tp) then ( + Traverse.with_descendents_at superset tp ~pre:(fun tp -> + let mark_bad addr = + if Superset.ISG.mem_vertex superset addr then ( + Superset.Core.mark_bad superset addr + ) in + Superset.Occlusion.with_data_of_insn superset tp ~f:mark_bad; + Hash_set.add visited tp; + ) ; + ) + ); + Hash_set.iter visited + ~f:(fun tp -> Superset.Core.clear_bad superset tp); + superset + + +let parents_of_insns superset component = + Set.fold component ~init:Addr.Set.empty ~f:(fun potential_parents addr -> + List.fold (Superset.ISG.ancestors superset addr) + ~init:potential_parents + ~f:(fun potential_parents ancestor -> + if not Set.(mem component ancestor) then + Set.add potential_parents ancestor + else potential_parents + ) + ) + +let addrs_of_loops loops = + List.fold_left loops ~init:Addr.Set.empty + ~f:(fun keep loop -> + Addr.Set.(union keep (of_list loop)) + ) + +let filter_loops ?(min_size=20) loops = + let loops = + List.filter loops ~f:(fun l -> List.length l > min_size) in + addrs_of_loops loops + +let addrs_of_filtered_loops ?(min_size=20) superset = + filter_loops ~min_size @@ Superset.ISG.raw_loops superset + +(** In the body of a loop, instructions fall through eventually to + themselves, which amounts to effectively a trigger of an + invariant. But the level at which invariants operate is too fine + grained to see the consequence propagated from conflicts that are + potentially in loops that are many instructions long. This + function cleanses the bodies of instructions that occur in loops + of a minimum size. *) +let tag_loop_contradictions ?(min_size=20) superset = + let keep = addrs_of_filtered_loops ~min_size superset in + (* Here we have to be careful; we only want to find instructions + that occur within a loop that produce a self-contradiction *) + let parents = parents_of_insns superset keep in + let to_remove = + Superset.Occlusion.conflicts_within_insns superset keep in + let to_remove = Set.inter to_remove parents in + let to_remove = Set.diff to_remove keep in + Set.iter to_remove ~f:(Superset.Core.mark_bad superset); + superset + diff --git a/lib/bap_superset_disasm/grammar.mli b/lib/bap_superset_disasm/grammar.mli new file mode 100644 index 000000000..f6cceeedd --- /dev/null +++ b/lib/bap_superset_disasm/grammar.mli @@ -0,0 +1,5 @@ +open Bap.Std + +val tag_loop_contradictions : ?min_size:int -> Superset_impl.t -> Superset_impl.t +val tag_by_traversal : ?threshold:int -> Superset_impl.t -> Superset_impl.t + diff --git a/lib/bap_superset_disasm/heuristics.ml b/lib/bap_superset_disasm/heuristics.ml new file mode 100644 index 000000000..b68c1aaa4 --- /dev/null +++ b/lib/bap_superset_disasm/heuristics.ml @@ -0,0 +1,236 @@ +open Core +open Bap.Std + +let defaults = [ + "ImgEntry"; + "InterpretationDepthOne"; + "Callsites3"; + "FixpointGrammar"; +] +let defaults = List.rev defaults + +let transform = Hash_set.fold ~init:Addr.Set.empty ~f:Set.add + +(** A callsite is a location which is shared as the target of a call + by several other locations in the binary. *) +let get_callsites ?(threshold=6) superset = + let callsites = Addr.Hash_set.create () in + Superset.ISG.iter_vertex superset + (fun v -> + let callers = Superset.ISG.ancestors superset v in + let num_callers = + List.fold callers ~init:0 ~f:(fun total caller -> + if not (Superset.is_fall_through superset caller v) then + total + 1 + else total) in + if num_callers > threshold then ( + Hash_set.add callsites v; + ) + ) ; + callsites + +(** Adds to the set visited the set of reachable descendents of a + callsite of a given sufficient threshold number of external + callers *) +let tag_callsites visited ?callsites superset = + let callsites = Option.value callsites + ~default:(get_callsites superset) in + Hash_set.iter callsites ~f:(fun callsite -> + Traverse.with_descendents_at ~visited + ?post:None ?pre:None superset callsite; + ); + superset + +let find_free_insns superset = + let mem = Superset.Core.mem superset in + let conflict_free_addrs = + Superset.Core.fold superset ~init:([]) + ~f:(fun ~key ~data free_insns -> + let (addr,(memory,_)) = key, data in + let len = Memory.length memory in + let conflicts = Superset.Occlusion.range_seq_of_conflicts + ~mem addr len in + let no_conflicts = Seq.is_empty conflicts in + if no_conflicts then + addr :: free_insns + else ( + free_insns + ) + ) in + let conflict_free_addrs = Addr.Set.of_list conflict_free_addrs in + Set.diff conflict_free_addrs @@ + Superset.Occlusion.find_all_conflicts superset + +let extract_trim_interpretation_depth superset = + let free_insns = find_free_insns superset in + let visited = Addr.Hash_set.create () in + let datas = Addr.Hash_set.create () in + Set.iter free_insns ~f:(fun c -> + if not Hash_set.(mem visited c) then + if Superset.Core.mem superset c then ( + Traverse.mark_descendent_bodies_at + ~visited ~datas superset c + ) + ); + Superset.Core.clear_each superset visited; + Set.iter free_insns ~f:(Superset.Core.clear_bad superset); + superset + +(** Interpretation depth is a heuristics that recognizes that true *) +(** positive compiler intended instructions are the only kind of *) +(** instruction that can occur where the body of the instruction is *) +(** completely free of occlusion, aside from random false positives *) +(** that happen to be free. These false positives should not affect *) +(** the convergence on the true positive set. It is called *) +(** interpretation depth one because, without noise in the body, *) +(** there is a single instruction that can be interpreted. *) +let extract_trim_protected_interpretation_depth superset = + let protection = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:0 superset in + let superset = tag_callsites protection ~callsites superset in + Superset.Core.clear_all_bad superset; + let superset = extract_trim_interpretation_depth superset in + Superset.Core.clear_each superset protection; + superset + +let fixpoint_descendants superset extractf depth = + let rec fix_descendants cur_features d = + if d >= depth then + cur_features + else + let visited = Addr.Hash_set.create () in + let subset_features = Addr.Hash_set.create () in + Hash_set.iter cur_features ~f:(fun cur -> + if not Hash_set.(mem visited cur) then + Traverse.with_descendents_at superset + ~pre:(fun v -> + if Hash_set.(mem cur_features v) + && not Addr.(cur = v) then + Hash_set.add subset_features v + ) ~visited cur + else Hash_set.add subset_features cur + ); + fix_descendants subset_features (d+1) + in + let cur_features = extractf superset in + fix_descendants cur_features 0 + +let fixpoint_map superset feature_pmap = + let visited = Addr.Hash_set.create () in + let entries = Superset.frond_of_isg superset in + Hash_set.fold ~init:feature_pmap entries ~f:(fun feature_pmap cur -> + if not Hash_set.(mem visited cur) then + let prev = ref [] in + let feature_pmap = ref feature_pmap in + Traverse.with_descendents_at ~pre:(fun v -> + match Map.find !feature_pmap v with + | None -> () + | Some(p) -> + prev := List.append p !prev; + feature_pmap := Map.set !feature_pmap ~key:v ~data:!prev; + ) ~visited superset cur; + !feature_pmap + else feature_pmap + ) + +let fixpoint_grammar superset depth = + let extractf superset = + Superset.get_branches superset in + fixpoint_descendants superset extractf depth + +let collect_descendants superset ?visited ?datas targets = + let visited = Option.value visited ~default:(Addr.Hash_set.create ()) in + let datas = Option.value datas ~default:(Addr.Hash_set.create ()) in + Hash_set.iter targets ~f:(fun v -> + if not Hash_set.(mem visited v) then + Traverse.mark_descendent_bodies_at ~visited ~datas superset v + ) + +let extract_img_entry superset = + let e = Addr.Set.empty in + match Superset.Inspection.get_main_entry superset with + | Some mentry -> Set.add e mentry + | None -> e + +let extract_trim_callsites superset = + let visited = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:2 superset in + let protection = get_callsites ~threshold:0 superset in + collect_descendants superset ~visited protection; + Superset.Core.clear_all_bad superset; + let superset = tag_callsites visited ~callsites superset in + Superset.Core.clear_each superset visited; + superset + +let extract_trim_entry superset = + let imgentry = extract_img_entry superset in + Set.iter imgentry ~f:Traverse.(mark_descendent_bodies_at superset); + superset + +type extractor = (Superset.t -> Addr.Set.t) +type ('b) mapextractor = (Superset.t -> 'b Addr.Map.t) +type setfilter = (Superset.t -> Addr.Set.t -> Addr.Set.t) +type ('b) mapfilter = (Superset.t -> 'b Addr.Map.t -> 'b Addr.Map.t) +type setexfilt = extractor * setfilter +type ('a, 'b) mapexfilt = ('b) mapextractor * ('b) mapfilter +let unfiltered _ = Fn.id + +let _exfiltset = [ + ("FixpointGrammar", + ((fun x -> transform (fixpoint_grammar x 0)), unfiltered)); + ("Callsites3", + ((fun x -> transform (get_callsites + ~threshold:6 x)), unfiltered)); + ("ImgEntry", (extract_img_entry, unfiltered)); +] +let exfiltset :(setexfilt) String.Map.t + = List.fold ~init:String.Map.empty _exfiltset + ~f:(fun exfiltset (name, f) -> + String.Map.set exfiltset ~key:name ~data:f + ) + +let featureflist = + [("Callsites3", extract_trim_callsites); + ("ImgEntry",extract_trim_entry); + ("InterpretationDepthOne" , + extract_trim_protected_interpretation_depth); + ] +let featuremap = List.fold featureflist ~init:String.Map.empty + ~f:(fun featuremap (name, f) -> + Map.set featuremap ~key:name ~data:f + ) + +let with_featureset ~f ~init featureset superset = + let superset = List.fold ~init featureset ~f:(fun accu fname -> + match Map.(find featuremap fname) with + | None -> accu + | Some (feature) -> + f fname feature accu + ) in + superset + +let fdists = String.Map.empty +let fdists = String.Map.set fdists ~key:"FixpointGrammar" ~data:1 + +let make_featurepmap featureset superset = + List.fold ~f:(fun (feature_pmap) feature -> + let p = Map.find fdists feature in + let p = Option.value p ~default:2 in + match Map.(find exfiltset feature) with + | None -> feature_pmap + | Some (extract,filter) -> + let fset = extract superset in + Set.fold fset ~init:feature_pmap + ~f:(fun feature_pmap x -> + Map.update feature_pmap x ~f:(function + | Some l -> (p, x, feature) :: l + | None -> [(p, x, feature)] + ) + ) + ) ~init:Addr.Map.empty featureset + +let with_featurepmap featureset superset ~f = + let feature_pmap = make_featurepmap featureset superset in + let feature_pmap = fixpoint_map superset feature_pmap in + f feature_pmap featureset superset + diff --git a/lib/bap_superset_disasm/heuristics.mli b/lib/bap_superset_disasm/heuristics.mli new file mode 100644 index 000000000..4d6863ddf --- /dev/null +++ b/lib/bap_superset_disasm/heuristics.mli @@ -0,0 +1,7 @@ +open Bap.Std + +val get_callsites : ?threshold:(int) -> Superset_impl.t -> Addr.Hash_set.t +val tag_callsites : Addr.Hash_set.t -> ?callsites:Addr.Hash_set.t -> Superset_impl.t -> Superset_impl.t +val with_featureset : f:(string -> (Superset_impl.t -> Superset_impl.t) -> 'a -> 'a) -> init:'a -> string list -> 'b -> 'a +val with_featurepmap : string list -> Superset_impl.t -> f:((int * word * string) list Addr.Map.t -> string list -> Superset_impl.t -> 'a) -> 'a +val defaults : string list diff --git a/lib/bap_superset_disasm/invariants.ml b/lib/bap_superset_disasm/invariants.ml new file mode 100644 index 000000000..8b893ff20 --- /dev/null +++ b/lib/bap_superset_disasm/invariants.ml @@ -0,0 +1,103 @@ +open Bap.Std +open Core + +module Dis = Disasm_expert.Basic + +(** Build the static successors of the current instruction, and tag + the superset with the function f in a micro-context. *) +let tag_with ~f (mem, insn) superset = + let targets = Superset.Inspection.static_successors superset mem insn in + f superset mem insn targets + +(** Looks for control flow edges to addresses that are not statically + known. *) +let tag_target_not_in_mem superset mem insn targets = + List.iter targets + ~f:(fun (target,_) -> + match target with + | Some(target) -> + if not (Superset.Inspection.contains_addr superset target) then + Superset.Core.mark_bad superset target + | None -> () + ); + superset + +(** If a jmp to NULL occurs, then this is triggered. *) +let tag_target_is_bad superset mem insn targets = + let width = Addr.bitwidth @@ Memory.min_addr mem in + let z = Addr.zero width in + List.iter targets + ~f:(fun (target,_) -> + match target with + | Some(target) -> + if Addr.(target = z) then + Superset.Core.mark_bad superset target + | None -> () + ); + superset + +(** If a jmp specifies the body of the instruction itself, this is + triggered. *) +let tag_target_in_body superset mem insn targets = + let src = Memory.min_addr mem in + List.iter targets + ~f:(fun (target,_) -> + match target with + | Some(target) -> + if (Memory.contains mem target) && + not Addr.(src = target) then + Superset.Core.mark_bad superset src + | None -> () + ); + superset + +(** Applies the tag together with the visitor and smaller functions + under the hood. *) + +(** If the instruction could not be disassembled or lifted, then tag + it. *) +let tag_non_insn superset mem insn targets = + let src = Memory.min_addr mem in + if Option.is_none insn then ( + Superset.Core.mark_bad superset src + ); + superset + +(** Used for the maintenance and construction of the superset. *) +let tag_success superset mem insn targets = + let src = Memory.min_addr mem in + List.fold targets ~init:superset ~f:(fun superset (target,_) -> + match target with + | Some (target) -> + Superset.ISG.link superset target src + | None -> superset) + +let default_tags = [(*"Tag non insn", tag_non_insn;*) + (*"Tag target not in mem", tag_target_not_in_mem;*) + "Tag target is bad", tag_target_is_bad; + "Tag target in body", tag_target_in_body; + (*tag_success;*)] + +let default_funcs = List.map default_tags ~f:snd + +(** Tag an individual instruction of a superset with a list of + invariants. *) +let tag ?invariants = + let invariants = Option.value invariants ~default:default_funcs in + let f superset mem insn targets = + List.fold_left invariants ~init:superset ~f:(fun superset f -> + (f superset mem insn targets)) in + tag_with ~f + + +(** Tag with a list of invariants over an entire superset. *) +let tag_superset ?invariants superset = + let invariants = Option.value invariants ~default:default_funcs in + let f superset mem insn targets = + List.fold ~init:superset invariants + ~f:(fun superset invariant -> + invariant superset mem insn targets) in + Superset.Core.fold ~init:superset superset ~f:(fun ~key ~data superset -> + let mem, insn = data in + tag_with ~f (mem, insn) superset + ) diff --git a/lib/bap_superset_disasm/invariants.mli b/lib/bap_superset_disasm/invariants.mli new file mode 100644 index 000000000..85a7eea13 --- /dev/null +++ b/lib/bap_superset_disasm/invariants.mli @@ -0,0 +1,12 @@ +open Bap.Std + +module Dis = Disasm_expert.Basic + +val tag_success : Superset_impl.t -> mem -> Dis.full_insn option -> Brancher.dests -> Superset_impl.t + +val tag : ?invariants:((Superset_impl.t -> mem -> Dis.full_insn option -> Brancher.dests -> Superset_impl.t) list) -> + Bap.Std.mem * Superset.Dis.full_insn option -> + Superset_impl.t -> Superset_impl.t + +val default_tags : (string * (Superset_impl.t -> mem -> Dis.full_insn option -> Brancher.dests -> Superset_impl.t)) list + diff --git a/lib/bap_superset_disasm/metrics.ml b/lib/bap_superset_disasm/metrics.ml new file mode 100644 index 000000000..b03659cbb --- /dev/null +++ b/lib/bap_superset_disasm/metrics.ml @@ -0,0 +1,308 @@ +open Bap.Std +open Core +open Or_error + +module Linear = Disasm_expert.Linear + +let read arch ic : (string * addr * addr) list = + let sym_of_sexp x = [%of_sexp:string * int64 * int64] x in + let addr_of_int64 x = + let width = Arch.addr_size arch |> Size.in_bits in + Addr.of_int64 ~width x in + List.(Sexp.input_sexps ic >>| sym_of_sexp >>| (fun (s, es, ef) -> + s, addr_of_int64 es, addr_of_int64 ef)) + +let read_addrs ic : addr list = + List.t_of_sexp Addr.t_of_sexp @@ Sexp.input_sexp ic + +let ground_truth_of_unstripped_bin bin : addr seq Or_error.t = + let name = Filename.basename bin in + let tmp = Filename.temp_dir_name ^ "/bw_" ^ name ^ ".symbol" in + let cmd = sprintf "bap-byteweight dump -i symbols %S > %S" + bin tmp in + if Stdlib.Sys.command cmd = 0 + then return (Seq.of_list @@ In_channel.with_file tmp ~f:read_addrs) + else errorf + "failed to fetch symbols from unstripped binary, command `%s' + failed" cmd + +let true_positives_of_ground_truth superset ground_truth = + let true_positives = Addr.Hash_set.create () in + Set.iter ground_truth ~f:(fun addr -> + if Superset.ISG.mem_vertex superset addr then + Traverse.with_descendents_at + ~visited:true_positives + superset addr; + ); + true_positives + +let true_positives superset f = + let function_starts = ground_truth_of_unstripped_bin f |> ok_exn + in + let ground_truth = + Addr.Set.of_list @@ Seq.to_list function_starts in + true_positives_of_ground_truth superset ground_truth + +module Cache = struct + open Bap_knowledge + open Bap_core_theory + + let package = "superset-disasm-metrics" + let sym_label = + KB.Symbol.intern "superset_analysis" Theory.Program.cls + + let bool_t = Knowledge.Domain.optional + ~inspect:sexp_of_bool ~equal:Bool.equal "bool" + let bool_persistent = + Knowledge.Persistent.of_binable + (module struct type t = bool option [@@deriving bin_io] end) + + let int_t = Knowledge.Domain.optional + ~inspect:sexp_of_int ~equal:Int.equal "int" + + let int_persistent = + Knowledge.Persistent.of_binable + (module struct type t = int option [@@deriving bin_io] end) + + let addrs_t = + Knowledge.Domain.optional + ~inspect:Addr.Set.sexp_of_t ~equal:Addr.Set.equal "addr.set" + + let addrs_persistent = + Knowledge.Persistent.of_binable + (module struct type t = Addr.Set.t option [@@deriving bin_io] end) + + let attr ty persistent name desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent ~public:true ~desc + + let string_persistent = + Knowledge.Persistent.of_binable + (module struct type t = string [@@deriving bin_io] end) + + let ground_truth_source = + let open Knowledge.Domain in + attr string string_persistent "ground_truth_source" + "Binary containing debug information" + + let function_entrances = + attr addrs_t addrs_persistent "function_entrances" + "List of compiler intended function entrances" + + let ground_truth = + attr addrs_t addrs_persistent "ground_truth" + "Statically reachable descendents of function entrances" + + let size = + attr int_t int_persistent "code_size" + "Total number of byte that the code memory segments occupy" + + let time = + attr int_t int_persistent "processing_time" + "Time required to process this binary" + + let occlusive_space = + attr int_t int_persistent "occlusive_space" + "Number of addresses are in the bodies (addrs) of ground truth" + + let reduced_occlusion = + attr int_t int_persistent "reduced_occlusion" + "Of the occlusive space, how many are occupied" + + let false_negatives = + attr int_t int_persistent "false_negatives" + "Number of compiler intended instructions missing" + + let false_positives = + attr int_t int_persistent "false_positives" + "Number of compiler intended instructions missing" + + let true_positives = + attr int_t int_persistent "true_positives" + "Number of retained compiler intended instructions" + + let clean_functions = + attr addrs_t addrs_persistent "clean_functions" + "Functions with completely empty occlusive space" + + let occluded_entrances = + attr addrs_t addrs_persistent "occluded_entrances" + "Functions with at least one address that is obstructed" +end + +let set_ground_truth superset = + let open Bap_knowledge in + let open Bap_core_theory in + let open KB.Syntax in + KB.promise Cache.function_entrances (fun label -> + KB.collect Cache.ground_truth_source label >>= fun bin -> + (* list of compiler intended entrances *) + let function_addrs = ground_truth_of_unstripped_bin bin + |> ok_exn in + let function_addrs = + Addr.Set.of_list @@ Seq.to_list function_addrs in + KB.return (Some function_addrs) + ); + + KB.promise Cache.ground_truth (fun label -> + (* List of compiler intended addresses *) + KB.collect Cache.function_entrances label >>= + fun function_addrs -> + match function_addrs with + | None -> KB.return None + | Some function_addrs -> + let visited = Addr.Hash_set.create () in + Set.iter function_addrs ~f:(fun x -> + Traverse.with_descendents_at ~visited superset x + ); + let ground_truth = Hash_set.fold visited ~init: + Addr.Set.empty ~f:Set.add in + KB.return (Some ground_truth) + ) + +let compute_metrics superset = + let open Bap_knowledge in + let open Bap_core_theory in + let open KB.Syntax in + KB.promise Cache.size (fun label -> + KB.return (Some (Superset.Inspection.total_bytes superset)) + ); + + KB.promise Cache.occlusive_space (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | None -> KB.return None + | Some ground_truth -> + KB.return @@ + Some (Set.fold ground_truth ~init:0 + ~f:(fun occ addr -> + occ + (Superset.Inspection.len_at superset addr) + )) + ); + + (* per feature metrics *) + KB.promise Cache.reduced_occlusion (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | None -> KB.return None + | Some ground_truth -> + KB.return @@ + Some (Set.fold ground_truth ~init:0 + ~f:(fun ro addr -> + let conflicts = + Superset.Occlusion.conflict_seq_at + superset addr in + ro + (Seq.length conflicts) + )) + + ); + + KB.promise Cache.clean_functions (fun label -> + KB.collect Cache.function_entrances label >>= + fun function_entrances -> + match function_entrances with + | None -> KB.return None + | Some (function_entrances) -> + let ro_at x = + let ro = ref false in + let pre x = + let c = Superset.Occlusion.conflict_seq_at superset x in + ro := Seq.exists c ~f:(fun _ -> true); in + Traverse.with_descendents_at superset ~pre x; !ro in + let init = Addr.Set.empty in + KB.return @@ + (Some + (Set.fold function_entrances ~init ~f:(fun clean x -> + if ro_at x then clean else Set.add clean x + )) + ) + ); + + KB.promise Cache.occluded_entrances (fun label -> + KB.collect Cache.function_entrances label >>= function + | None -> KB.return None + | Some funcs -> + let occluded_starts = + Set.fold funcs ~init:Addr.Set.empty ~f:(fun occluded start -> + let behind = Addr.(start -- 20) in + let range = Superset.Core.seq_of_addr_range behind 21 in + let range = Seq.filter range ~f:(fun v -> not Addr.(v = start)) in + let addr = + Seq.find range ~f:(fun current -> + Seq.exists + (Superset.Occlusion.conflict_seq_at superset current) + ~f:(fun conflict -> Addr.((not (conflict = current)) + && conflict = start) + )) in + Option.value_map addr ~default:occluded ~f:Set.(add occluded) + ) in + KB.return (Some occluded_starts) + ); + + KB.promise Cache.true_positives (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | None -> KB.return None + | Some ground_truth -> + KB.return + (Some (Set.fold ground_truth ~init:0 ~f:(fun tp_cnt x -> + if Superset.Core.mem superset x then + tp_cnt + 1 + else tp_cnt + )) + ) + ); + + KB.promise Cache.false_negatives (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | Some ground_truth -> + let fn_cnt = + Set.fold ground_truth ~init:0 ~f:(fun cnt x -> + if not (Superset.Core.mem superset x) then + cnt + 1 + else cnt + ) in + KB.return @@ Some fn_cnt + | None -> KB.return None + ); + + KB.promise Cache.false_positives (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | Some ground_truth -> + let false_positives = + Superset.Core.fold superset ~init:0 + ~f:(fun ~key ~data c -> + if not Set.(mem ground_truth key) then c+1 + else c) in + KB.return (Some false_positives) + | None -> KB.return None + ) + +type t = { + size : int option; + time : int option; + occ : int option; + occ_space : int option; + fe : int option; + clean : int option; + fns : int option; + fps : int option; + tps : int option; + } [@@deriving sexp] + +let get_summary () = { + size = Toplevel.eval Cache.size Cache.sym_label; + time = Toplevel.eval Cache.time Cache.sym_label; + fe = Option.map ~f:Set.length @@ + Toplevel.eval Cache.function_entrances Cache.sym_label; + occ_space = Toplevel.eval Cache.occlusive_space Cache.sym_label; + occ = Toplevel.eval Cache.reduced_occlusion Cache.sym_label; + fns = Toplevel.eval Cache.false_negatives Cache.sym_label; + fps = Toplevel.eval Cache.false_positives Cache.sym_label; + tps = Toplevel.eval Cache.true_positives Cache.sym_label; + clean = Option.map ~f:Set.length @@ + Toplevel.eval Cache.clean_functions Cache.sym_label; + } diff --git a/lib/bap_superset_disasm/metrics.mli b/lib/bap_superset_disasm/metrics.mli new file mode 100644 index 000000000..dedc27b14 --- /dev/null +++ b/lib/bap_superset_disasm/metrics.mli @@ -0,0 +1,52 @@ +open Core +open Bap.Std + +module Cache : sig + open Bap_knowledge + open Bap_core_theory + open Theory + + val sym_label : program Knowledge.obj KB.t + val ground_truth_source : (program, string) Knowledge.slot + + val function_entrances : (program, Addr.Set.t option) Knowledge.slot + + val ground_truth : (program, Addr.Set.t option) Knowledge.slot + + val size : (program, int option) Knowledge.slot + + val time : (program, int option) Knowledge.slot + + val occlusive_space : (program, int option) Knowledge.slot + + val reduced_occlusion : (program, int option) Knowledge.slot + + val false_negatives : (program, int option) Knowledge.slot + + val false_positives : (program, int option) Knowledge.slot + + val true_positives : (program, int option) Knowledge.slot + + val clean_functions : (program, Addr.Set.t option) Knowledge.slot + +end + +val set_ground_truth : Superset.t -> unit +val compute_metrics : Superset.t -> unit +val true_positives : Superset.t -> string -> Addr.Hash_set.t +type t = { + size : int option; + time : int option; + occ : int option; + occ_space : int option; + fe : int option; + clean : int option; + fns : int option; + fps : int option; + tps : int option; + } + +val t_of_sexp : Sexp.t -> t +val sexp_of_t : t -> Sexp.t + +val get_summary : unit -> t diff --git a/lib/bap_superset_disasm/report.ml b/lib/bap_superset_disasm/report.ml new file mode 100644 index 000000000..8613b7a92 --- /dev/null +++ b/lib/bap_superset_disasm/report.ml @@ -0,0 +1,103 @@ +open Core +open Bap.Std + +let () = Random.self_init () + +module Distribution = struct + type t = { + (* The ith elem says the number (int) of insns with i + accumulated total heurism instances *) + fp_at : int array; + tp_at : int array; + (* The ith list elem (x, y list) says the number (x) of true + positives that have i many heurisms in their favor and how + many false positives (y) they occlude with j number of heurisms. *) + fp_competitors_at : (int * int array) array; + } [@@deriving sexp,bin_io] + + let empty threshold = + let len = threshold + 1 in + { + fp_at = Array.create ~len 0; + tp_at = Array.create ~len 0; + fp_competitors_at = Array.create ~len (0,Array.create ~len 0); + } + + let add dist tps inst = + let (p,addr,heurism) = inst in + let p = min p @@ ((Array.length dist.fp_at) - 1) in + let insert c = + let cur = Array.get c p in + Array.set c p (cur+1) in + let _ = + if Set.mem tps addr then + insert dist.tp_at + else insert dist.fp_at in + dist +end + +let equal _ _ = false + +module Cache = struct + open Bap_knowledge + open Bap_core_theory + module SMap = struct + type t = Distribution.t String.Map.t [@@deriving bin_io,sexp] + let compare = String.Map.compare String.compare + let empty = String.Map.empty + end + + let distributions = + let package = "superset-disasm-reports" in + let map_persistent = + Knowledge.Persistent.of_binable (module SMap) in + let attr ty persistent name desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent + ~public:true + ~desc in + let open Knowledge.Domain in + let smap_domain = + mapping (module String) ~equal:(equal) "string_map" + in + attr smap_domain map_persistent "reports" + "The reports for a given analysis" + +end + +let collect_distributions superset threshold pmap = + let open Bap_knowledge in + let open Bap_core_theory in + KB.promise Cache.distributions (fun o -> + let open KB.Syntax in + KB.collect Metrics.Cache.ground_truth o >>= fun tps -> + let default = Cache.SMap.empty in + KB.return @@ Option.value_map ~default tps ~f:(fun tps -> + let init = String.Map.empty in + let pmap = + Map.mapi pmap ~f:(fun ~key ~data -> + List.fold data ~init:String.Map.empty + ~f:(fun dist (p,addr,ftname) -> + String.Map.update dist ftname ~f:(fun total -> + (Option.value total ~default:0) + p + ) + ) + ) in + let reports = + List.fold (Addr.Map.to_alist pmap) ~init ~f:(fun reports (p_at,fttot) -> + String.Map.fold fttot ~init:reports + ~f:(fun ~key ~data reports -> + let name = key in + let p = data in + String.Map.update reports name ~f:(fun dist -> + let dist = + Option.value dist + ~default:(Distribution.empty threshold) + in Distribution.add dist tps (p,p_at,name) + ) + ) + ) in + reports + ) + ) diff --git a/lib/bap_superset_disasm/report.mli b/lib/bap_superset_disasm/report.mli new file mode 100644 index 000000000..cad2a7726 --- /dev/null +++ b/lib/bap_superset_disasm/report.mli @@ -0,0 +1,3 @@ +open Bap.Std + +val collect_distributions : Superset_impl.t -> int -> (int * word * string) list Addr.Map.t -> unit diff --git a/lib/bap_superset_disasm/superset.ml b/lib/bap_superset_disasm/superset.ml new file mode 100644 index 000000000..206fbd9f2 --- /dev/null +++ b/lib/bap_superset_disasm/superset.ml @@ -0,0 +1,653 @@ +open Bap.Std +open Regular.Std +open Core +open Or_error +open Graphlib.Std +open Bap_knowledge +open Bap_core_theory + +module Dis = Disasm_expert.Basic +open Superset_impl +type elem = Superset_impl.elem +type t = Superset_impl.t + +(* private accessors *) +let get_graph superset = superset.insn_risg +let get_insns superset = superset.insns + +(* private modifiers *) +let add_to_map superset mem insn = + let insns = get_insns superset in + let addr = (Memory.min_addr mem) in + Addr.Table.set insns ~key:addr ~data:(mem, insn); + superset + +module OG = Graphlib.To_ocamlgraph(G) + +let add_to_graph superset mem insn = + let addr = Memory.min_addr mem in + let insn_risg = G.Node.insert addr superset.insn_risg in + { superset with insn_risg } + +module Cache = struct + let package = "superset-disasm" + let sym_label = + KB.Symbol.intern "superset" Theory.Program.cls + + let superset_graph_t = + let sexp_of_edge (s,d) = + Tuple2.sexp_of_t Addr.sexp_of_t Addr.sexp_of_t (s,d) in + let equal (s1,d1) (s2,d2) = Addr.equal s1 s2 && Addr.equal d1 d2 in + let inspect = List.sexp_of_t sexp_of_edge in + Knowledge.Domain.optional + ~inspect ~equal:(List.equal equal) "edges" + + let superset_graph_persistent = + Knowledge.Persistent.of_binable + (module struct type t = (addr * addr) list option [@@deriving bin_io] end) + + let superset_graph = + let attr ty persistent name desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent + ~public:true + ~desc in + attr superset_graph_t superset_graph_persistent "superset_graph" + "Graph, including all edges and single nodes." +end + +module Core = struct + let add superset mem insn = + let superset = add_to_graph superset mem insn in + let superset = add_to_map superset mem insn in + superset + + let remove superset addr = + let insn_risg = OG.remove_vertex superset.insn_risg addr in + Addr.Table.remove superset.insns addr; + Addr.Table.remove superset.lifted addr; + { superset with insn_risg; } + + let empty arch = + let brancher = Brancher.of_bil arch in + let module Target = (val target_of_arch arch) in + let lifter = Target.lift in + { + arch; + filename = None; + main_entry = None; + sections = Memmap.empty; + brancher; + endianness= None; + lifter; + insns = Addr.Table.create (); + lifted = Addr.Table.create (); + insn_risg = Graphlib.create (module G) (); + bad = Addr.Hash_set.create (); + keep = Addr.Hash_set.create (); + } + + let lookup superset addr = + Addr.Table.find superset.insns addr + + let mem superset addr = + OG.mem_vertex superset.insn_risg addr + + let fold superset = + Addr.Table.fold superset.insns + + let clear_bad superset addr = + Hash_set.remove superset.bad addr + + let clear_each superset s = + Hash_set.iter s ~f:(fun v -> + clear_bad superset v + ) + + let clear_all_bad superset = + Hash_set.clear superset.bad + + let mark_bad superset addr = + if OG.mem_vertex superset.insn_risg addr then + Hash_set.add superset.bad addr + + let copy_bad superset = + Hash_set.copy superset.bad + + let next_chunk mem ~addr = + let next_addr = Addr.succ addr in + Memory.view ~from:next_addr mem + + let seq_of_addr_range addr len = + let open Seq.Generator in + let rec gen_next_addr cur_addr = + if Addr.(cur_addr >= (addr ++ len)) then + return () + else + yield cur_addr >>= fun () -> + let next_addr = Addr.succ cur_addr in + gen_next_addr next_addr + in run (gen_next_addr Addr.(succ addr)) + + (** Builds a sequence disassembly sequence at every byte offset of + the memory mem. *) + let run_seq dis mem = + let start_addr = Memory.min_addr mem in + let len = Memory.length mem in + let addrs = seq_of_addr_range start_addr len in + Seq.filter_map addrs ~f:(fun addr -> + let m = Memory.view ~from:addr mem in + match m with + | Error _ -> None + | Ok m -> ( + match Dis.insn_of_mem dis m with + | Ok (m, insn, _) -> Some (m, insn) + | Error _ -> Some (m, None) + ) + ) + + (** Fold over the memory at every byte offset with function f *) + let run dis ~accu ~f mem = + Seq.fold ~init:accu ~f:(fun x y -> f y x) (run_seq dis mem) + + (** This builds the disasm type, and runs it on the memory. *) + let disasm ?(backend="llvm") ~addrs ~accu ~f arch memry = + Or_error.map + (Dis.with_disasm ~backend (Arch.to_string arch) + ~f:(fun d -> + let rec next state (addrs,accu) = + match Seq.next addrs with + | None -> Dis.stop state (addrs,accu) + | Some(addr,addrs) -> + match Memory.view ~from:addr memry with + | Error _ -> next state (addrs,accu) + | Ok jtgt -> Dis.jump state jtgt (addrs,accu) in + let invalid state m (addrs,accu) = + let accu = f (m, None) accu in + next state (addrs,accu) in + let hit state m insn (addrs,accu) = + let accu = f (m, (Some insn)) accu in + next state (addrs,accu) in + Ok(Dis.run ~backlog:1 ~stop_on:[`Valid] ~invalid + ~hit d ~init:(addrs,accu) ~return:Fn.id memry) + )) ~f:(fun (_,accu) -> accu) + + let disasm_all ?(backend="llvm") ~accu ~f arch memry = + let addrs = seq_of_addr_range + (Memory.min_addr memry) (Memory.length memry) in + disasm ~backend ~addrs ~accu ~f arch memry + + let lift_at superset addr = + match Addr.Table.find superset.lifted addr with + | Some (bil) -> Some (bil) + | None -> ( + match lookup superset addr with + | Some (mem, insn) -> ( + let bil = + Option.value_map insn ~default:[] ~f:(fun insn -> + match (superset.lifter mem insn) with + | Ok bil -> + Addr.Table.set superset.lifted ~key:addr ~data:bil; + bil + | _ -> [] + ) in + Some (bil) + ) + | None -> None + ) + + let lift_insn superset (mem,insn) = + let addr = Memory.(min_addr mem) in + lift_at superset addr + + (** Perform superset disassembly on mem and add the results. *) + let update_with_mem ?backend ?addrs ?f superset mem = + let f = Option.value f ~default:(fun (m, i) a -> a) in + let f (mem, insn) superset = + let superset = add superset mem insn in + f (mem, insn) superset in + let default = seq_of_addr_range + (Memory.min_addr mem) (Memory.length mem) in + let addrs = Option.value addrs ~default in + disasm ?backend ~accu:superset ~f ~addrs superset.arch mem |> ok_exn + + let is_unbalanced superset = + Addr.Table.length superset.insns <> OG.nb_vertex superset.insn_risg + + (** This function is required by the differences between propagating + data removal maximally and maintaining associations between the + address and the instruction decoded thereof. It strives to make the + results of trimming and tagging consistent between the map and + graph, in order to keep a consistent view. But this may not produce + the one to one needed, since there are scenarios in which there can + be mismatch in some scenarios that are possible in binaries. Using + the tagging and other library together with fixpoint is sufficient + to rule that out, however. *) + let rebalance superset = + let insns = get_insns superset in + let insn_risg = get_graph superset in + if is_unbalanced superset then + OG.iter_vertex (fun vert -> + if not Addr.Table.(mem insns vert) + && Memmap.contains superset.sections vert then ( + mark_bad superset vert; + ) + ) insn_risg; + let insns = + if Hash_set.length superset.bad > 0 || + is_unbalanced superset then + Addr.Table.filteri ~f:(fun ~key ~data -> + let vert = key in + OG.(mem_vertex insn_risg vert) + ) insns + else superset.insns in + let lifted = + if is_unbalanced superset then + let f ~key ~data = OG.mem_vertex insn_risg key in + Addr.Table.filteri superset.lifted ~f + else superset.lifted in + { superset with insn_risg; insns; lifted; } + +end + + +module ISG = struct + let ancestors superset = + OG.succ superset.insn_risg + + let descendants superset = + OG.pred superset.insn_risg + + let iter_vertex superset f = + OG.iter_vertex f superset.insn_risg + + let fold_vertex superset f = + OG.fold_vertex f superset.insn_risg + + let fold_edges superset f = + let insn_risg = superset.insn_risg in + OG.fold_edges f insn_risg + + let to_list superset = + let init = [] in + let f s d acc = (s,d) :: acc in + let l = fold_edges superset f init in + let f v acc = + let g = superset.insn_risg in + if OG.in_degree g v = 0 && OG.out_degree g v = 0 then + (v,v) :: acc + else acc in + fold_vertex superset f l + + let link superset v1 v2 = + let e = G.Edge.create v1 v2 () in + let insn_risg = G.Edge.insert e superset.insn_risg in + { superset with insn_risg } + + let unlink superset v1 v2 = + let insn_risg = OG.remove_edge superset.insn_risg v1 v2 in + { superset with insn_risg } + + let mem_vertex superset = OG.mem_vertex superset.insn_risg + + let check_connected superset e1 e2 = + match OG.find_all_edges + superset.insn_risg e1 e2 with + | [] -> false | _ -> true + + let raw_loops superset = + StrongComponents.scc_list superset.insn_risg + + let dfs_fold ?visited superset = + fold_component ?visited superset.insn_risg + + let dfs ?(terminator=(fun _ -> true)) + ?visited ?(pre=fun _ -> ()) ?(post=fun _ -> ()) explore superset v = + let visited = Option.value visited + ~default:(Addr.Hash_set.create ()) in + let rec visit v = + Hash_set.add visited v; + pre v; + List.iter (explore superset v) + ~f:(fun w -> + if (not (Hash_set.mem visited w)) && (terminator w) then + visit w) ; + post v + in if Core.mem superset v then visit v + + let fixpoint ?steps ?start ?rev ?step superset = + Graphlib.fixpoint ?steps ?start ?rev ?step + (module G) superset.insn_risg + + let print_dot ?colorings superset = + (*if not (colorings = String.Map.empty) then*) + let colorings = Option.value colorings ~default:String.Map.empty in + let fout = + Out_channel.create @@ Option.value_exn superset.filename + ^ ".dot" in + let superset_isg = Oper.mirror superset.insn_risg in + let insns = superset.insns in + let module Layout = + Make(struct + let instance = (superset_isg, colorings, insns) + end) in + Layout.Dot.output_graph fout (superset_isg, colorings, insns) + + let format_isg ?format superset = + let format = Option.value format ~default:Format.std_formatter in + let format = + fold_edges superset (fun src dst format -> + Format.pp_print_string format @@ Addr.to_string src; + Format.pp_print_string format @@ "-"; + Format.pp_print_string format @@ Addr.to_string dst; + Format.pp_print_string format @@ "\n"; + format + ) format in + Format.pp_print_string format @@ "\n"; + iter_vertex superset (fun v -> + Format.pp_print_string format @@ Addr.to_string v; + Format.pp_print_string format @@ "\n"; + ) + + let isg_to_string superset = + let format = Format.str_formatter in + format_isg ~format superset; + Format.flush_str_formatter () + + let parse_isg bin = + let str = In_channel.read_all bin in + let gstr = String.split_lines str in (* heh *) + let init = false,Graphlib.create (module G) () in + let _,g = List.fold ~init gstr ~f:(fun (status,g) line -> + if not status then ( + if String.equal line "" then true,g else ( + let l = String.split line ~on:'-' in + let l = List.map l ~f:Addr.of_string in + match l with + | [src;dst] -> + let e = G.Edge.create src dst () in + status,G.Edge.insert e g + | _ -> status,g + ) + ) else ( + status, OG.add_vertex g @@ Addr.of_string line + ) + ) in g + + let filter superset subgraph = + let insn_risg = superset.insn_risg in + let g = Graphlib.create (module G) () in + let g = + Hash_set.fold subgraph ~init:g ~f:(fun g addr -> + let g = OG.add_vertex g addr in + let g = OG.fold_succ + (fun s g -> + if Hash_set.mem subgraph s then + OG.add_edge g addr s + else g + ) insn_risg addr g in + let g = OG.fold_pred + (fun s g -> + if Hash_set.mem subgraph s then + OG.add_edge g s addr + else g + ) insn_risg addr g in g + ) in + { superset with insn_risg =g; } + +end + +module Inspection = struct + + let contains_addr superset addr = + Memmap.contains superset.sections addr + + let total_bytes superset = + Seq.fold (Memmap.to_sequence superset.sections) ~init:0 + ~f:(fun total (mem,_) -> (total + (Memory.length mem))) + + let count superset = OG.nb_vertex superset.insn_risg + + let count_unbalanced superset = Addr.Table.length superset.insns + + let unbalanced_diff superset = + let kys = (Addr.Table.keys superset.insns) in + let mapaddrs = + List.fold kys ~init:Addr.Set.empty ~f:Set.add in + let gaddrs = OG.fold_vertex (fun x s -> Set.add s x) + superset.insn_risg Addr.Set.empty in + Set.diff mapaddrs gaddrs, Set.diff gaddrs mapaddrs + + let get_memmap superset = superset.sections + + let get_main_entry superset = superset.main_entry + + let filename superset = superset.filename + + let static_successors superset mem insn = + let brancher = superset.brancher in + let addr = Memory.min_addr mem in + if Addr.Table.mem superset.lifted addr then + let l = ISG.descendants superset addr in + List.map l ~f:(fun dst -> (Some dst, `Fall)) + else + match insn with + | None -> [None, `Fall] + | Some insn -> + try + Brancher.resolve brancher mem insn + with _ -> ( + print_endline @@ + "Target resolve failed on memory at " ^ Memory.to_string mem; + [None, `Fall] (*KB.return []*) + ) + + let len_at superset at = + let insns = get_insns superset in + match Addr.Table.find insns at with + | None -> 0 + | Some(mem, _) -> Memory.length mem + + let num_bad superset = + Hash_set.length superset.bad + + let is_bad_at superset at = Hash_set.mem superset.bad at + + let get_segments superset = superset.sections + + let get_endianness superset = superset.endianness + + let get_arch superset = superset.arch + +end + +module Occlusion = struct + + let range_seq_of_conflicts ~mem addr len = + let range_seq = Core.seq_of_addr_range addr len in + Seq.filter range_seq ~f:mem + + let conflict_seq_at superset addr = + let insns = superset.insns in + let check_mem = Addr.Table.(mem insns) in + match Addr.Table.find insns addr with + | Some(mem, _) -> + let len = Memory.length mem in + range_seq_of_conflicts ~mem:check_mem addr len + | None -> Seq.empty + + let with_data_of_insn superset at ~f = + let len = Inspection.len_at superset at in + let body = Core.seq_of_addr_range at len in + Seq.iter body ~f + + let conflicts_within_insn_at superset ?mem ?conflicts addr = + let default = (OG.mem_vertex superset.insn_risg) in + let mem = Option.value mem ~default in + let conflicts = Option.value conflicts ~default:Addr.Set.empty in + match Core.lookup superset addr with + | Some ((m, _)) -> + let len = (Memory.length m) in + let rng = range_seq_of_conflicts ~mem addr len in + let conflicts = + if not (Seq.is_empty rng) then + Set.add conflicts addr + else conflicts in + Seq.fold rng ~init:conflicts ~f:Set.add + | None -> conflicts + + let find_all_conflicts ?mem superset = + let insns = superset.insns in + List.fold Addr.Table.(keys insns) ~init:Addr.Set.empty + ~f:(fun conflicts addr -> + conflicts_within_insn_at superset ?mem ~conflicts addr + ) + + let conflicts_within_insns superset keep = + Set.fold keep ~init:Addr.Set.empty + ~f:(fun conflicts addr -> + conflicts_within_insn_at superset + ~conflicts addr + ) + + (* It is possible that scenarios admit an instruction that is not + the parent of a shared child that contains this addr *) + let parent_conflict_at superset addr = + let children = ISG.descendants superset addr in + List.fold children ~init:Addr.Set.empty ~f:(fun cparents child -> + let parents = ISG.ancestors superset child in + List.fold parents ~init:cparents ~f:(fun cparents parent -> + if not Addr.(parent = addr) then + match Core.lookup superset parent with + | Some(mem, _) -> + let len = Memory.length mem in + if Addr.(parent < addr) && Addr.(addr < (parent ++ len)) then + Set.add cparents parent + else cparents + | None -> cparents + else cparents + ) + ) + +end + +(** An address is an entry for an isg if it could be the return + ** terminating instruction at the end of a function. *) +let is_entry superset addr = + let insn_isg = superset.insn_risg in + OG.in_degree insn_isg addr = 0 && + OG.out_degree insn_isg addr > 0 + +let entries_of_isg superset = + let insn_isg = superset.insn_risg in + OG.fold_vertex (fun addr accu -> + if is_entry superset addr then + (Hash_set.add accu addr; accu) + else accu) + insn_isg (Addr.Hash_set.create ()) + +(** A frond point is a point that is distant most of a terminating + ** instruction, meaning it may be the first instruction of a + ** function. However, these could actually occur either within or + ** beyond the body of instruction sequence intended. *) +let is_frond_point superset addr = + let insn_isg = superset.insn_risg in + OG.in_degree insn_isg addr > 0 && + OG.out_degree insn_isg addr = 0 + +let frond_of_isg superset = + let insn_isg = superset.insn_risg in + OG.fold_vertex (fun addr accu -> + if is_frond_point superset addr then + (Hash_set.add accu addr; accu) + else accu) + insn_isg (Addr.Hash_set.create ()) + +let mergers superset = + let insn_risg= superset.insn_risg in + OG.fold_vertex (fun addr mergers -> + if OG.out_degree insn_risg addr > 1 then + Addr.Set.add mergers addr + else mergers) insn_risg Addr.Set.empty + +let is_branch superset addr = + OG.in_degree superset.insn_risg addr >= 2 + +let get_branches superset = + let branches = Addr.Hash_set.create () in + ISG.iter_vertex superset (fun vert -> + if is_branch superset vert then + Hash_set.add branches vert; + ); + branches + +(* This is a traversal + val with_bad : + t -> + ?visited:'b Addr.Hash_set.t_ -> + pre:('c -> addr -> 'd) -> + post:('d -> addr -> 'c) -> 'c -> 'c +*) +let with_bad superset ?visited ~pre ~post accu = + let visited = + match visited with + | None -> (Addr.Hash_set.create ()) + | Some visited -> visited in + Hash_set.fold ~init:accu superset.bad ~f:(fun accu b -> + if OG.mem_vertex superset.insn_risg b then + ISG.dfs_fold superset ~visited + ~pre ~post accu b + else accu + ) + +let fall_through_of superset addr = + let len = Inspection.len_at superset addr in + Addr.(addr ++ len) + +let is_fall_through superset parent child = + let ft = fall_through_of superset parent in + Addr.(child = ft) + +let get_non_fall_through_edges superset = + ISG.fold_edges superset + (fun child parent jmps -> + if is_fall_through superset parent child then + Map.set jmps ~key:child ~data:parent + else jmps + ) Addr.Map.empty + +let get_callers superset addr = + let g = (get_graph superset) in + if OG.mem_vertex g addr && + OG.out_degree g addr > 0 then + let callers = OG.succ g addr in + List.filter callers ~f:(fun caller -> + not (is_fall_through superset caller addr)) + else [] + +let with_img ~accu img ~f = + let segments = Table.to_sequence @@ Image.segments img in + Seq.fold segments ~init:accu ~f:(fun accu (mem, segment) -> + if Image.Segment.is_executable segment then + f ~accu mem + else accu + ) + +let superset_of_img ?f ?addrs ~backend img = + let arch = Image.arch img in + let segments = Image.memory img in + let main_entry = Image.entry_point img in + let filename = Image.filename img in + let f = Option.value f ~default:(fun (m, i) a -> a) in + let superset = + of_components ~main_entry ?filename ~segments arch in + with_img ~accu:superset img + ~f:(fun ~accu mem -> + Core.update_with_mem ~backend ?addrs accu mem ~f + ) + +let superset_disasm_of_file ?(backend="llvm") ?f ?addrs binary = + let img, errs = Image.create ~backend binary |> ok_exn in + superset_of_img ~backend img ?addrs ?f + diff --git a/lib/bap_superset_disasm/superset.mli b/lib/bap_superset_disasm/superset.mli new file mode 100644 index 000000000..a8ebf29df --- /dev/null +++ b/lib/bap_superset_disasm/superset.mli @@ -0,0 +1,325 @@ +open Bap.Std +open Core + +module Dis = Disasm_expert.Basic +type elem = mem * Dis.full_insn option + +type t = Superset_impl.t + +module ISG : sig + open Graphlib.Std + + (** Returns a list of those addresses for which the argument + address could be the immediate next instruction of. *) + val ancestors : t -> addr -> addr list + + (** Returns a list of those addresses for which the argument + address could potentially lead to. *) + val descendants : t -> addr -> addr list + + val mem_vertex : t -> addr -> bool + + val iter_vertex : t -> (addr -> unit) -> unit + + val fold_vertex : t -> (addr -> 'a -> 'a) -> 'a -> 'a + + val fold_edges : t -> (addr -> addr -> 'a -> 'a) -> 'a -> 'a + + val check_connected : t -> addr -> addr -> bool + + (** Adds an associated directed link from src to dst, tracking + addresses if they are not already. *) + val link : t -> addr -> addr -> t + + (** Removes a link between two addresses, but not stop tracking + those addresses even if they each have no more links *) + val unlink : t -> addr -> addr -> t + + (** Uses strongly connected components to determine loop lists, but + does no filtering. *) + val raw_loops : t -> addr list list + + val dfs_fold : + ?visited:Addr.Hash_set.t -> t -> pre:('a -> addr -> 'a) -> + post:('a -> addr -> 'a) -> 'a -> addr -> 'a + + val dfs : ?terminator:(addr -> bool) -> ?visited:Addr.Hash_set.t -> + ?pre:(addr -> unit) -> ?post:(addr -> unit) -> + (t -> addr -> addr list) -> t -> addr -> unit + + val to_list : t -> (addr * addr) list + + val fixpoint : ?steps:int -> + ?start:addr -> + ?rev:bool -> + ?step:(int -> addr -> 'a -> 'a -> 'a) -> + t -> + init:(addr, 'a) Solution.t -> + equal:('a -> 'a -> bool) -> + merge:('a -> 'a -> 'a) -> + f:(addr -> 'a -> 'a) -> (addr, 'a) Solution.t + + (** Print the graph to file for a given superset *) + val print_dot : ?colorings:Addr.Hash_set.t String.Map.t -> t -> unit + + (** For all items in the address hash set, remove them from the + superset. This is a raw removal, so it does not mark bad and + traverse to perform maximal removals. *) + val filter : t -> Addr.Hash_set.t -> t + + (** Prints the isg via a formatter in gml format. *) + val format_isg : ?format:Format.formatter -> t -> unit + + (** Prints the isg to a string buffer in gml format. *) + val isg_to_string : t -> string +end + +module Core : sig + + (** Insert the memory and disassembled instruction into the superset *) + val add : t -> mem -> Dis.full_insn option -> t + (** Stops tracking an address. Can unbalance the internal + structure, requiring further balanace and trim calls *) + val remove : t -> addr -> t + val empty : arch -> t + + (** This primary core function is the core of disassembly, and + simply reads each byte consecutively in memory by address + successor. *) + val run_seq : + ('a, 'b) Dis.t -> + mem -> + (mem * (Dis.asm, Dis.kinds) Dis.insn option) seq + + val seq_of_addr_range : addr -> int -> addr seq + + (** This primary core function is the core of disassembly, and simply + reads each byte consecutively in memory by address successor. It + is alike to run_seq, but it hides the sequence part, and accepts + a parameter lambda. *) + val run : + ('a, 'b) Dis.t -> + accu:'c -> + f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> 'c -> 'c) -> + mem -> 'c + + (** This function is the fundamental superset disassembly, and + disassembles at the addresses given by the supplied sequence. *) + val disasm : + ?backend:string -> addrs:addr seq -> accu:'a -> + f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> 'a -> 'a) -> + Arch.t -> mem -> 'a Or_error.t + + (** This function is the core of disassembly, and simply + reads each byte consecutively in memory by address successor. It + builds the disassembler and runs the superset behind the + scenes. One can accumulate with any arbitrary type. Later + accessories tuck the results into a custom superset + representation with graph specializations suited to the + invariants, heuristics and performance that are vital to good + operation. *) + val disasm_all : + ?backend:string -> accu:'a -> + f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> 'a -> 'a) -> + Arch.t -> mem -> 'a Or_error.t + + (** Lift a single disassembled memory and instruction pair *) + val lift_insn : + t -> (mem * Dis.full_insn option) -> (bil) option + + (** Given an address, lift a single instruction at that address *) + val lift_at : + t -> (addr) -> bil option + + (** The primary disassembler design interface. Implementing a + disassembler from the ground up almost certainly uses this as + it loads the memory images into the superset. *) + val update_with_mem : + ?backend:string -> ?addrs:addr seq -> + ?f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> t -> t) -> + t -> mem -> t + + (** Marking an address bad means that it is temporarily maintained + until a later phase in which it is removed, together with as + many other other instructions that might have accessed it as + possible. *) + val mark_bad : t -> addr -> unit + + (** Internally, performance is important, and it may be the case + that after many bad instructions are marked and removed, that + there is some mismatch between the internal tracking that is + done. So, this library imposes that after a trim that this be + called. *) + val rebalance : t -> t + + (** This removes bad entries from the tracker without pruning them + from the superset. If this is called before trimming the + superset, then the bad instructions that were marked are no + longer distinguishable as previously. *) + val clear_bad : t -> addr -> unit + + val clear_each : t -> Addr.Hash_set.t -> unit + + (** Removes all addresses from being tracked as bad, without + removing them from the superset. *) + val clear_all_bad : t -> unit + + (** Returns a copy of the set of addresses that have been marked + bad *) + val copy_bad : t -> Addr.Hash_set.t + + val lookup : t -> addr -> (mem * Dis.full_insn option) option + + (** Accumulate over each current disassembled instruction in the + current superset. *) + val fold : t -> init:'a -> f:(key:addr -> data:elem -> 'a -> 'a) -> 'a + val mem : t -> addr -> bool +end + +module Inspection : sig + (** Returns if the addr is still in the container representing the + superset. Note, an addr may be marked as bad for removal, but isn't + removed until the trim module traverses for cleaning. Marking for + removal can be reversed by calling clear_bad. *) + val contains_addr : t -> addr -> bool + + val get_endianness : t -> endian option + + val get_arch : t -> arch + + (** Mark and track the argument address for removal upon trimming. *) + val num_bad : t -> int + + (** Current number of disassembled instructions in the superset *) + val count : t -> int + + (** Returns information reporting unbalanced the superset has + become, which ideally resides at zero. *) + val count_unbalanced : t -> int + + (** Returns a tuple representing differences of addresses not in + each of either the disassembly set and the graph. *) + val unbalanced_diff : t -> (Addr.Set.t * Addr.Set.t) + + (** Returns true if the address is currently marked for removal *) + val is_bad_at : t -> addr -> bool + + (** Returns the length of the instruction at addr *) + val len_at : t -> addr -> int + + (** Returns the total overall size of memory in bytes that has been + processed by the superset *) + val total_bytes : t -> int + + (** A carefully written function that visits the address in the body + of any instruction that is longer than one byte. So, addr + 1, + addr + 2, ... addr + n. *) + val static_successors : t -> mem -> Dis.full_insn option -> + Brancher.dests + + val get_memmap : t -> value memmap + + (** Returns the entry of the image that was originally loaded into + this superset, if any was used or if it was discovered when + loaded. *) + val get_main_entry : t -> addr option + + (** Returns the filename, if any was used, to compute this superset. *) + val filename : t -> string option +end + +module Cache : sig + open Bap_knowledge + open Bap_core_theory + open Theory + + val package : string + val sym_label : program Knowledge.obj KB.t + val superset_graph_t : + (addr * addr) list option Knowledge.domain + val superset_graph_persistent : + (addr * addr) list option Knowledge.persistent + val superset_graph : + (program, (addr * addr) list option) Knowledge.slot + +end + +module Occlusion : sig + (** For each address within the set, search within the body of the + corresponding disassembled instruction, looking for conflicts + (shingles). Add all such addresses, including the original + instruction if any occurred in it's body. Only exclude an + address from the output set if it didn't occur in the body of + another, and if no instruction conflicted. *) + val conflicts_within_insns : t -> Addr.Set.t -> Addr.Set.t + + (** For a given address, produce a set of all addresses + within the body of the disassembly that possess a conflict. *) + val conflicts_within_insn_at : + t -> ?mem:(addr -> bool) -> ?conflicts:Addr.Set.t -> addr -> Addr.Set.t + + (** For a given superset, look at every single instruction. Defers + to conflicts_within_insns in implementation. *) + val find_all_conflicts : ?mem:(addr -> bool) -> t -> Addr.Set.t + + (** A sequence view of conflicts within a given disassembled + instruction at a given address for a given length. *) + val range_seq_of_conflicts : mem:(addr -> bool) -> addr -> int -> addr seq + + (** At a given address, return all addresses within its body for + which there exists another conflicting instruction. *) + val conflict_seq_at : t -> addr -> addr seq + + (** Compute at a given address given with those addresses reside + within the body of the disassembly. *) + val with_data_of_insn : + t -> addr -> f:(addr -> unit) -> unit +end + +(** The instruction immediately after argument addr. *) +val fall_through_of : t -> addr -> addr +(** A helper function meant to tuck away the representation + underneath that tracks bad addresses. *) +val is_fall_through : + t -> addr -> addr -> bool + +(** Entry is a connotation that denotes no other instruction leads + into this one. *) +val is_entry : t -> addr -> bool + +(** Return all addresses in a set for which is_entry returns true *) +val entries_of_isg : t -> Addr.Hash_set.t + +(** A frond point is a point that is distant most of a terminating + ** instruction, meaning it may be the first instruction of a + ** function. However, these could actually occur either within or + ** beyond the body of instruction sequence intended. *) +val is_frond_point : t -> addr -> bool + +(** Return all addresses in a set for which is_front_point is true *) +val frond_of_isg : t -> Addr.Hash_set.t + +(** Return the set of addreses for which more than one other + instruction targets it. *) +val mergers : t -> Addr.Set.t + +(** Checks for the existence of successors other than fall through. *) +val is_branch : t -> addr -> bool + +(** Returns a set of the addresses for which is_branch is true *) +val get_branches : t -> Addr.Hash_set.t + +(** DFS traverses the set of addresses that are currently marked as + bad, applying functions pre and post, from the bad to all + ancestors. *) +val with_bad : + t -> ?visited:Addr.Hash_set.t -> pre:('b -> addr -> 'c) -> + post:('c -> addr -> 'b) -> 'b -> 'b + +(** Take the raw superset of a given file name of a compiled object of + any kind that Image can parse. *) +val superset_disasm_of_file : + ?backend:string -> + ?f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> t -> t) -> + ?addrs:addr seq -> string -> t + diff --git a/lib/bap_superset_disasm/superset_impl.ml b/lib/bap_superset_disasm/superset_impl.ml new file mode 100644 index 000000000..c2b886689 --- /dev/null +++ b/lib/bap_superset_disasm/superset_impl.ml @@ -0,0 +1,241 @@ +open Bap.Std +open Regular.Std +open Core +open Or_error +open Graphlib.Std + +module Dis = Disasm_expert.Basic + +type elem = mem * (Dis.full_insn option) + +module G = Graphlib.Make(Addr)(Unit) + +type t = { + arch : arch; + filename : string option; + main_entry : addr option; + sections : value memmap; + brancher : Brancher.t; + endianness : endian option; + lifter : lifter; + insns : (mem * (Dis.full_insn option)) Addr.Table.t; + lifted : bil Addr.Table.t; + insn_risg : G.t; + bad : Addr.Hash_set.t; + keep : Addr.Hash_set.t; + (* marked data *) + (* visited *) + (* union_find *) +} + +let of_components + ?main_entry ?insns ?insn_risg ?lifted ?segments ?endianness ? + filename arch = + let insn_risg = + match insn_risg with + | Some insn_risg -> insn_risg + | None -> Graphlib.create (module G) () in + let segments = Option.value segments ~default:Memmap.empty in + let insns = Option.value insns ~default:(Addr.Table.create ()) in + let lifted = Option.value lifted ~default:(Addr.Table.create ()) in + let module Target = (val target_of_arch arch) in + let lifter = Target.lift in + { + arch = arch; + filename; + sections = segments; + brancher = Brancher.of_bil arch; + endianness = None; + lifter = lifter; + main_entry; + insns; + insn_risg; + lifted; + bad = Addr.Hash_set.create (); + keep = Addr.Hash_set.create (); + } + +module M = struct + module G = struct + include G + module V = G.Node + type vertex = V.t + module E = G.Edge + end + include Graphlib.To_ocamlgraph(G) + let empty = G.empty + let iter_vertex f g = + Seq.iter (G.nodes g) ~f + let iter_succ f g v = + Seq.iter (G.Node.succs v g) ~f + let fold_succ f g v init = + Seq.fold ~init (G.Node.succs v g) ~f:(fun v g -> f g v) + let fold_vertex f g init = + Seq.fold ~init (G.nodes g) ~f:(fun v g -> f g v) + let is_directed = true + + let iter_edges_e f g = + Seq.iter (G.edges g) ~f + let iter_edges f g = + Seq.iter (G.edges g) ~f:(fun e -> + f (G.Edge.src e) (G.Edge.dst e) + ) + let remove_edge e g = + G.Edge.remove e g + let remove_edge_e v1 v2 g = + let e = G.Edge.create v1 v2 () in + remove_edge e g + let add_edge e g = + G.Edge.insert e g + let add_edge_e v1 v2 g = + let e = G.Edge.create v1 v2 () in + add_edge e g + let remove_vertex v g = + G.Node.remove v g + let add_vertex v g = G.Node.insert v g + let copy g = + let gcpy = Graphlib.create (module G) () in + Seq.fold ~init:gcpy G.(edges g) ~f:(fun gcpy e -> + add_edge e gcpy + ) +end + +module Topological = + Graph.Topological.Make(M) +module StrongComponents = Graph.Components.Make(M) +(*module DiscreteComponents = Components.Undirected(G)*) +module P = Graph.Builder.P(Graphlib.To_ocamlgraph(G)) +module Oper = Graph.Oper.Make(P) +module Dfs = Graph.Traverse.Dfs(M) + +type colored_superset = G.t * Addr.Hash_set.t String.Map.t + * elem Addr.Table.t + +module Make(T : sig val instance : colored_superset end) = struct + open T + module Dottable = struct + type t = colored_superset + + module V = struct + type t = M.V.t + end + + module E = struct + type t = M.E.t + let src = M.E.src + let dst = M.E.dst + end + + let iter_vertex f (g, _, _) = + M.iter_vertex f g + + let iter_edges_e f (g, _, _) = + M.iter_edges_e f g + + let graph_attributes _ = [ + `Fontsize 14; + ] + let default_vertex_attributes gr = [ + `Shape `Box; + (*`Height 1.0*.Memory.(length mem);*) + `Fontsize 14; + `Fontcolor 0x666699; + `Fontname "Monospace"; + `Width 1.0 + ] + + let red = 0xff0000 + let green = 0x009900 + let yellow = 0xffff00 + let blue = 0x0000ff + let orange = 0xff6600 + let purple = 0x660066 + let brown = 0x663300 + let cyan = 0x0099cc + + let vertex_name name = + let fmt = Format.str_formatter in + Addr.(pp_generic ~prefix:`none ~suffix:`none ~format:`dec + fmt name); + Format.flush_str_formatter () + + let vertex_attributes v = + let default_attrs = + [ + `Label ((vertex_name v)); + ] in + let g, colors, insns = instance in + let contains name = + match Map.find colors name with + | Some(s) -> + Hash_set.mem s v + | None -> false in + let find_update default_attrs name color = + if contains name then + `Color color :: default_attrs + else default_attrs in + let default_attrs = + find_update default_attrs "False Negatives" red in + let default_attrs = + find_update default_attrs "True Positives" green in + let default_attrs = + find_update default_attrs "False Positives" yellow in + let default_attrs = + match List.hd default_attrs with + | Some (`Color _) -> + default_attrs + | _ -> `Color 0X660000 :: default_attrs in + match Addr.Table.find insns v with + | Some(mem,insn) -> + let len = float_of_int Memory.(length mem) in + `Height (1.0 *. len) :: + default_attrs + | None -> default_attrs + + + let get_subgraph _ = None + let default_edge_attributes _ = [ + `Penwidth 1.0; + `Arrowsize 0.5; + `Headport `N; + `Tailport `S; + `Labelfloat true; + ] + + let edge_attributes e = + (*let (src,dst) = M.E.src e,M.E.dst e in*) + (*let color,weight = match kind,arity with + | `Fall,`Many -> 0x660000, 4 + | `Fall,`Mono -> 0x000066, 8 + | `Cond,_ -> 0x006600, 2 + | `Jump,_ -> 0x000066, 2 in*) + [ + (*`Color color;*) + (*`Weight weight;*) + ] + end + module Dot = Graph.Graphviz.Dot(Dottable) +end + +let fold_component ?visited ~pre ~post g accu addr = + let visited = Option.value visited + ~default:(Addr.Hash_set.create ()) in + let s = Stack.create () in + (* invariant: [h] contains exactly the vertices which have been pushed *) + let push v = + if not (Hash_set.mem visited v) then begin + Hash_set.add visited v; + Stack.push s v + end + in + push addr; + let rec loop acc = + match Stack.pop s with + | Some v -> + let acc = pre acc v in + M.iter_succ push g v; + loop @@ post acc v + | None -> acc + in + loop accu + diff --git a/lib/bap_superset_disasm/traverse.ml b/lib/bap_superset_disasm/traverse.ml new file mode 100644 index 000000000..2fa17bd97 --- /dev/null +++ b/lib/bap_superset_disasm/traverse.ml @@ -0,0 +1,82 @@ +open Core +open Bap.Std +open Superset + +(** Abstract dfs from a given starting point. *) +let iter_component ?(terminator=(fun _ -> true)) + ?visited ?(pre=fun _ -> ()) ?(post=fun _ -> ()) = + ISG.dfs ~terminator ?visited ~pre ~post ISG.ancestors + +(** This function starts at a given address and traverses toward + every statically visible descendant. It is used to maximally + propagate a given function application. *) +let with_descendents_at ?visited ?post ?pre superset addr = + ISG.dfs ?visited ?post ?pre ISG.descendants superset addr + +(** This function starts at a given address and traverses toward + every statically visible ancestor. It is used to maximally + propagate a given function application. *) +let with_ancestors_at ?visited ?post ?pre superset addr = + ISG.dfs ?visited ?post ?pre ISG.ancestors superset addr + +(** From the starting point specified, this reviews all descendants + and marks their bodies as bad. *) +let mark_descendent_bodies_at ?visited ?datas superset addr = + let datas = Option.value datas + ~default:(Addr.Hash_set.create ()) in + let mark_bad = Core.mark_bad superset in + with_descendents_at ?visited superset addr + ~pre:(fun v -> + Occlusion.with_data_of_insn superset v ~f:mark_bad; + Occlusion.with_data_of_insn superset v ~f:(Hash_set.add datas); + ) + +(** A clean wrapper around raw superset that does some management of + visited nodes for efficiency behind the scenes. *) +let visit ?visited ~pre ~post superset entries = + let visited = Option.value visited + ~default:(Addr.Hash_set.create ()) in + let pre addr = + Hash_set.add visited addr; + pre addr in + Hash_set.iter entries ~f:(fun addr -> + if not (Hash_set.mem visited addr) then + with_ancestors_at superset ~visited ~pre ~post addr + ) + + +(** This traversal unlinks all non-branch jumps, collects every + entry, which now includes newly unlinked blocks in order to + provide a separation over traversal. *) +let visit_by_block superset + ?(pre=(fun _ _ _ -> ())) ?(post=(fun _ _ _ -> ())) entries = + let (jmps,targets) = Superset.ISG.fold_edges superset + (fun src target (jmps,targets) -> + let is_branch = Superset.is_branch superset target in + let is_jmp_edge = not (Superset.is_fall_through superset src target) in + if is_branch && is_jmp_edge then + (Map.set jmps ~key:src ~data:target, Set.add targets target) + else (jmps, targets) + ) (Addr.Map.empty,Addr.Set.empty) in + let superset = + Map.fold jmps ~init:superset ~f:(fun ~key ~data superset -> + Superset.ISG.unlink superset key data; + ) in + let entries = Superset.entries_of_isg superset in + let visited = Addr.Hash_set.create () in + let rec visit v = + Hash_set.add visited v; + pre jmps targets v; + let f w = + if not (Hash_set.mem visited w) then + visit w + else pre jmps targets w in + let ancs = Superset.ISG.ancestors superset v in + List.iter ancs ~f; + post jmps targets v; + in + Hash_set.iter entries ~f:visit; + Map.fold jmps ~init:superset ~f:(fun ~key ~data superset -> + Superset.ISG.link superset key data; + ) + diff --git a/lib/bap_superset_disasm/traverse.mli b/lib/bap_superset_disasm/traverse.mli new file mode 100644 index 000000000..46bdd77cd --- /dev/null +++ b/lib/bap_superset_disasm/traverse.mli @@ -0,0 +1,16 @@ +open Bap.Std + +val mark_descendent_bodies_at : ?visited:Bap.Std.Addr.Hash_set.t -> + ?datas:Addr.Hash_set.t -> + Superset_impl.t -> addr -> unit +val with_descendents_at : ?visited:Addr.Hash_set.t -> + ?post:(addr -> unit) -> + ?pre:(addr -> unit) -> Superset_impl.t -> addr -> unit +val with_ancestors_at : ?visited:Addr.Hash_set.t -> + ?post:(addr -> unit) -> + ?pre:(addr -> unit) -> Superset_impl.t -> addr -> unit +val visit : ?visited:Addr.Hash_set.t -> pre:(addr -> unit) -> + post:(addr -> unit) -> + Superset_impl.t -> Addr.Hash_set.t -> unit +val visit_by_block : Superset_impl.t -> ?pre:(addr Addr.Map.t -> Addr.Set.t -> addr -> unit) -> + ?post:(addr Addr.Map.t -> Addr.Set.t -> addr -> unit) -> Addr.Hash_set.t -> Superset_impl.t diff --git a/lib/bap_superset_disasm/trim.ml b/lib/bap_superset_disasm/trim.ml new file mode 100644 index 000000000..6b3fadc2f --- /dev/null +++ b/lib/bap_superset_disasm/trim.ml @@ -0,0 +1,27 @@ +open Core +open Bap.Std + +let run superset = + let accu = () in + let check_pre _ accu _ = accu in + let check_post _ accu _ = accu in + let check_elim _ _ _ = true in + let mark _ _ _ = () in + let post superset accu addr = + let module G = Superset.ISG in + if check_elim superset accu addr then ( + mark superset accu addr; + ); + check_post superset accu addr in + let visited = Addr.Hash_set.create () in + (*let superset = Superset.Core.rebalance superset in*) + let post = post superset in + let pre = check_pre superset in + let _ = Superset.with_bad superset ~visited ~pre ~post accu in + let superset = + Hash_set.fold visited ~init:superset ~f:(fun superset addr -> + Superset.Core.remove superset addr + ) in + Hash_set.clear visited; + Superset.Core.clear_all_bad superset; + superset diff --git a/lib/bap_superset_disasm/trim.mli b/lib/bap_superset_disasm/trim.mli new file mode 100644 index 000000000..464b6bf6d --- /dev/null +++ b/lib/bap_superset_disasm/trim.mli @@ -0,0 +1 @@ +val run : Superset_impl.t -> Superset_impl.t diff --git a/lib_test/bap_superset_disasm/disasm_corpora.sh b/lib_test/bap_superset_disasm/disasm_corpora.sh new file mode 100755 index 000000000..96b713cd6 --- /dev/null +++ b/lib_test/bap_superset_disasm/disasm_corpora.sh @@ -0,0 +1,5 @@ +find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -print > files.txt + +time cat files.txt | parallel "bap superset-disasm --find-fn-culprit --analyses="StronglyConnectedComponentData","Grammar-convergent" --heuristics=Callsites3,FixpointGrammar,ImgEntry,InterpretationDepthOne,FixpointInterpDepthOne --ground-truth-bin={} {}" + +bap supersetd-graph-metrics ./files.txt --print-fn-bins diff --git a/lib_test/bap_superset_disasm/dune b/lib_test/bap_superset_disasm/dune new file mode 100644 index 000000000..402550de0 --- /dev/null +++ b/lib_test/bap_superset_disasm/dune @@ -0,0 +1,5 @@ +(library + (name test_superset_disasm) + (preprocess (pps ppx_bap)) + (wrapped false) + (libraries bap core_kernel ounit2 str)) diff --git a/lib_test/bap_superset_disasm/test_superset_disasm.ml b/lib_test/bap_superset_disasm/test_superset_disasm.ml new file mode 100644 index 000000000..1be894db7 --- /dev/null +++ b/lib_test/bap_superset_disasm/test_superset_disasm.ml @@ -0,0 +1,823 @@ +open OUnit2 +open Core_kernel +open Bap.Std +open Or_error +open Superset +open Bap_plugins.Std +open Graphlib.Std +open Bap_future.Std + +let requires = ["llvm"; "lifter"; "disassemble"; "disassembler"; + "semantics"] +let () = match Bap_main.init ~requires () with + | Ok () -> () + | Error err -> + let open Bap_main in + Bap_main.Extension.Error.pp Format.std_formatter err; + exit 1 + +let create_memory arch min_addr data = + let data = Bigstring.of_string data in + Memory.create (Arch.endian arch) min_addr data + +let arch = `x86 + +let segments = Table.empty + +let width = 8*(Arch.size_in_bytes arch);; +let zero = Addr.(of_int ~width 0) + +module G = Graphlib.Make(Addr)(Unit) +module Topological = Superset_impl.Topological + +let add_edge g v1 v2 = + let e = G.Edge.create v1 v2 () in + G.Edge.insert e g + +let mem_edge g v1 v2 = + let e = G.Edge.create v1 v2 () in + G.Edge.mem e g + +let init () = + let insn_isg = Graphlib.create (module G) () in + let insn_map = Addr.Map.empty in + insn_map, insn_isg + +let min_addr = 1 +let addr_size= Size.in_bits @@ Arch.addr_size arch +let min_addr = Addr.of_int ~width:addr_size min_addr + +let make_params ?(mina=min_addr) bytes = + let memory = create_memory arch min_addr bytes |> ok_exn in + memory, arch + +let check_results sizes expected_results = + let sizes = Seq.to_list sizes in + List.iter2_exn sizes expected_results + ~f:(fun actual_size expected_size -> + assert_equal ~msg:((List.to_string ~f:string_of_int sizes) + ^ (List.to_string ~f:string_of_int + expected_results)) actual_size + expected_size) + +let superset_to_length_list superset = + List.map superset + ~f:(fun (mem, insn) -> (Memory.length mem)) + +(* This test affirms that both the order and the inner sequences of a set of bytes + will be interpreted appropriately by the superset conservative disassembler *) +let test_hits_every_byte test_ctxt = + let memory, arch = make_params "\x2d\xdd\xc3\x54\x55" in + let raw_superset = Superset.Core.disasm_all + ~accu:[] ~f:List.cons arch memory |> ok_exn in + let sizes = superset_to_length_list raw_superset in + let expected_results = List.rev [ 5; 2; 1; 1; 1; ] in + check_results (Seq.of_list sizes) expected_results + +let of_mem arch mem = + let superset = Superset.Core.empty arch in + let f = (Invariants.tag ~invariants:[Invariants.tag_success]) in + Superset.Core.update_with_mem superset mem ~f + +let get_bads superset mem = + let maddr = Memory.min_addr mem in + let l = Memory.length mem in + let bds = Superset.Core.seq_of_addr_range maddr l in + Seq.filter bds + ~f:(Superset.Inspection.is_bad_at superset) + +let str_of_bads superset mem = + let bds = get_bads superset mem in + let bds = Seq.to_list bds in + List.to_string ~f:Addr.to_string bds + +let debug_msg superset mem = + let msg = Superset.ISG.isg_to_string superset in + let bads = str_of_bads superset mem in + let msg = sprintf "%s\n%s" + msg bads in + (*let pattern = ": " in + let msi = + String.substr_index mems ~pattern |> Option.value_exn in + let start = (msi+(String.length pattern)) in + let finish = String.((length mems)) - start in + let ms = String.sub mems start finish in*) + let cnt = Superset.Inspection.count superset in + let unb = Superset.Inspection.count_unbalanced superset in + sprintf "%s\ncount (graph): %d, unbalanced (map): %d" msg cnt unb + +(* TODO make default initialization from bytes much shorter *) + +let test_trim test_ctxt = + let bytes = "\x2d\xdd\xc3\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + let superset = + Invariants.tag_superset superset in + let superset = Trim.run superset in + let superset = Superset.Core.rebalance superset in + let dbg = debug_msg superset mem in + let bads = str_of_bads superset mem in + let explanation = + sprintf "Expect one instruction, got %d. bad: %s" + (Superset.Inspection.count superset) bads in + let msg = sprintf "%s\n%s" + dbg explanation in + (* Only the return opcode ( 0xc3 ) can survive trimming *) + (* After refactoring, it may be that some targets that fall outside + the memory bounds are still in the graph, but this only accounts + for one edge, so it is negligible. *) + assert_equal ~msg 1 @@ Superset.Inspection.count superset + +let test_can_lift test_ctxt = + let bytes = "\x2d\xdd\xc3\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + try + Superset.Core.fold superset ~init:() ~f:(fun ~key ~data () -> + let (mem, insn) = data in + let bil = Superset.Core.lift_insn superset (mem, insn) in + let msg = + sprintf "couldn't lift at %s" (Memory.to_string mem) in + match bil with + | Some _ -> () + | _ -> + assert_bool msg false + ) + with _ -> () + +let test_brancher test_ctxt = + let bytes = "\x77\x77" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + match Superset.Core.lookup superset Memory.(min_addr mem) with + | Some (mem, insn) -> + let ss = + Superset.Inspection.static_successors superset mem insn in + let msg = sprintf "Should be two static successors here" in + assert_bool msg (List.(length ss) > 1) + | None -> assert_bool "should be an instruction at 0" false + +let test_lift test_ctxt = + let bytes = "\x77\x77" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + match Superset.Core.lookup superset Memory.(min_addr mem) with + | Some (mem, insn) -> + let lifted = + Superset.Core.lift_insn superset (mem, insn) in + let msg = sprintf "Should be able to lift" in + assert_bool msg Option.(is_some lifted); + let r = Option.value_map ~default:false lifted ~f:(fun (bil) -> + List.length bil > 0) in + assert_bool "Should support this!" r + | None -> assert_bool "should be an instruction at 0" false + +(* TODO want a bil language oriented way to specify the construction of a superset *) +let dis_with_invariants ?superset bytes invariants = + let mem, arch = make_params bytes in + let default = + of_mem arch mem in + let superset = Option.value superset ~default in + let f = (Invariants.tag ~invariants) in + let superset = Superset.Core.update_with_mem + superset mem ~f in + let msg = debug_msg superset mem in + let msg = sprintf "Should be bad at %s\n%s" + Addr.(to_string min_addr) msg in + let superset = Trim.run superset in + let superset = Superset.Core.rebalance superset in + assert_bool msg @@ not @@ Superset.Core.mem superset min_addr; + let msg = debug_msg superset mem in + let offset_one = + Superset.Core.mem superset min_addr in + assert_bool msg (not offset_one) + +let test_tag_non_insn test_ctxt = + dis_with_invariants "\x0f\xff" + [Invariants.tag_success; Invariants.tag_non_insn] + +let test_tag_target_is_bad test_ctxt = () +(* TODO don't want to use this until have functor on superset + * interface allowing to set the semantics directly for test *) + (*let bytes = "\x77\xfe" in + dis_with_invariants bytes + [Invariants.tag_success; Invariants.tag_target_is_bad]*) + +let test_target_in_body test_ctxt = + let bytes = "\x77\xFF" in + dis_with_invariants bytes + [Invariants.tag_success; Invariants.tag_target_in_body] + +let test_target_not_in_mem test_ctxt = + let bytes = "\x77\x77" in + let invariants = [Invariants.tag_success; + Invariants.tag_target_not_in_mem] in + dis_with_invariants bytes invariants + +let test_static_successors_includes_fall_through test_ctxt = + let bytes = "\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + let maddr = Memory.(min_addr mem) in + match Superset.Core.lookup superset maddr with + | Some (mem, insn) -> + let tgts = + Superset.Inspection.static_successors superset mem insn in + let b = List.fold ~init:false tgts ~f:(fun status (addro, e) -> + match addro with + | Some addr -> status || Addr.(addr = (succ maddr)) + | None -> status + ) in + assert_bool "static successors doesn't contain fall through" b + | None -> assert_bool "insn expected here" false + + +let test_successor_calculation test_ctxt = + let bytes = "\x2d\xdd\xc3\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + let mn_addr = Memory.(min_addr mem) in + let mx_addr = Memory.(max_addr mem) in + match Superset.Core.lookup superset mn_addr with + | Some (mem, insn) -> + let tgts = + Superset.Inspection.static_successors superset mem insn in + let b = List.fold ~init:false tgts ~f:(fun status (addro, e) -> + match addro with + | Some addr -> + status || + (Addr.(addr > (mx_addr)) && (not Memory.(contains mem addr))) + | None -> status + ) in + let b = b && (List.length tgts > 0) in + assert_bool "static successors doesn't contain fall through" b + | None -> assert_bool "insn expected here" false + +let test_superset_contains_addr test_ctxt = + let bytes = "\x2d\xdd\xc3\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + let mn_addr = Memory.(min_addr mem) in + match Superset.Core.lookup superset mn_addr with + | Some (mem, insn) -> + let tgts = + Superset.Inspection.static_successors superset mem insn in + let b = List.fold ~init:false tgts ~f:(fun status (addro, e) -> + match addro with + | Some addr -> + status || + (not (Superset.Inspection.contains_addr superset addr)) + | None -> status + ) in + let b = b && (List.length tgts > 0) in + assert_bool "static successors doesn't contain fall through" b + | None -> assert_bool "insn expected here" false + +let test_trims_invalid_jump test_ctxt = + let bytes = "\x55\x54\xE9\xFC\xFF\xFF\xFF" in + let memory, arch = make_params bytes in + let superset = of_mem arch memory in + let superset = Superset.Core.update_with_mem + superset memory ~f:Invariants.tag in + let superset = Trim.run superset in + let superset = Superset.Core.rebalance superset in + let expected_results = [ ] in + assert_equal ~msg:"lengths unequal" + (Superset.Inspection.count superset) + (List.length expected_results) + +let test_addr_map test_ctxt = + let min_addr = Addr.of_int ~width:addr_size 0 in + let insn_map = Addr.Map.empty in + let insn_map = Addr.Map.set insn_map ~key:min_addr ~data:() in + let insn_map = Addr.Map.set insn_map ~key:min_addr ~data:() in + let msg = "expected length to be one" in + assert_bool msg ((Addr.Map.length insn_map) = 1) + +let test_insn_isg test_ctxt = + let insn_risg = Graphlib.create (module G) () in + let addr = Addr.of_int ~width:addr_size 0 in + let insn_risg = G.Node.insert addr insn_risg in + let insn_risg = G.Node.insert addr insn_risg in + let msg = "expected length to be one" in + assert_bool msg ((G.number_of_nodes insn_risg) = 1) + +let test_consistent_superset test_ctxt = + let memory, arch = make_params "\x55\x54\xE9\xFC\xFF\xFF\xFF" in + let superset = of_mem arch memory in + let m_neg_g, g_neg_m = Superset.Inspection.unbalanced_diff superset in + let msg = "insn in map but not in after shingled of superset" in + assert_bool msg (Set.is_empty m_neg_g); + let msg = "insn in isg but not in map after shingled of superset" in + assert_bool msg (Set.is_empty g_neg_m) + +let construct_loop insn_map insn_isg start finish = + if Addr.(finish > start) then ( + (* Where we would otherwise connect the nodes from the tail + condition back up to the loop body entry, here all the edges are + reversed in the spirit of the disassembled insn_isg. *) + let insn_isg = add_edge insn_isg start finish in + let junk_data = String.of_char ' ' in + let start_mem = create_memory arch start junk_data |> ok_exn in + let insn_map = Addr.Map.set insn_map ~key:start ~data:(start_mem, None) in + let finish_mem = create_memory arch finish junk_data |> ok_exn in + let insn_map = Addr.Map.set insn_map ~key:finish ~data:(finish_mem, None) in + let one = (Addr.of_int 1 ~width) in + let two = (Addr.of_int 2 ~width) in + let rec construct_loop_body insn_isg insn_map start finish = + if not (Addr.equal start finish) then + let dist = Addr.max Addr.((finish - start)/two) one in + let step = Addr.(start + dist) in + (* Add edge from some intermediate point between the start and + the finish going from the calculated step toward the parameter + start, decreasing the distance between the outermost start + and finish. As this function executes, it creates log(dist) + nodes, each going from finish to step (reverse as it would + be a flow in a real binary) before finally reaching the + start *) + let insn_isg = add_edge insn_isg step start in + (* Because the algorithm at this point relies on the graph + and map entirely, it doesn't matter the contents of the + memory. *) + let junk_data = String.make (Addr.to_int dist |> ok_exn) ' ' in + let insn_map = Addr.Map.set insn_map ~key:start + ~data:(create_memory arch start junk_data |> ok_exn, None) in + construct_loop_body insn_isg insn_map step finish + else insn_map, insn_isg in + construct_loop_body insn_isg insn_map start finish + ) else insn_map, insn_isg + +let construct_entry_conflict insn_map insn_isg at conflict_len = + let junk_data = String.make conflict_len ' ' in + let conflict = Addr.(at ++ conflict_len) in + let insn_isg = add_edge insn_isg at conflict in + let insn_isg = add_edge insn_isg + Addr.(at ++ 1) Addr.(conflict ++ 1) in + let insn_map = Addr.Map.set insn_map ~key:at + ~data:(create_memory arch at junk_data |> ok_exn, None) in + let insn_map = Addr.Map.set insn_map ~key:conflict + ~data:(create_memory arch conflict junk_data |> ok_exn, None) in + let insn_map = Addr.Map.set insn_map ~key:Addr.(conflict ++1) + ~data:(create_memory arch Addr.(conflict ++1) junk_data |> ok_exn, None) in + let insn_map = Addr.Map.set insn_map ~key:Addr.(at ++ 1) + ~data:(create_memory arch Addr.(at ++ 1) junk_data |> ok_exn, None) in + insn_map, insn_isg + +let construct_tail_conflict + insn_map insn_isg tail_addr conflict_count = + let orig = + if G.Node.mem tail_addr insn_isg && + G.Node.degree ~dir:`Out tail_addr insn_isg > 0 then + let orig = G.Node.succs tail_addr insn_isg in + let orig = Addr.Set.of_list @@ Seq.to_list orig in orig + else Addr.Set.empty in + let insn_map, tail_len = match Addr.Map.find insn_map tail_addr with + | Some (mem, _) -> insn_map, Memory.length mem + | None -> + let tail_len = 1 in + let tail_data = String.make tail_len ' ' in + let mem = create_memory arch tail_addr tail_data |> ok_exn in + let insn_map = Addr.Map.set insn_map ~key:tail_addr + ~data:(mem, None) in + insn_map, tail_len in + let rec make_tail_options insn_map insn_isg conflict_count = + if conflict_count > 0 then + let junk_data = String.make conflict_count ' ' in + let conflict_addr = Addr.(tail_addr -- conflict_count) in + let insn_isg = add_edge insn_isg tail_addr conflict_addr in + let insn_map = Addr.Map.set insn_map ~key:conflict_addr + ~data:(create_memory arch conflict_addr junk_data |> ok_exn, + None) in + make_tail_options insn_map insn_isg (conflict_count - 1) + else + insn_map, insn_isg in + let insn_map, insn_isg = + make_tail_options insn_map insn_isg conflict_count in + let opts = G.Node.succs tail_addr insn_isg in + let opts = Addr.Set.of_list @@ Seq.to_list opts in + let opts = Set.diff opts orig in + let msg = sprintf + "expected %d, got %d" conflict_count Set.(length opts) in + assert_equal ~msg true (Set.(length opts) = conflict_count); + insn_map, insn_isg + +let test_loop_scc test_ctxt = + let insn_map, insn_isg = init () in + let entry = Addr.(of_int ~width 1) in + let insn_map, insn_isg = + construct_loop insn_map insn_isg entry Addr.(entry ++ 20) in + let loop_points = Addr.Hash_set.create () in + Seq.iter (G.nodes insn_isg) + ~f:(fun vert -> Hash_set.add loop_points vert); + let superset = + Superset_impl.of_components ~insn_map ~insn_risg:insn_isg arch in + let scc = Superset.ISG.raw_loops superset in + let scc_points = Addr.Hash_set.create () in + List.iter scc ~f:(fun scc -> + List.iter scc ~f:(fun component_addr -> + Hash_set.add scc_points component_addr; + )); + let in_loop_not_scc = "Found addr in loop but not in scc" in + Hash_set.iter loop_points ~f:(fun loop_addr -> + assert_equal ~msg:in_loop_not_scc true + @@ Hash_set.mem scc_points loop_addr); + let in_scc_not_loop = "Found addr in scc not loop" in + Hash_set.iter scc_points ~f:(fun loop_addr -> + assert_equal ~msg:in_scc_not_loop true + @@ Hash_set.mem loop_points loop_addr) + +let test_scc test_ctxt = + let insn_map, insn_risg = init () in + let zero = Addr.(of_int ~width 0) in + let entry = Addr.(of_int ~width 1) in + let insn_risg = add_edge insn_risg zero entry in + let superset = + Superset_impl.of_components ~insn_map ~insn_risg arch in + let components = Grammar.addrs_of_filtered_loops superset in + assert_equal ~msg:"found non scc component" 0 (Set.length components) + +let test_find_conflicts test_ctxt = + let insn_map, insn_risg = init () in + let in_loop_addr = Addr.(of_int ~width 0x10) in + let num_conflicts = 6 in + let insn_map, insn_risg = + construct_tail_conflict insn_map insn_risg in_loop_addr + num_conflicts in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let conflicts = Superset.Occlusion.find_all_conflicts superset in + assert_equal Set.(length conflicts) num_conflicts + +let test_trim_scc test_ctxt = + let insn_map, insn_risg = init () in + let entry = Addr.(of_int ~width 1) in + let insn_map, insn_risg = + construct_loop insn_map insn_risg entry Addr.(entry ++ 20) in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let keep_entry = Superset.Core.mem superset entry in + let msg = sprintf "entry %s should be in the graph" + Addr.(to_string entry) in + assert_equal ~msg keep_entry true; + let in_loop_addr = Addr.(of_int ~width 0x10) in + let loop_points = Addr.Hash_set.create () in + Seq.iter (G.nodes insn_risg) ~f:(Hash_set.add loop_points); + let insn_map, insn_risg = + construct_tail_conflict insn_map insn_risg in_loop_addr 3 in + let conflicts_added = Addr.Hash_set.create () in + Seq.iter (G.nodes insn_risg) ~f:(fun vert -> + if not (Hash_set.mem loop_points vert) then + Hash_set.add conflicts_added vert); + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let components = Superset.ISG.raw_loops superset in + assert_bool "Should have a component" + (List.(length components) > 0); + let superset = + Grammar.tag_loop_contradictions ~min_size:1 superset in + assert_bool "should have marked conflict" + (0 < Superset.Inspection.(num_bad superset)); + let keep_entry = Superset.Inspection.is_bad_at superset entry in + let msg = sprintf "entry %s should not be marked bad" + Addr.(to_string entry) in + assert_equal ~msg keep_entry false; + let superset = Trim.run superset in + let keep_entry = Superset.Core.mem superset entry in + let msg = sprintf "entry %s should not be removed" + Addr.(to_string entry) in + assert_equal ~msg keep_entry true; + let conflicts_added_str = List.to_string ~f:Addr.to_string @@ + Hash_set.to_list conflicts_added in + let removed_msg = "of conflicts " ^ conflicts_added_str + ^ ", residual conflict present within " in + let isg_msg = Superset.ISG.isg_to_string superset in + let removed_msg = sprintf "%s\n%s" isg_msg removed_msg in + let (mlg, glm) = Superset.Inspection.unbalanced_diff superset in + let msg = sprintf "%s\nmlg: %d, glm: %d" removed_msg + Set.(length mlg) Set.(length glm) in + let set_to_string s = + List.to_string ~f:Addr.to_string @@ Set.to_list s in + let msg = sprintf "%s\nmap less graph: %s, graph less map: %s" + msg (set_to_string mlg) (set_to_string glm) in + assert_bool msg (Set.(length mlg)=0 && Set.(length glm)=0); + let loop_msg addr = + sprintf "%s\nloop addr %s should remain during tail trim" + (Superset.ISG.isg_to_string superset) + Addr.(to_string addr) in + Hash_set.iter loop_points ~f:(fun addr -> + assert_equal ~msg:(loop_msg addr) true @@ + Superset.Core.mem superset addr) + +(* Establishes, in the case of if structures, how topological *) +(* tranversal works - one time visit only *) +let test_topological_revisit ctxt = + let _, insn_risg = init () in + let width = 32 in + let start = Addr.of_int 0 ~width in + let stop = Addr.of_int 2 ~width in + let rec make_if insn_risg current stop = + if not Addr.(current = stop) then + let next = Addr.succ current in + let insn_risg = add_edge insn_risg current next in + make_if insn_risg next stop else insn_risg in + let insn_risg = make_if insn_risg start stop in + let insn_risg = add_edge insn_risg start stop in + let update_count addr visit_count = + match Map.find visit_count addr with + | Some (count) -> + let visit_count = Map.remove visit_count addr in + Map.set visit_count ~key:addr ~data:(count+1) + | None -> Map.set visit_count ~key:addr ~data:1 in + + let visit_count = Topological.fold + update_count insn_risg Addr.Map.empty in + Map.iteri visit_count ~f:(fun ~key ~data -> assert_equal ~ctxt data 1) + +let rec extend_back insn_map insn_isg ?(step=1) addr num = + let make_link len = + let dest = Addr.(addr -- len) in + let insn_isg = add_edge insn_isg addr dest in + let junk_data = String.make len ' ' in + let mem = create_memory arch dest junk_data |> ok_exn in + let insn_map = Map.set insn_map ~key:dest ~data:(mem, None) in + (insn_map, insn_isg) in + if not (num = 0) then + let (insn_map, insn_isg) = make_link step in + extend_back insn_map insn_isg Addr.(addr -- step) (num - 1) ~step + else + insn_map, insn_isg + +let make_extended_cross tail_addr = + let insn_map, insn_risg = init () in + let insn_map, insn_risg = + construct_tail_conflict insn_map insn_risg tail_addr 2 in + let layer_options = Seq.to_list G.Node.(succs tail_addr insn_risg) in + let insn_map, insn_risg = List.fold ~init:(insn_map, insn_risg) + layer_options ~f:(fun (insn_map, insn_risg) opt -> + extend_back insn_map insn_risg opt 1 ~step:2 + ) in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let extended_points = + Superset.entries_of_isg superset in + let _,insn_risg = Hash_set.fold ~init:(None,insn_risg) extended_points + ~f:(fun (current,insn_risg) next -> + let insn_risg = Option.value_map current ~f:(fun current -> + add_edge insn_risg current next + ) ~default:insn_risg in + Some(next),insn_risg + ) in + insn_map, insn_risg + +let construct_branch insn_map insn_risg branch_at incr = + let left = Addr.(branch_at ++ incr) in + let junk_data = String.make incr ' ' in + let left_mem = create_memory arch left junk_data |> ok_exn in + let insn_map = Map.set insn_map ~key:left ~data:(left_mem, None) in + let right = Addr.(left ++ incr) in + let right_mem = create_memory arch right junk_data |> ok_exn in + let insn_map = Map.set insn_map ~key:right ~data:(right_mem, None) in + let rejoin = Addr.(right ++ incr) in + let rejoin_mem = create_memory arch rejoin junk_data |> ok_exn in + let insn_map = Map.set insn_map ~key:rejoin ~data:(rejoin_mem, None) in + let insn_risg = add_edge insn_risg left branch_at in + let insn_risg = add_edge insn_risg right branch_at in + let insn_risg = add_edge insn_risg rejoin right in + let insn_risg = add_edge insn_risg rejoin left in + insn_map, insn_risg + +let test_branch_recognition test_ctxt = + let tail_addr = Addr.of_int ~width:addr_size 50 in + let insn_map, insn_risg = init () in + let insn_map, insn_risg = + construct_branch insn_map insn_risg tail_addr 2 in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let entries = Superset.entries_of_isg superset in + let msg = "expect at least one entry" in + assert_bool msg (Hash_set.(length entries) > 0); + let branches = Grammar.identify_branches superset in + let msg = sprintf + "expect branches to be detected! was %d" + Hash_set.(length branches) in + assert_bool msg (Hash_set.(length branches) = 1); + let msg = "expect exact branch addr to be detected!" in + assert_bool msg (Hash_set.(mem branches tail_addr)); + () + +let test_dfs_iter_order test_ctxt = + let insn_map, insn_risg = init () in + let start = Addr.of_int ~width:addr_size 40 in + let insn_risg = add_edge insn_risg start Addr.(succ start) in + let insn_risg = add_edge insn_risg start Addr.(start ++ 2) in + let visit_order = ref [] in + let superset = + Superset_impl.of_components ~insn_map ~insn_risg arch in + Traverse.with_ancestors_at + ~pre:(fun v -> visit_order := v :: !visit_order) + superset start; + visit_order := List.rev !visit_order; + let msg = sprintf "expected addr %s to be first, was %s" + Addr.(to_string start) + (List.to_string ~f:Addr.to_string !visit_order) in + match !visit_order with + | first :: _ -> + assert_equal ~msg first start + | _ -> assert_bool msg false + +(* conflicts should include both the instruction at a data address *) +(* and the instruction whose body it is inside. *) +let test_find_all_conflicts test_ctxt = + let insn_map, insn_risg = init () in + let tail_addr = Addr.of_int ~width:addr_size 50 in + let num_conflicts = 2 in + let insn_map, insn_risg = + construct_tail_conflict + insn_map insn_risg tail_addr num_conflicts in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let conflicts = Superset.Occlusion.find_all_conflicts superset in + let msg = sprintf "expect %d conflicts" num_conflicts in + assert_equal ~msg num_conflicts Set.(length conflicts) + + +(* Establish the idempotency or addition of edges. *) +let test_graph_edge_behavior test_ctxt = + let _, insn_risg = init () in + let start = Addr.of_int ~width:addr_size 50 in + let insn_risg = add_edge insn_risg start Addr.(succ start) in + let insn_risg = add_edge insn_risg start Addr.(succ start) in + let insn_risg = add_edge insn_risg start Addr.(succ start) in + let edges = Seq.filter (G.edges insn_risg) ~f:(fun e -> + Addr.((G.Edge.src e) = start) + && Addr.((G.Edge.dst e) = Addr.(succ start))) in + let msg = "expect single edge between nodes" in + assert_equal ~msg Seq.(length edges) 1 + +let test_streams test_ctxt = + let strm, sgnl = Stream.create () in + let called = ref false in + Stream.watch strm (fun id _ -> + called := true; + Stream.unsubscribe strm id + ); + Signal.send sgnl (); + assert_bool "stream did not receive signal called" !called + +let test_ssa test_ctxt = + let arch = Arch.(`x86_64) in + let make_chain bils = + let bils = List.rev bils in + let insn_map, insn_risg = init () in + let lifted = Addr.Table.create () in + let init = zero,insn_risg in + let _,insn_risg = + List.fold bils ~init ~f:(fun (accu,insn_risg) bil -> + Addr.Table.set lifted ~key:accu ~data:bil; + let s = Addr.succ accu in + let e = G.Edge.create accu s () in + s,G.Edge.insert e insn_risg + ) in + Superset_impl.of_components ~insn_map ~insn_risg ~lifted arch in + let find_ssa superset ~f = + let entries = Superset.entries_of_isg superset in + assert_bool "Expect >= 1 entry in superset" + ((Hash_set.length entries) > 0); + let fssa = Liveness.compute_liveness superset in + let ssa = Addr.Hash_set.create () in + List.iter Set.(to_list fssa) ~f:Hash_set.(add ssa); + f ssa in + let superset = + let open Bil in + let v = Var.create "v" @@ Bil.Types.Imm 32 in + let def = Bil.move v @@ Bil.Int zero in + let use = Bil.move v ((Bil.Var v) + (Bil.Int (Addr.succ zero))) in + make_chain [[def]; [use]] in + find_ssa superset ~f:(fun ssa_rax -> + assert_bool "Expect >= 1 ssa for move move" + ((Hash_set.length ssa_rax) > 0)); + let superset = + let open Bil in + let s = Size.(`r64) in + let mem = Bil.var @@ Var.create "mem" (Bil.Types.Imm 64) in + let exp = Bil.var @@ Var.create "v1" (Bil.Types.Imm 64) in + let addr = Var.create "addr" (Bil.Types.Imm 64) in + let mv_addr = Bil.move addr Bil.(exp + (Int zero)) in + let st = store ~mem ~addr:Bil.(Var addr) exp LittleEndian s; in + let stv = Var.create "st" @@ Bil.Types.Imm 32 in + let v2 = Var.create "v2" (Bil.Types.Imm 64) in + let move_v2 = Bil.(move v2 Bil.(exp + (Int zero))) in + let mem = Bil.var @@ Var.create "mem2" (Bil.Types.Imm 64) in + let ld = load ~mem ~addr:(Var v2) LittleEndian s in + let ldv = Var.create "ld" @@ Bil.Types.Imm 32 in + let st = Bil.move stv st in + let ld = Bil.move ldv ld in + make_chain [[mv_addr]; [st]; [move_v2]; [ld];] in + find_ssa superset ~f:(fun ssa_renamed -> + let actual = (Hash_set.length ssa_renamed) in + let msg = + sprintf + "Expect >= 2 ssa for recognition over renaming, was %d" + actual in + assert_bool msg (actual > 2)); + let superset = + let open Bil in + let s = Size.(`r64) in + let mem = Bil.var @@ Var.create "mem" (Bil.Types.Imm 64) in + let ld = load ~mem ~addr:(Int zero) LittleEndian s in + let ldv = Var.create "ld" @@ Bil.Types.Imm 32 in + let ld = [Bil.move ldv ld] in + let exp = Bil.var @@ Var.create "v1" (Bil.Types.Imm 64) in + let st = store ~mem ~addr:(Int zero) exp LittleEndian s; in + let stv = Var.create "st" @@ Bil.Types.Imm 32 in + let st = [Bil.move stv st] in + make_chain [ld; st] in + find_ssa superset ~f:(fun ssa_mem_mem -> + assert_bool "Expect >= 1 ssa for load store, address exp equal" + ((Hash_set.length ssa_mem_mem) > 0)); + let superset = + let open Bil in + let s = Size.(`r64) in + let mem = Bil.var @@ Var.create "mem" (Bil.Types.Imm 64) in + let ld = load ~mem ~addr:(Int zero) LittleEndian s in + let ldv = Var.create "ld" @@ Bil.Types.Imm 32 in + let ld = [Bil.move ldv ld] in + let exp = Bil.var @@ Var.create "v1" (Bil.Types.Imm 64) in + let st = store ~mem ~addr:(Int zero) exp LittleEndian s; in + let stv = Var.create "st" @@ Bil.Types.Imm 32 in + let st = [Bil.move stv st] in + make_chain [ld; st] in + find_ssa superset ~f:(fun ssa_mem_mem -> + assert_bool + "Expect >= 1 ssa for memory exp operation to same addr" + ((Hash_set.length ssa_mem_mem) > 0)); + let superset = + let open Bil in + let s = Size.(`r64) in + let mem = Bil.var @@ Var.create "x" (Bil.Types.Imm 64) in + let ld = load ~mem ~addr:(Int zero) LittleEndian s in + let ldv = Var.create "ld" @@ Bil.Types.Imm 32 in + let ld = [Bil.move ldv ld] in + let exp = Bil.((var ldv) + mem) in + let st = + store ~mem ~addr:(Int (Addr.succ zero)) exp LittleEndian s; in + let stv = Var.create "st" @@ Bil.Types.Imm 32 in + let st = [Bil.move stv st] in + make_chain [ld; st] in + find_ssa superset ~f:(fun ssa_load_store -> + assert_bool + "Expect >= 1 ssa for memory exp operation sharing variable" + ((Hash_set.length ssa_load_store) > 0)); + let superset = + let open Bil in + let v1 = Var.create "v2" @@ Bil.Types.Imm 32 in + let def1 = Bil.move v1 @@ Bil.Int zero in + let v2 = Var.create "v2" @@ Bil.Types.Imm 32 in + let def2 = Bil.move v2 @@ Bil.Var v1 in + let v3 = Var.create "v3" @@ Bil.Types.Imm 32 in + let use = Bil.move v3 ((Bil.Var v1) + (Bil.Var v2)) in + let step1 = [def1] in + let step2 = [def2] in + let step3 = [use] in + make_chain [step1; step2; step3; step3;] in + find_ssa superset ~f:(fun ssa_chain -> + assert_bool "Expect >= 2 ssa for register chain" + ((Hash_set.length ssa_chain) > 1)); + () + +let () = + let suite = + "suite">::: + [ + "test_hits_every_byte" >:: test_hits_every_byte; + "test_trim" >:: test_trim; + "test_trims_invalid_jump" >:: test_trims_invalid_jump; + "test_addr_map" >:: test_addr_map; + "test_insn_isg" >:: test_insn_isg; + "test_consistent_superset" >:: test_consistent_superset; + "test_loop_scc" >:: test_loop_scc; + "test_scc" >:: test_scc; + "test_find_conflicts" >:: test_find_conflicts; + "test_trim_scc" >:: test_trim_scc; + "test_topological_revisit" >:: test_topological_revisit; + "test_branch_recognition" >:: test_branch_recognition; + "test_dfs_iter_order" >:: test_dfs_iter_order; + "test_find_all_conflicts" >:: test_find_all_conflicts; + "test_graph_edge_behavior" >:: test_graph_edge_behavior; + "test_can_lift" >:: test_can_lift; + "test_static_successors_includes_fall_through" >:: + test_static_successors_includes_fall_through; + "test_brancher" >:: test_brancher; + "test_lift" >:: test_lift; + "test_successor_calculation" >:: test_successor_calculation; + "test_superset_contains_addr" >:: test_superset_contains_addr; + "test_target_not_in_mem" >:: test_target_not_in_mem; + "test_tag_non_insn" >:: test_tag_non_insn; + "test_tag_target_is_bad" >:: test_tag_target_is_bad; + "test_target_in_body" >:: test_target_in_body; + "test_streams" >:: test_streams; + "test_ssa" >:: test_ssa; + ] in + run_test_tt_main suite +;; diff --git a/oasis/superset-disasm b/oasis/superset-disasm new file mode 100644 index 000000000..9cc1a0ec1 --- /dev/null +++ b/oasis/superset-disasm @@ -0,0 +1,64 @@ +OASISFormat: 0.4 +Name: superset_disasm +Version: 0.1 +Synopsis: A superset disassembler that strives to make the minimum superset of disassembly possible, starting from every byte offset by default. +Authors: Kenneth Adam Miller +Maintainers: Kenneth Adam Miller +License: MIT +Plugins: META (0.4), DevFiles (0.4) +AlphaFeatures: ocamlbuild_more_args +BuildTools: ocamlbuild +XOCamlbuildExtraArgs: -j 5 +BuildDepends: + core_kernel, + bap, + ppx_inline_test, + graphlib, + landmarks, + bap-future, + zmq, + gnuplot + +Library superset_disassemblers + Path: src/ + Install: true + CompiledObject: best + BuildDepends: bap, core_kernel, graphlib, ppx_inline_test, bap-future #, landmarks.ppx + Modules: + Superset, + Metrics, + Cmdoptions, + Traverse, + Fixpoint, + Trim, + Invariants, + Grammar, + Liveness, + Heuristics, + Report, + Superset_impl, + Metadata + +Document "superset_disassemblers_api" + Type: ocamlbuild (0.4) + BuildTools: ocamldoc + Title: API reference for superset_disasm + XOCamlbuildPath: . + XOCamlbuildExtraArgs: + "-docflags '-colorize-code -short-functors -charset utf-8'" + XOCamlbuildLibraries: superset_disassemblers + +Executable plot_superset_cache + Path: src/ + Install: true + MainIs: plot_superset_cache.ml + CompiledObject: best + BuildDepends: core_kernel, bap, cmdliner, superset_disassemblers, findlib.dynload, gnuplot, bap-future, bap-knowledge #,landmarks.ppx + + +Executable test_superset_disasm + Path: unit_tests + MainIs: test_superset_disasm.ml + Install: false + BuildDepends: core_kernel, bap, cmdliner, oUnit, superset_disassemblers, graphlib, ppx_inline_test, bap-future, findlib.dynload + CompiledObject: best diff --git a/plugins/superset_disasm/cmdoptions.ml b/plugins/superset_disasm/cmdoptions.ml new file mode 100644 index 000000000..963fa9b2e --- /dev/null +++ b/plugins/superset_disasm/cmdoptions.ml @@ -0,0 +1,214 @@ +open Core +open Bap.Std +open Bap_knowledge +open Bap_core_theory + +module Dis = Disasm_expert.Basic + +let tag_loop_contradictions = + Grammar.tag_loop_contradictions ?min_size:None +let tag_grammar = + Grammar.tag_by_traversal ?threshold:None +let list_analyses = [ + "StronglyConnectedComponentData", tag_loop_contradictions; + "Grammar-convergent", tag_grammar; + ] + +type t = { + disassembler : string; + ground_truth_bin : string option; + target : string; + invariants : string list; + analyses : string list; + converge : bool; + protect : bool; + save_dot : bool; + tp_threshold : float; + rounds : int; + heuristics : string list; + find_fn_culprit : bool; +} [@@deriving sexp, fields, bin_io] +type opts = t [@@deriving sexp, bin_io] + +module Cache = struct + + let disasm_opts = + let package = "superset-disasm-cmdoptions" in + let opts_persistent = + Knowledge.Persistent.of_binable + (module struct type t = opts option [@@deriving bin_io] end) in + let attr ty persistent name desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent + ~public:true + ~desc in + let open Knowledge.Domain in + let disequal _ _ = false in + let opts = optional ~inspect:sexp_of_opts ~equal:disequal "opts ty" in + attr opts opts_persistent "disasm_opts" + "All command options given" + +end + +module type Provider = sig + val options : t +end + +module With_options(Conf : Provider) = struct + open Conf + open Or_error + open Format + + let () = + let open KB.Syntax in + KB.promise Cache.disasm_opts (fun o -> + KB.return (Some options) + ); + (match options.ground_truth_bin with + | Some bin -> + KB.promise Metrics.Cache.ground_truth_source + (fun _ -> KB.return bin); + | None -> ()) + + let fn_addrs = Addr.Hash_set.create () + + let check_false_negs superset name sym_label = + let open KB.Syntax in + if options.find_fn_culprit then ( + KB.collect Metrics.Cache.ground_truth sym_label >>= + fun gt -> + match gt with + | None -> KB.return () + | Some gt -> ( + let missing = + Set.fold ~init:0 gt ~f:(fun tot addr -> + if not (Hash_set.mem fn_addrs addr) && + Superset.Core.mem superset addr then + tot + else ( + Hash_set.add fn_addrs addr; + tot+1 + ) + ) in + if missing <> 0 then ( + KB.return @@ + print_endline + @@ sprintf "%s triggered %d false negs" + name missing; + ) else KB.return () + ) + ) else KB.return () + + let checkpoint ?addrs bin invariants = + let backend = options.disassembler in + let invariants = List.map invariants ~f:snd in + let invariants = Invariants.tag_success ::invariants in + let f = Invariants.tag ~invariants in + Superset.superset_disasm_of_file ?addrs ~backend bin ~f + + let args_to_funcs args funcs = + List.filter_map args + ~f:(fun arg -> + List.find funcs ~f:(fun (name,f) -> + String.equal arg name + ) + ) + + let invariants = + args_to_funcs options.invariants Invariants.default_tags + + let analyses = args_to_funcs options.analyses list_analyses + + let with_analyses superset analyses ~g = + let open KB.Syntax in + List.fold analyses ~init:superset + ~f:(fun superset (name,analyze) -> + superset >>= fun superset -> + let superset = analyze superset in + g superset name >>= fun () -> + KB.return superset + ) + + let with_options () = + let open KB.Syntax in + Superset.Cache.sym_label >>= fun sym_label -> + KB.collect Superset.Cache.superset_graph sym_label + >>= fun graph -> + let superset = + match graph with + | None -> + let superset = checkpoint options.target invariants in + let () = Metrics.set_ground_truth superset in + let trim = Trim.run in + let superset = trim superset in + check_false_negs superset "Invariants " sym_label + >>= fun () -> + let g superset name = + let superset = trim superset in + check_false_negs superset name sym_label in + let superset = KB.return superset in + with_analyses superset analyses ~g >>= fun superset -> + KB.promise Superset.Cache.superset_graph + (fun _ -> KB.return @@ Some Superset.ISG.(to_list superset)); + KB.return superset + | Some graph -> + let graph = Seq.of_list graph in + let graph = + Seq.concat @@ Seq.map graph ~f:(fun (s,d) -> + Seq.of_list [s;d] + ) in + let superset = checkpoint ~addrs:graph options.target [] in + check_false_negs superset + "stored in persistent graph with fns" + sym_label >>= fun () -> + KB.return @@ superset in + + let f superset = + Heuristics.with_featureset options.heuristics superset + ~init:(superset) + ~f:(fun fname feature superset -> + superset >>= fun superset -> + KB.return @@ Trim.run @@ feature superset + ) in + Fixpoint.iterate options.rounds f superset >>= fun superset -> + let pnts_of_percent prcnt = + Int.of_float (1.0/.(1.0-.prcnt)) in + let threshold = (pnts_of_percent options.tp_threshold) in + let f pmap featureset superset = + let total_of_features l = + List.fold ~init:0 ~f:(fun x (y,_,_) -> x + y) l in + let feature_pmap = + Map.map pmap ~f:(total_of_features) in + let feature_pmap = + Map.filter feature_pmap ~f:(fun total -> + (total > threshold)) in + Report.collect_distributions superset threshold pmap; + let superset = + if options.converge then ( + let f superset = + let superset = + if options.protect then ( + Fixpoint.protect superset (fun superset -> + Fixpoint.converge superset options.heuristics + feature_pmap + ) + ) else + Fixpoint.converge superset options.heuristics + feature_pmap in + Trim.run superset in + check_false_negs superset + "feature convergence " + sym_label >>= fun () -> + KB.return @@ Fixpoint.iterate options.rounds f superset + ) else KB.return superset in + superset >>= fun superset -> + Metrics.compute_metrics superset; + KB.return superset + in + Heuristics.with_featurepmap options.heuristics superset + ~f + + let main = with_options + +end diff --git a/plugins/superset_disasm/dune b/plugins/superset_disasm/dune new file mode 100644 index 000000000..6a0acda64 --- /dev/null +++ b/plugins/superset_disasm/dune @@ -0,0 +1,13 @@ +(library + (name bap_superset_disasm_plugin) + (public_name bap-superset-disasm.plugin) + (preprocess (pps ppx_bap)) + (libraries bap bap_superset_disasm ) +) + +(plugin + (name superset-disasm) + (package bap-superset-disasm) + (libraries bap-superset-disasm bap-superset-disasm.plugin) + (site (bap-common plugins)) +) diff --git a/plugins/superset_disasm/plot_superset_cache.ml b/plugins/superset_disasm/plot_superset_cache.ml new file mode 100644 index 000000000..828f2f04c --- /dev/null +++ b/plugins/superset_disasm/plot_superset_cache.ml @@ -0,0 +1,135 @@ +open Core +open Bap.Std +open Regular.Std +open Bap_knowledge +open Bap_core_theory +open Monads.Std + +let transform_summaries summaries = + let open Metrics in + let summaries = + List.filter_map summaries ~f:(fun s -> + match s.size, s.occ, s.occ_space, s.fe, s.clean, s.fns, s.fps, + s.tps, s.time with + | None, _, _, _, _, _, _, _, _ -> None + | _, None, _, _, _, _, _, _, _ -> None + | _, _, None, _, _, _, _, _, _ -> None + | _, _, _, None, _, _, _, _, _ -> None + | _, _, _, _, None, _, _, _, _ -> None + | _, _, _, _, _, None, _, _, _ -> None + | _, _, _, _, _, _, None, _, _ -> None + | _, _, _, _, _, _, _, None, _ -> None + | _, _, _, _, _, _, _, _, None -> None + | Some size, Some occ, Some occ_space, Some fe, Some clean, + Some fns, Some fps, Some tps, Some time -> + Some (size, occ, occ_space, fe, clean, fns, fps, tps, time) + ) in + List.fold summaries ~init:([],[],[],[],[],[],[],[],[]) + ~f:(fun (sizes,occ,occ_space,fe,clean,fns,fps,tps,time) s -> + let _size,_occ,_occ_space,_fe,_clean,_fns,_fps,_tps,_time = s in + _size :: sizes, _occ :: occ, _occ_space :: occ_space, + _fe :: fe,_clean :: clean,_fns :: fns,_fps :: fps,_tps :: tps, + _time :: time + ) + + (* Plots: + binary size to occlusive rate (occlusion by occ space) + occlusive space to occlusion + scatter plot occlusive count and number of occ functions + size and processing time + least value required for safe convergence + number of binaries and occ rate + *) +let make_plots summaries = + let open Metrics in + let sizes,occ,occ_space,fe,clean,fns,fps,tps,time = + transform_summaries summaries in + let make_plot xlabel ylabel fname x y = + let x = List.map x ~f:float_of_int in + let y = List.map y ~f:float_of_int in + match List.zip x y with + | Ok data -> + let title = (xlabel ^ " and " ^ ylabel) in + let labels = Gnuplot.Labels.create ~x:xlabel ~y:ylabel () in + let color = Gnuplot.Color.(`Blue) in + let plot = Gnuplot.Series.points_xy ~title ~color data in + let gp = Gnuplot.create () in + let output = (Gnuplot.Output.create (`Png (title ^ ".png"))) in + Gnuplot.set ~use_grid:true gp; + Gnuplot.plot ~output ~labels gp plot; + Gnuplot.close gp; + () + | _ -> () in + make_plot "Size" "Occlusion" "size_and_occlusion.png" sizes occ; + make_plot "Actual Occlusion" "Possible Occlusion" + "occlusion_and_occspace.png" occ occ_space; + let () = + match List.map2 fe clean ~f:(fun x y -> x - y) with + | List.Or_unequal_lengths.Ok occfuncs -> + make_plot "Total Occlusion" "# Unclean functions" + "occcnt_occfuncs.png" occ occfuncs + | _ -> () in + make_plot "Size" "Time" "size_time.png" sizes time; + () + +let knowledge_reader = Data.Read.create + ~of_bigstring:Knowledge.of_bigstring () + +let knowledge_writer = Data.Write.create + ~to_bigstring:Knowledge.to_bigstring () + +let knowledge_cache () = + Data.Cache.Service.request + knowledge_reader + knowledge_writer + +let load_cache_with_digest cache digest = + match Data.Cache.load cache digest with + | None -> false + | Some state -> + Toplevel.set state; + true + +let import_knowledge_from_cache digest = + let cache = knowledge_cache () in + load_cache_with_digest cache digest + +let make_digest inputs = + let inputs = String.concat inputs in + fun ~namespace -> + let d = Data.Cache.Digest.create ~namespace in + Data.Cache.Digest.add d "%s" inputs + +let compute_digest target disasm = + make_digest [ + Caml.Digest.file target; + disasm; + ] ~namespace:"knowledge" + +let summaries_of_files print_fn_bins tgt fs = + List.fold fs ~init:[] ~f:(fun ls lf -> + let digest = compute_digest lf tgt in + if import_knowledge_from_cache digest then + let sum = Metrics.get_summary () in + let fns = Option.value sum.fns ~default:0 in + if print_fn_bins && fns > 0 then + print_endline lf; + sum :: ls + else ( + print_endline @@ sprintf "%s not present in cache" lf; + ls + ) + ) + +let plot_summaries summaries = + make_plots summaries; + let sizes,occ,occ_space,fe,clean,fns,fps,tps,time = + transform_summaries summaries in + let tot_fns = List.fold fns ~init:0 ~f:(+) in + let tot_occ = List.fold occ ~init:0 ~f:(+) in + let tot_occ_space = List.fold occ_space ~init:0 ~f:(+) in + let avg_occ = + (float_of_int tot_occ) /. (float_of_int tot_occ_space) in + printf "fns: %d, total occ space %d, avg occ: %f\n" + tot_fns tot_occ_space avg_occ + diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml new file mode 100644 index 000000000..70cf5b447 --- /dev/null +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -0,0 +1,553 @@ +open Core +open Bap.Std +open Regular.Std +open Bap_knowledge +open Bap_core_theory +open Monads.Std +open Cmdoptions +open Bap_main +open Bap_plugins.Std + +include Self() +module Dis = Disasm_expert.Basic + +let man = {| + # DESCRIPTION + + Superset disassembly is a disassembly method in which every single + byte offset within the executable region of a binary is initially + treated as being potentially compiler intended output. However, + after applying several rounds of heuristics the true positives, or + the compiler intended instructions, can be distinguished from the + noise. It is an alternate disassembly method from linear sweep or + recursive descent, the two (probably most) popular mainstream + disassembly methods. This approach exchanges the possibility of + some small portion of the final output including some occlusive + unintended sequences being incorrectly kept (a superset) for the + probabilistic guarantee of having no misses of those that are + intended. + + Heuristics are broken into three main groups: invariants, analyses, + and heuristics. Invariants are ideally lawful characteristics of + binary code, where disobedience is illegal for any well formed + assembler, and run with a limited scope/visibility of just + instructions. Analyses are typically processes that identify less + visible violations of well-formed assembler rules or other lawful + assembler characteristics that require global visibility. Heuristics + are data traits that may be dirty and require some iterative + convergence to recognize a subset within the initial superset that + can be guaranteeably cleansed. Once convergence occurs, the bodies + of lineages with sufficient evidence are cleansed of occlusion, and + any lineage that does not have enough features to support being kept + is dropped. + + # ARGUMENTS + + Fundamental arguments specific to the superset disassembler include: + rounds, loader, ground_truth_bin, ground_truth_file, target, + invariants, analyses, trim_method, tp_threshold, save_addrs, + save_gt, save_dot, collect_report + + # PASSES + + Passes are not run by the superset disassembler, but the output can + be fed into the regular disassembly pipeline by making use of the + cache. At that point, an analysis pass can be run, and it isn't + meaningful to try to run an analysis pass on the raw superset alone + because it does not reconstruct the full project type. + + # OUTPUT + + The resulting project data structure could be dumped using the + $(b,--dump) (or $(b,-d) for short) option, which accepts the desired + format and, optionally, the output file name. + + It is possible to specify the $(b,--dump) option multiple times, in + which case the project will be dumped in several formats. + + ``` + bap superset-disasm /bin/echo -dasm:out.asm + ``` + |} + +let superset_disasm options = + let module With_options = + With_options(struct + let options = options + end) in + let t = Stdlib.Sys.time() in + let open KB.Syntax in + With_options.with_options () >>= fun superset -> + KB.promise Metrics.Cache.time (fun o -> + KB.return (Some (int_of_float (Stdlib.Sys.time() -. t)))); + (* Provide the is_valid label as a check on whether a given + address is in the superset after trimming *) + KB.promise Theory.Label.is_valid @@ (fun label -> + (* (target is just the machine target) *) + Theory.Label.target label >>= fun tgt -> + (* For each address the in the knowledge base *) + (* Collect the is_valid's label address *) + KB.collect Theory.Label.addr label >>= fun addr -> + match addr with + | Some addr -> + let addr = (Word.code_addr tgt addr) in + (* And return whether it should be kept or not *) + KB.return @@ Some (Superset.Core.mem superset addr) + | None -> KB.return None + ); + KB.return () + +let features_used = [ + "disassembler"; + "lifter"; + "symbolizer"; + "brancher"; + "loader"; + "abi"; +] + +type failure = + | Expects_a_regular_file of string + | Incompatible_options of string * string + | Project of Error.t + | Unknown_format of string + | Unavailable_format_version of string + | Unknown_collator of string + | Unknown_analysis of string + | No_knowledge of string + +type Extension.Error.t += Fail of failure + +module Err = Monad.Result.Make(Extension.Error)(Monad.Ident) +open Err.Syntax + +let proj_error = Result.map_error ~f:(fun err -> Fail (Project err)) + +let knowledge_reader = Data.Read.create + ~of_bigstring:Knowledge.of_bigstring () + +let knowledge_writer = Data.Write.create + ~to_bigstring:Knowledge.to_bigstring () + +let knowledge_cache () = + Data.Cache.Service.request + knowledge_reader + knowledge_writer + +let load_cache_with_digest cache digest = + match Data.Cache.load cache digest with + | None -> false + | Some state -> + info "importing knowledge from cache"; + Toplevel.set state; + true + +let import_knowledge_from_cache digest = + let digest = digest ~namespace:"knowledge" in + info "looking for knowledge with digest %a" + Data.Cache.Digest.pp digest; + let cache = knowledge_cache () in + load_cache_with_digest cache digest + +let store_knowledge_in_cache digest = + let digest = digest ~namespace:"knowledge" in + info "caching knowledge with digest %a" + Data.Cache.Digest.pp digest; + let cache = knowledge_cache () in + Toplevel.current () |> + Data.Cache.save cache digest + +let load_knowledge digest = function + | None -> import_knowledge_from_cache digest + | Some path when not (Stdlib.Sys.file_exists path) -> + import_knowledge_from_cache digest + | Some path -> + info "importing knowledge from %S" path; + Toplevel.set @@ Knowledge.load path; + true + +let outputs = + Extension.Command.parameters + ~doc:"Dumps the program to (defaults to stdout) \ + in the format (defaults to bir)." + ~as_flag:"bir" + ~aliases:["d"] + Extension.Type.("[[:]]" %: string) + "dump" + +let rw_file = Extension.Type.define + ~name:"" ~print:Fn.id ~parse:Fn.id + ~digest:(fun path -> + if Stdlib.Sys.file_exists path + then Caml.Digest.file path + else Caml.Digest.string "empty") + "" + +let update = + Extension.Command.flag "update" ~aliases:["u"] + ~doc: "Preserve the knowledge base, i.e., do not change it." + +let knowledge = + Extension.Command.parameter + ~doc:"Import the knowledge to the provided knowledge base. \ + If the $(b,--update) flag is set the knowledge base \ + will be also updated with the new information. If \ + $(b,--update) is set, the knowledge base might not \ + exist and it will be created" + ~aliases:["k"; "knowledge-base";] + (Extension.Type.some rw_file) "project" + +let input = Extension.Command.argument + ~doc:"The input file" Extension.Type.("FILE" %: string =? "a.out" ) + +let loader = + Extension.Command.parameter + ~doc:"Use the specified loader. + Use the loader `raw' to load unstructured files" + Extension.Type.(string =? "llvm") + "loader" + +let target = + let t = Extension.Type.define Theory.Target.unknown + ~name:"NAME" + ~digest:(fun t -> Caml.Digest.string@@Theory.Target.to_string t) + ~parse:(fun s -> match Theory.Target.lookup ~package:"bap" s with + | Some t -> t + | None -> + invalid_argf "unknown target %S, please see \ + `bap list targets' for the full list \ + of targets" s ()) + ~print:Theory.Target.to_string in + Extension.Command.parameter t "target" + ~doc:"Refines the target architecture of the binary. \ + See `bap list targets` for the full hierarchy of targets. \ + The specified target must be a refinement of the actual \ + target stored in the binary, otherwise an error is signaled." + +let validate_input file = + Result.ok_if_true (Stdlib.Sys.file_exists file) + ~error:(Fail (Expects_a_regular_file file)) + +let validate_knowledge update kb = match kb with + | None -> + Ok () + | Some path -> + let error = + Fail (No_knowledge "No initial knowledge to update") in + Result.ok_if_true (Stdlib.Sys.file_exists path || update) ~error + +let option_digest f = function + | None -> "none" + | Some s -> f s + +module Dump_formats = struct + let parse_fmt fmt = + match String.split ~on:'-' fmt with + | [fmt;ver] -> fmt, Some ver + | _ -> fmt,None + + let flatten (x,(y,z)) = x,y,z + + let split str = match String.split ~on:':' str with + | [fmt;dst] -> flatten (`file dst,parse_fmt fmt) + | _ -> flatten (`stdout,parse_fmt str) + + let parse_format str = + let (_,fmt,ver) as r = split str in + match Project.find_writer ?ver fmt with + | Some _ -> Ok r + | None -> match Project.find_writer fmt with + | None -> Error (Fail (Unknown_format fmt)) + | Some _ -> Error (Fail (Unavailable_format_version fmt)) + + let parse outputs = + Result.all @@ + List.map outputs ~f:parse_format +end + +let make_digest inputs = + let inputs = String.concat inputs in + fun ~namespace -> + let d = Data.Cache.Digest.create ~namespace in + Data.Cache.Digest.add d "%s" inputs + +let compute_digest target disasm = + make_digest [ + Caml.Digest.file target; + disasm; + ] + +let superset_digest options = + let open Cmdoptions in + compute_digest options.target options.disassembler + +let create_and_process kb options = + let digest = superset_digest options in + let _ = load_knowledge digest kb in + let () = Toplevel.exec @@ superset_disasm options in + let ro = Metrics.Cache.reduced_occlusion in + let _ = Toplevel.eval ro Metrics.Cache.sym_label in + let _ = Toplevel.eval Metrics.Cache.size Metrics.Cache.sym_label in + let summary = Metrics.get_summary () in + print_endline @@ + sprintf "%s %s" options.target @@ + Sexp.to_string @@ Metrics.sexp_of_t summary; + store_knowledge_in_cache (superset_digest options) + +let rounds = + let doc = "Number of analysis cycles" in + Extension.Command.parameter ~doc Extension.Type.(int =? 2) "rounds" + +let tp_threshold = + let doc = "The threshold at which convergence occurs" in + Extension.Command.parameter ~doc Extension.Type.(float =? 0.99) + "threshold" + +let heuristics = + let doc = + "Specify the features used to converge upon the true positive set" in + Extension.Command.parameter ~doc + Extension.Type.(list string =? Heuristics.defaults) + "heuristics" + +let invariants = + let doc = "Specify the desired invariants to apply to the superset" in + let deflt = List.map Invariants.default_tags ~f:fst in + Extension.Command.parameter ~doc + Extension.Type.(list string =? deflt) "invariants" + +let ground_truth_bin = + let doc = ("Compare results against a ground truth constructed" ^ + " from debug symbols of an unstripped binary") in + Extension.Command.parameter + ~doc Extension.Type.("" %: (some string) =? None) + "ground-truth-bin" + +let analyses = + let deflt = ["Strongly Connected Component Data"] in + Extension.Command.parameter + Extension.Type.(list string =? deflt) "analyses" + +let save_dot = Extension.Command.flag "save-dot" + +let converge = + Extension.Command.flag "converge" + +let protect = + Extension.Command.flag "protect" + +let find_fn_culprit = Extension.Command.flag "find-fn-culprit" + +let _superset_disassemble_command : unit = + let args = + let open Extension.Command in + args $input $outputs $loader $update $knowledge + $ground_truth_bin $invariants $analyses $tp_threshold $heuristics + $save_dot $rounds $converge $protect $find_fn_culprit + in + Extension.Command.declare ~doc:man "superset-disasm" + ~requires:features_used args @@ + fun input outputs loader update kb + ground_truth_bin invariants analyses tp_threshold heuristics + save_dot rounds converge protect find_fn_culprit ctxt -> + let converge = not converge in + let protect = not protect in + let options = + Fields.create ~disassembler:loader + ~ground_truth_bin ~target:input ~save_dot ~tp_threshold + ~rounds ~heuristics ~analyses + ~converge ~protect ~invariants ~find_fn_culprit in + validate_knowledge update kb >>= fun () -> + validate_input input >>= fun () -> + Dump_formats.parse outputs >>= fun outputs -> + Ok (create_and_process kb options) + + +let inputs = + Extension.Command.argument + ~doc:"The input files" + Extension.Type.("FILES" %: string =? "files.txt" + ) + +let print_fn_bins = + Extension.Command.flag "print-fn-bins" + +exception Missing_file of string + +let _graph_metrics : unit = + let args = + let open Extension.Command in + args $inputs $print_fn_bins $loader + in + let cmdname = "supersetd-graph-metrics" in + let doc = sprintf + {| The %s command iterates over the list of input files to + disassemble and accesses the cache for each file to + collect important metrics. These metrics are then + graphed with gnuplot, and saved in the current directory. |} + cmdname in + Extension.Command.declare ~doc cmdname + ~requires:features_used args @@ + fun files_list print_fn_bins tgt ctxt -> + let inputs = Stdlib.In_channel.with_open_text files_list + In_channel.input_lines in + let is_missing x = + not (Stdlib.Sys.file_exists x) in + let missing = List.find inputs ~f:is_missing in + let () = + match missing with + | None -> () + | Some f -> + print_endline @@ sprintf "%s is missing" f; + raise (Missing_file f); + in + let summaries = Plot_superset_cache.summaries_of_files + print_fn_bins tgt inputs in + Ok (Plot_superset_cache.plot_summaries summaries) + +let metrics = + let doc = + sprintf "%s%s%s%s" + "The format string specifying how to print the metrics" + ", which include: clean_functions, true_positives, " + "false_positives, false_negatives, reduced_occlusion, " + " occlusive_space, and function_entrances " in + Extension.Command.parameter ~doc + Extension.Type.(some (list string)) "metrics" + +let _print_metrics_command : unit = + let args = + let open Extension.Command in + args $input $outputs $loader $update $knowledge + $ground_truth_bin $invariants $analyses + $tp_threshold $heuristics $rounds + $converge $metrics $find_fn_culprit in + let man = "Perform computational operations on the cache" in + Extension.Command.declare ~doc:man "supersetd-print-metrics" + ~requires:features_used args @@ + fun input outputs loader update kb + ground_truth_bin invariants + analyses tp_threshold heuristics rounds + converge metrics find_fn_culprit + ctxt -> + validate_knowledge update kb >>= fun () -> + validate_input input >>= fun () -> + Dump_formats.parse outputs >>= fun outputs -> + let options = + Fields.create ~disassembler:loader + ~ground_truth_bin ~target:input ~save_dot:false ~tp_threshold + ~rounds ~heuristics ~analyses ~converge:false ~protect:false + ~invariants ~find_fn_culprit in + let digest = superset_digest options in + let _ = load_knowledge digest kb in + let map_opt = + function | None -> "unknown" | Some v -> sprintf "%d" v in + let open KB.Syntax in + Toplevel.exec @@ + (match metrics with + | Some metrics -> + Metrics.Cache.sym_label >>= (fun label -> + let oc_space = Metrics.Cache.occlusive_space in + let ro = Metrics.Cache.reduced_occlusion in + let fns = Metrics.Cache.false_negatives in + let fps = Metrics.Cache.false_positives in + let tps = Metrics.Cache.true_positives in + let slots = [ oc_space ; ro; fns; fps; tps ] in + let slots = + List.map slots ~f:(fun slt -> + KB.collect slt label >>= fun d -> + KB.return @@ + ((KB.Name.show @@ KB.Slot.name slt),(map_opt d)) + ) in + let fe = Metrics.Cache.function_entrances in + let clean = Metrics.Cache.clean_functions in + KB.all @@ List.append slots @@ + List.map [fe; clean] ~f:(fun slt -> + KB.collect slt label >>= fun d -> + let d = Option.map d ~f:Set.length in + KB.return @@ + ((KB.Name.show @@ KB.Slot.name slt),(map_opt d)) + ) >>= fun slots -> + let metric_vals = + List.fold ~init:String.Map.empty slots + ~f:(fun m (name,v) -> String.Map.set m ~key:name ~data:v) in + let fmt,rem = List.hd metrics, List.tl metrics in + let s = + match fmt, rem with + | Some fmt, Some rem -> + let init = fmt,1 in + let s,_=List.fold rem ~init ~f:(fun (fmt,v) s -> + let r = Str.regexp @@ sprintf "%%%d" v in + let opts = + List.to_string (Map.keys metric_vals) + ~f:Fn.id in + let default = + sprintf + "\"%s\" is not a metric, opts: %s" s opts in + let s = Map.find metric_vals s in + let s = Option.value s ~default in + Str.global_replace r s fmt,v+1 + ) in s + | _ -> "inproper arguments given to metrics" in + print_endline s; + KB.return () + ) + | None -> KB.return () + ); + Ok () + +let show_cache_digest = + Extension.Command.flag "show_cache_digest" + +let reset_cache = + Extension.Command.flag "reset_cache" + +let is_present = + Extension.Command.flag "verify_cache" + +let _cache_command : unit = + let args = + let open Extension.Command in + args $input $outputs $loader $update $knowledge + $show_cache_digest $reset_cache $is_present + in + let man = "Apply operations to the superset cache" in + Extension.Command.declare ~doc:man "supersetd-cache" + ~requires:features_used args @@ + fun input outputs loader update kb + show_cache_digest reset_cache verify_cache + ctxt -> + let get_raw_digest bin = + compute_digest bin loader ~namespace:"knowledge" in + let get_digest bin = + let d = get_raw_digest bin in + Data.Cache.Digest.to_string d in + let () = + if verify_cache then + let lines = In_channel.read_lines input in + List.iter lines ~f:(fun bin -> + let d = get_raw_digest bin in + let cache = knowledge_cache () in + let b = load_cache_with_digest cache d in + print_endline @@ sprintf "%s Present in cache: %b" bin b; + ) + else () in + let () = + if reset_cache then + let path = get_digest input in + let cachedir = Bap_main.Extension.Configuration.cachedir in + let l = [ ""; "/data/"; "/data2/"] in + List.iter l ~f:(fun s -> + let path = cachedir ^ "/" ^ s ^ "/" ^ path in + try + Stdlib.Sys.remove path; + with _ -> () + ); + else () in + let () = + if show_cache_digest then + let path = get_digest input in + printf "%s\n%!" path + else () in + Ok () +