From d8506ecddb1082b719e31f3b37935ac9f3c6b463 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 2 May 2025 12:21:20 -0500 Subject: [PATCH 01/31] Snapshot of bap_superset_disasm building with dune --- bap-superset-disasm.opam | 38 + dune-project | 18 + lib/bap_superset_disasm/abstract_ssa.ml | 153 ++++ lib/bap_superset_disasm/cmdoptions.ml | 158 ++++ lib/bap_superset_disasm/cmdoptions.mli | 0 lib/bap_superset_disasm/decision_trees.ml | 292 +++++++ lib/bap_superset_disasm/dune | 19 + lib/bap_superset_disasm/features.ml | 765 ++++++++++++++++ lib/bap_superset_disasm/features.mli | 0 lib/bap_superset_disasm/fixpoint.ml | 28 + lib/bap_superset_disasm/fixpoint.mli | 6 + lib/bap_superset_disasm/grammar.ml | 202 +++++ lib/bap_superset_disasm/grammar.mli | 8 + lib/bap_superset_disasm/heuristics.ml | 471 ++++++++++ lib/bap_superset_disasm/heuristics.mli | 6 + lib/bap_superset_disasm/invariants.ml | 103 +++ lib/bap_superset_disasm/invariants.mli | 12 + lib/bap_superset_disasm/liveness.ml | 56 ++ lib/bap_superset_disasm/liveness.mli | 3 + lib/bap_superset_disasm/metrics.ml | 308 +++++++ lib/bap_superset_disasm/metrics.mli | 52 ++ lib/bap_superset_disasm/report.ml | 103 +++ lib/bap_superset_disasm/report.mli | 3 + lib/bap_superset_disasm/superset.ml | 653 ++++++++++++++ lib/bap_superset_disasm/superset.mli | 325 +++++++ lib/bap_superset_disasm/superset_impl.ml | 241 +++++ lib/bap_superset_disasm/traverse.ml | 82 ++ lib/bap_superset_disasm/traverse.mli | 16 + lib/bap_superset_disasm/trim.ml | 27 + lib/bap_superset_disasm/trim.mli | 1 + lib_test/bap_superset_disasm/dune | 5 + .../test_superset_disasm.ml | 823 ++++++++++++++++++ oasis/superset-disasm | 64 ++ plugins/superset_disasm/build.sh | 5 + plugins/superset_disasm/install.sh | 1 + plugins/superset_disasm/run.sh | 1 + .../superset_disasm/superset_disassembler.ml | 616 +++++++++++++ 37 files changed, 5664 insertions(+) create mode 100644 bap-superset-disasm.opam create mode 100644 lib/bap_superset_disasm/abstract_ssa.ml create mode 100644 lib/bap_superset_disasm/cmdoptions.ml create mode 100644 lib/bap_superset_disasm/cmdoptions.mli create mode 100644 lib/bap_superset_disasm/decision_trees.ml create mode 100644 lib/bap_superset_disasm/dune create mode 100644 lib/bap_superset_disasm/features.ml create mode 100644 lib/bap_superset_disasm/features.mli create mode 100644 lib/bap_superset_disasm/fixpoint.ml create mode 100644 lib/bap_superset_disasm/fixpoint.mli create mode 100644 lib/bap_superset_disasm/grammar.ml create mode 100644 lib/bap_superset_disasm/grammar.mli create mode 100644 lib/bap_superset_disasm/heuristics.ml create mode 100644 lib/bap_superset_disasm/heuristics.mli create mode 100644 lib/bap_superset_disasm/invariants.ml create mode 100644 lib/bap_superset_disasm/invariants.mli create mode 100644 lib/bap_superset_disasm/liveness.ml create mode 100644 lib/bap_superset_disasm/liveness.mli create mode 100644 lib/bap_superset_disasm/metrics.ml create mode 100644 lib/bap_superset_disasm/metrics.mli create mode 100644 lib/bap_superset_disasm/report.ml create mode 100644 lib/bap_superset_disasm/report.mli create mode 100644 lib/bap_superset_disasm/superset.ml create mode 100644 lib/bap_superset_disasm/superset.mli create mode 100644 lib/bap_superset_disasm/superset_impl.ml create mode 100644 lib/bap_superset_disasm/traverse.ml create mode 100644 lib/bap_superset_disasm/traverse.mli create mode 100644 lib/bap_superset_disasm/trim.ml create mode 100644 lib/bap_superset_disasm/trim.mli create mode 100644 lib_test/bap_superset_disasm/dune create mode 100644 lib_test/bap_superset_disasm/test_superset_disasm.ml create mode 100644 oasis/superset-disasm create mode 100755 plugins/superset_disasm/build.sh create mode 100755 plugins/superset_disasm/install.sh create mode 100755 plugins/superset_disasm/run.sh create mode 100644 plugins/superset_disasm/superset_disassembler.ml diff --git a/bap-superset-disasm.opam b/bap-superset-disasm.opam new file mode 100644 index 000000000..d227ffb05 --- /dev/null +++ b/bap-superset-disasm.opam @@ -0,0 +1,38 @@ +# This file is generated by dune, edit dune-project instead +opam-version: "2.0" +version: "dev" +synopsis: "A bap disassembler that converges on a minimal superset." +maintainer: ["Ivan Gotovchits "] +authors: ["The BAP Team"] +license: "MIT" +homepage: "https://github.com/BinaryAnalysisPlatform/bap" +bug-reports: "https://github.com/BinaryAnalysisPlatform/bap/issues" +depends: [ + "dune" {>= "3.1"} + "core_kernel" + "bap" + "ppx_inline_test" + "graphlib" + "landmarks" + "bap-future" + "zmq" + "gnuplot" + "odoc" {with-doc} +] +build: [ + ["dune" "subst"] {dev} + [ + "dune" + "build" + "-p" + name + "-j" + jobs + "--promote-install-files=false" + "@install" + "@runtest" {with-test} + "@doc" {with-doc} + ] + ["dune" "install" "-p" name "--create-install-files" name] +] +dev-repo: "git+https://github.com/BinaryAnalysisPlatform/bap.git" diff --git a/dune-project b/dune-project index 52e68ced4..99b2141b2 100644 --- a/dune-project +++ b/dune-project @@ -143,6 +143,24 @@ (ocaml (> 4.08.0)) (stdio (and (>= v0.14) (< v0.16))))) +(package + (name bap-superset-disasm) + (synopsis "A bap disassembler that converges on a minimal superset.") + (sites + (lib plugins) + ) + (depends + core_kernel + bap + ppx_inline_test + graphlib + landmarks + bap-future + zmq + gnuplot) +) + + (package (name bap-primus-support) (synopsis "Provides essential Primus components") diff --git a/lib/bap_superset_disasm/abstract_ssa.ml b/lib/bap_superset_disasm/abstract_ssa.ml new file mode 100644 index 000000000..17921c981 --- /dev/null +++ b/lib/bap_superset_disasm/abstract_ssa.ml @@ -0,0 +1,153 @@ +open Bap.Std +open Core + +let stmt_def_vars = + object(self) + inherit [Exp.Set.t] Stmt.visitor + method enter_move def use accu = + if not Var.(is_virtual def) then + Set.add accu Exp.(Bil.Var def) + else accu + end + +let stmt_use_vars = + object(self) + inherit [Exp.Set.t] Stmt.visitor + method enter_move def use accu = + Set.add accu use + end + + +let stmt_def_freevars = + object(self) + inherit [Var.Set.t] Stmt.visitor + method enter_move def use accu = + if not Var.(is_virtual def) then + Set.add accu def + else accu + end + +let stmt_use_freevars = + object(self) + inherit [Var.Set.t] Stmt.visitor + method enter_move def use accu = + let free_vars = + Set.filter ~f:(fun v -> not Var.(is_virtual v)) (Exp.free_vars use) + in Set.union accu free_vars + end + + +let def_ssa bil = + stmt_def_vars#run bil Exp.Set.empty + +let use_ssa bil = + stmt_use_vars#run bil Exp.Set.empty + +let def_freevars bil = + stmt_def_freevars#run bil Var.Set.empty + +let use_freevars bil = + stmt_use_freevars#run bil Var.Set.empty + + +(* Abstract SSA *) +(*type t = + { + (* pointer expression *) + (* bil *) + (* ssa name *) + }*) + +(* currentDef is a mapping from each variable to it's defining *) +(* expression. When recording into this mapping, the right hand *) +(* expression of the IR is stored. *) + +(* filled - when local numbering for a block has finished. *) +(* sealed - when no further predecessors will be added to the *) +(* block. *) + +let absexp_of_bil bil = + object(self) + inherit [exp option] Stmt.finder + method! enter_jmp exp r = r + end +(* +let writeVariable variable block value = + currentDef[variable][block] <- value + +let readVariable variable block = + if Set.mem currentDef[variable] block then + currentDef[variable][block] + else readVariableRecursive varaible block + +let tryRemoveTrivialPhi phi = + let same = + List.fold phi.operands ~init:None ~f:(fun op curr -> + if op = curr or op = phi then + curr (* Unique value or self reference *) + else + Some(op) + ) in + if Option.is_some same then + (* The phi merges at least two values: not trivial *) + phi + else + (* If same is none (unreachable or in the start block, *) + (* then create an undefined value *) + let same = Option.value same ~default:(Operand.create ()) in + (* Remember all users except the phi itself *) + let users = phi.users.remove phi in + (* Reroute all uses of phi to same and remove phi *) + phi.replaceBy same; + (* Try to recursively remove all phi users, which might *) + (* have become trivial *) + Set.iter users ~f:(fun use -> + match use with + | Phi(use) -> + tryRemoveTrivialPhi use + | _ -> ()); + same + +let addPhiOperands variable phi = + (* If there is more than one, collect the definitions from all *) + (* the predecessors and construct a phi function joining them *) + (* into a single value. *) + List.iter phi.block.preds ~f:(fun pred -> + phi.appendOperand (readVariable variable pred)); + tryRemoveTrivialPhi phi + +(* if a block currently contains no definition for a variable, we *) +(* recursively look for a definition in its predecessors. *) +let rec readVariableRecursive variable block = + if not Set.(mem sealedBlocks block) then + let value = Phi.create block in + incompletePhis[block][variable] <- value; + writeVariable variable block value; + value + else if List.(length block.preds) = 1 then + (* If the block has a single predecessor (edge leading into it) *) + (* then recursively query it for a definition. *) + let value = readVariable(variable, List.(hd block.preds)) in + writeVariable variable block value; + value + else + (* Determine operands from predecessors *) + let value = Phi.create block in + writeVaraible variable block value; + let value = addPhiOperands variable value in + writeVariable variable block value; + value + + +let readVariable variable block = + if Map.mem currentDef[variable] block then + currentDef[variable][block] + else + readVariableRecursive variable block + +let sealBlock block = + Set.iter incompletePhis[block] ~f:(fun variable -> + addPhiOperands variable incompletePhis[block][variable] + ); + Set.add sealedBlocks block +*) diff --git a/lib/bap_superset_disasm/cmdoptions.ml b/lib/bap_superset_disasm/cmdoptions.ml new file mode 100644 index 000000000..9d847385d --- /dev/null +++ b/lib/bap_superset_disasm/cmdoptions.ml @@ -0,0 +1,158 @@ +open Core +open Bap.Std +open Bap_knowledge +open Bap_core_theory + +module Dis = Disasm_expert.Basic + +let tag_loop_contradictions = + Grammar.tag_loop_contradictions ?min_size:None +let tag_grammar = + Grammar.tag_by_traversal ?threshold:None +let list_analyses = [ + "Strongly Connected Component Data", tag_loop_contradictions; + "Grammar convergent", tag_grammar; + ] + +type t = { + disassembler : string; + ground_truth_bin : string option; + target : string; + invariants : string list; + analyses : string list; + converge : bool; + protect : bool; + save_dot : bool; + tp_threshold : float; + rounds : int; + heuristics : string list; +} [@@deriving sexp, fields, bin_io] +type opts = t [@@deriving sexp, bin_io] + +module Cache = struct + + let disasm_opts = + let package = "superset-disasm-cmdoptions" in + let opts_persistent = + Knowledge.Persistent.of_binable + (module struct type t = opts option [@@deriving bin_io] end) in + let attr ty persistent name desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent + ~public:true + ~desc in + let open Knowledge.Domain in + let disequal _ _ = false in + let opts = optional ~inspect:sexp_of_opts ~equal:disequal "opts ty" in + attr opts opts_persistent "disasm_opts" + "All command options given" + +end + +module type Provider = sig + val options : t +end + +module With_options(Conf : Provider) = struct + open Conf + open Or_error + open Format + + let () = + let open KB.Syntax in + KB.promise Cache.disasm_opts (fun o -> + KB.return (Some options) + ) + + let with_analyses superset analyses = + Trim.run @@ + List.fold analyses ~init:superset ~f:(fun superset analyze -> + analyze superset + ) + + let checkpoint ?addrs bin invariants = + let backend = options.disassembler in + let invariants = Invariants.tag_success ::invariants in + let f = Invariants.tag ~invariants in + Superset.superset_disasm_of_file ?addrs ~backend bin ~f + + let args_to_funcs args funcs = + let l = List.filter_map args + ~f:(fun arg -> + List.find funcs ~f:(fun (name,f) -> + String.equal arg name + ) + ) in + List.map l ~f:snd + + let invariants = + args_to_funcs options.invariants Invariants.default_tags + + let analyses = args_to_funcs options.analyses list_analyses + + let with_options () = + let open KB.Syntax in + Superset.Cache.sym_label >>= fun sym_label -> + KB.collect Superset.Cache.superset_graph sym_label + >>= fun graph -> + let superset = + match graph with + | None -> + let superset = checkpoint options.target invariants in + let () = Metrics.set_ground_truth superset in + let trim = Trim.run in + let superset = trim superset in + let superset = with_analyses superset analyses in + let superset = trim superset in + KB.promise Superset.Cache.superset_graph + (fun _ -> KB.return @@ Some Superset.ISG.(to_list superset)); + superset + | Some graph -> + let graph = Seq.of_list graph in + let graph = + Seq.concat @@ Seq.map graph ~f:(fun (s,d) -> + Seq.of_list [s;d] + ) in + checkpoint ~addrs:graph options.target [] in + let f superset = + Heuristics.with_featureset options.heuristics superset + ~init:(superset) + ~f:(fun fname feature superset -> + Trim.run @@ feature superset + ) in + let superset = Fixpoint.iterate options.rounds f superset in + let pnts_of_percent prcnt = + Int.of_float (1.0/.(1.0-.prcnt)) in + let threshold = (pnts_of_percent options.tp_threshold) in + Heuristics.with_featurepmap options.heuristics superset + ~f:(fun pmap featureset superset -> + let total_of_features l = + List.fold ~init:0 ~f:(fun x (y,_,_) -> x + y) l in + let feature_pmap = + Map.map pmap ~f:(total_of_features) in + let feature_pmap = + Map.filter feature_pmap ~f:(fun total -> + (total > threshold)) in + Report.collect_distributions superset threshold pmap; + let superset = + if options.converge then ( + let f superset = + let superset = + if options.protect then ( + Fixpoint.protect superset (fun superset -> + Fixpoint.converge superset options.heuristics feature_pmap + ) + ) else + Fixpoint.converge superset options.heuristics + feature_pmap in + Trim.run superset in + Fixpoint.iterate options.rounds f superset + ) else superset in + Metrics.compute_metrics superset; + ); + KB.return superset + + let main = with_options + +end diff --git a/lib/bap_superset_disasm/cmdoptions.mli b/lib/bap_superset_disasm/cmdoptions.mli new file mode 100644 index 000000000..e69de29bb diff --git a/lib/bap_superset_disasm/decision_trees.ml b/lib/bap_superset_disasm/decision_trees.ml new file mode 100644 index 000000000..8b7b4d1ec --- /dev/null +++ b/lib/bap_superset_disasm/decision_trees.ml @@ -0,0 +1,292 @@ +open Core +open Bap.Std +open Graphlib.Std + +module G = Graphlib.Make(Addr)(Unit) + + +(** The decision tree represents a set of potentially inter-dependent + decision trees and potential ramifications of selection at each + node. The objective is to present to the user a clean interface + by which to construct mutually compatible decisions, since it it + possible for a blithely written analysis to piece together many + decisions that are not fit with the whole. *) + +type decision_tree = { + tree : G.t; + starts : Addr.Hash_set.t; + } +type decision_forest = decision_tree list +type 'a possibility +type 'a choice +type 'a consequence +type tail + +let count tree = + List.length tree +let with_trees (tree : decision_forest) = + List.fold tree + +(** For any given entry, calculate the conflicts, and filter the set + down to lists of entries that conflict with one another. *) +let conflicts_of_entries superset entries = + let visited_entries = Addr.Hash_set.create () in + Hash_set.fold entries ~init:[] ~f: + (fun conflicted_entries entry -> + if not (Hash_set.mem visited_entries entry) then ( + Hash_set.add visited_entries entry; + let in_entry_conflicts = + Superset.Occlusion.conflicts_within_insn_at superset entry in + let conflicts = Addr.Hash_set.create () in + Hash_set.add conflicts entry; + Set.iter in_entry_conflicts + ~f:(fun conflict -> + (* A conflict that an entry may have may or may not *) + (* itself be an entry. *) + if Hash_set.mem entries conflict then ( + Hash_set.add visited_entries conflict; + Hash_set.add conflicts conflict; + ) + ); + if (Hash_set.length conflicts) > 1 then ( + conflicts :: conflicted_entries + ) else conflicted_entries + ) else conflicted_entries + ) + +(** Calculate the set potential points where occlusive instructions + could rejoin to a common target, such as cease when falling + through to the same instruction. Calculate the tail, or the join + target and the conflicts that led into that. *) +let tails_of_conflicts superset conflicts = + let possible_tails = Superset.mergers superset in + (* This tail is the particular instruction + that is the fall through target of several potential + competitors. We use this instruction against the + leaders map because those will be the ones that fall + through to the tail; the tail can then be associated with + those that lead into it. *) + let tails, _ = Set.fold ~init:(Addr.Map.empty, Addr.Set.empty) + ~f:(fun (tails, added_choices) possible_tail -> + (* For each edge from tail, lookup the respective vertex; if it *) + (* is in the conflicts set, then it gets added to a sheath *) + (* of choices. *) + let f sheath poss_conflict = + let not_added = not (Set.mem added_choices poss_conflict) in + let is_conflict = Set.mem conflicts poss_conflict in + let is_connected = Superset.ISG.check_connected + superset possible_tail poss_conflict in + if not_added && is_conflict && is_connected then + poss_conflict :: sheath + else sheath in + let sheath = List.fold_left + (Superset.ISG.ancestors superset possible_tail) ~init:[] ~f + in + match sheath with + | [] | _ :: []-> tails, added_choices + | _ -> + let added_choices = + Set.inter added_choices (Addr.Set.of_list sheath) in + (Addr.Map.set tails ~key:possible_tail ~data:sheath, added_choices) + ) possible_tails in + tails + +let add_edge dtr v1 v2 = + let g = dtr.tree in + let e = G.Edge.create v1 v2 () in + let g = G.Edge.insert e g in + { dtr with tree = g } + +let mem_edge dtr v1 v2 = + let g = dtr.tree in + let e = G.Edge.create v1 v2 () in + G.Edge.mem e g + +let new_dtree () = + let tree = Graphlib.create (module G) () in + let starts = Addr.Hash_set.create () in + { tree; starts; } + +module DecisionTree = struct + let count dtree = + (G.number_of_nodes dtree.tree) + (Hash_set.length dtree.starts) + let mem addr dtree = G.Node.mem addr dtree.tree + || Hash_set.mem dtree.starts addr +end + +(** Starting from each entry in the superset, identify the tails and + build a decision tree that allows to jump from conflict to + conflict and review the options. *) +let decision_tree_of_entries superset conflicted_entries entries tails = + let visited = Addr.Hash_set.create () in + let add_choices decision_tree current_vert = + let unvisited = + not (Hash_set.mem visited current_vert) in + if unvisited then + let possible_tail = current_vert in + match Addr.Map.find tails possible_tail with + | Some(sheath) -> + List.fold sheath ~init:decision_tree ~f:(fun decision_tree competitor -> + add_edge decision_tree possible_tail competitor + ); + | _ -> decision_tree + else decision_tree; + in + let link_start decision_tree entry = + Hash_set.add decision_tree.starts entry; + { decision_tree with starts = decision_tree.starts } + in + let f decision_tree entry = + let saved_vert = ref entry in + let link_choices decision_tree current_vert = + let decision_tree = add_choices decision_tree entry in + let contained = DecisionTree.mem current_vert decision_tree in + let is_new = Hash_set.mem visited current_vert in + let decision_tree = + if contained && is_new then ( + let decision_tree = + if not @@ mem_edge decision_tree !saved_vert + current_vert then ( + add_edge decision_tree !saved_vert + current_vert + ) else decision_tree in + saved_vert := current_vert; + decision_tree + ) else decision_tree in + decision_tree + in + Superset.ISG.dfs_fold superset ~visited decision_tree + ~post:(fun g v -> g) ~pre:link_choices entry + in + let conflicted_trees = + List.filter_map conflicted_entries ~f:(fun conflicted -> + if Hash_set.length conflicted > 0 then + let decision_tree = new_dtree () in + let f decision_tree entry = + if not (Hash_set.mem visited entry) then ( + let decision_tree = link_start decision_tree entry in + f decision_tree entry) else decision_tree + in + let decision_tree = + Hash_set.fold ~init:decision_tree conflicted ~f in + Some(decision_tree) + else None + ) in + Hash_set.fold entries ~init:conflicted_trees + ~f:(fun all_trees entry -> + if not (Hash_set.mem visited entry) then + let decision_tree = new_dtree () in + let decision_tree = f decision_tree entry in + if DecisionTree.count decision_tree > 0 then + decision_tree :: all_trees + else all_trees + else (all_trees) + ) + +(** Accepts a superset, and calculates the decision trees over groups + of instructions. The returned trees index from tails to the + options available. *) +let decision_trees_of_superset superset = + (* Here, for each vertex, look up the insn from the map and *) + (* identify conflicts. *) + let conflicts = Superset.Occlusion.find_all_conflicts superset in + (* entries variable: + We want to know the superset of all nodes that could be the + terminating point that would otherwise be the return instruction + of a function. *) + let entries = Superset.entries_of_isg superset in + (* + we need to keep track of the subset of potential choices + that fall in line with the normal control flow graph, and + leave the discovery of overlapping redirection to a + second pass, in order that when we do a map over all + instructions to check for conflicts, we know which are tails + in order to properly construct the sheath type. + *) + let tails = tails_of_conflicts superset conflicts in + (* It may be that some entries are accidental indirections that *) + (* happen to preside at the intended entry. These must map to to an *) + (* entirely distinct interpretation. *) + let conflicted_entries = conflicts_of_entries superset entries in + (* For each of the potentially conflicting entries, construct a *) + (* decision tree. *) + let decision_trees = decision_tree_of_entries + superset conflicted_entries entries tails in + decision_trees + +let insn_is_option superset addr = + let open Superset in + let len = Superset.Inspection.len_at superset addr in + let bound = Addr.(addr ++ len) in + let previous = Superset.ISG.descendants superset addr in + List.fold ~init:false previous ~f:(fun current descedant -> + if not current then + let further = Superset.ISG.ancestors superset descedant in + List.fold ~init:current further ~f:(fun current opt -> + if not current then + if Addr.(addr <= opt) && Addr.(opt < bound) then + true + else false + else current + ) + else current + ) + +(** For a given superset that contains groups of instruction lineages + as potential choices, calculate the result of picking a given + choice as a delta. *) +let calculate_deltas ?entries ?is_option superset = + let is_option = + Option.value is_option + ~default:(insn_is_option superset) in + let entries = Option.value entries + ~default:(Superset.entries_of_isg superset) in + let add_data_of_insn dataset at = + Superset.Occlusion.with_data_of_insn + superset at ~f:(Hash_set.add dataset) + in + let deltas = ref Addr.Map.empty in + let delta = ref None in + let make_deltas addr = + let insns, datas = + match !delta with + | Some (insns, datas) -> (insns, datas) + | None -> + let insns = Addr.Hash_set.create () in + let datas = Addr.Hash_set.create () in + delta := Some(insns, datas); + insns, datas in + if is_option addr then ( + deltas := Addr.Map.set !deltas addr (insns, datas); + delta := None + ) else ( + add_data_of_insn datas addr; + Hash_set.add insns addr; + ) + (* else if is in entries then store the delta in the deltas map *) + in + let visited = Addr.Hash_set.create () in + Hash_set.iter entries + ~f:(Traverse.with_ancestors_at + ~visited ~post:make_deltas ?pre:None superset); + !deltas + +module Speculate = struct + let weigh_possibilities _ _ = () + let make_choices x _ _ = x +end + +(** A delta from decision trees is constructed and passed to the + visitor functions during a visit. *) +let visit_with_deltas ?pre ?post ~is_option superset entries = + let pre = Option.value pre ~default:(fun _ _ -> ()) in + let post = Option.value post ~default:(fun _ _ -> ()) in + let deltas = ref (calculate_deltas + superset ~entries ~is_option) in + let pre addr = + pre !deltas addr in + let post addr = + post !deltas addr; + deltas := Map.remove !deltas addr + in + Traverse.visit ~pre ~post superset entries diff --git a/lib/bap_superset_disasm/dune b/lib/bap_superset_disasm/dune new file mode 100644 index 000000000..64b8899c9 --- /dev/null +++ b/lib/bap_superset_disasm/dune @@ -0,0 +1,19 @@ +(library + (name bap_superset_disasm) + (public_name bap-std.superset-disasm) + (wrapped false) + (preprocess (pps ppx_bap)) + (libraries + bap + bap-core-theory + bap-future + bap-knowledge + graphlib + ppx_inline_test + landmarks + zmq + gnuplot + ) +(modules_without_implementation plot_superset_cache) +) + diff --git a/lib/bap_superset_disasm/features.ml b/lib/bap_superset_disasm/features.ml new file mode 100644 index 000000000..563eed1c4 --- /dev/null +++ b/lib/bap_superset_disasm/features.ml @@ -0,0 +1,765 @@ +open Core +open Bap.Std + +module Dis = Disasm_expert + +let default_features = [ + "ImgEntry"; + (*"NoExit";*) + (*"LoopsWithBreak";*) + "BranchViolations"; + (*"LayerViolations";*) + "TrimLimitedClamped"; + "Callsites3"; + (*"TrimFixpointGrammar"; + "TrimFixpointTails";*) + (*"Clamped"; + "SCC"; + "LoopGrammar"; + "CallsiteLineage"; + "SSA";*) + (*"FreeVarSSA";*) + (*"Grammar";*) + (*"Constant";*) +] +let default_features = List.rev default_features + +let transform = Hash_set.fold ~init:Addr.Set.empty ~f:Set.add + +let clear_each superset visited = + Hash_set.iter visited ~f:(fun tp -> + Superset.Core.clear_bad superset tp + ) + +let get_non_fall_through_edges superset = + Superset.ISG.fold_edges superset + (fun child parent jmps -> + if Superset.is_fall_through superset parent child then + Map.set jmps ~key:child ~data:parent + else jmps + ) Addr.Map.empty + + +(** A callsite is a location which is shared as the target of a call + by several other locations in the binary. *) +let get_callsites ?(threshold=6) superset = + let callsites = Addr.Hash_set.create () in + Superset.ISG.iter_vertex superset + (fun v -> + let callers = Superset.ISG.ancestors superset v in + let num_callers = + List.fold callers ~init:0 ~f:(fun total caller -> + if not (Superset.is_fall_through superset caller v) then + total + 1 + else total) in + if num_callers > threshold then ( + Hash_set.add callsites v; + ) + ) ; + callsites + +(** Adds to the set visited the set of reachable descendents of a + callsite of a given sufficient threshold number of external callers *) +let tag_callsites visited ?callsites superset = + let callsites = Option.value callsites + ~default:(get_callsites ~threshold:6 superset) in + Hash_set.iter callsites ~f:(fun callsite -> + Traverse.with_descendents_at ~visited + ?post:None ?pre:None superset callsite; + ); + superset + +let find_free_insns superset = + let mem = Superset.Core.mem superset in + let all_conflicts = Addr.Hash_set.create () in + let to_clamp = + Superset.Core.fold superset ~init:(Addr.Set.empty) + ~f:(fun ~key ~data to_clamp -> + let (addr,(memory,_)) = key, data in + let len = Memory.length memory in + let conflicts = Superset.Occlusion.range_seq_of_conflicts + ~mem addr len in + let no_conflicts = Seq.is_empty conflicts in + Seq.iter conflicts ~f:(fun c -> + Hash_set.add all_conflicts c); + if no_conflicts && not Hash_set.(mem all_conflicts addr) then + Set.add to_clamp addr + else ( + to_clamp + ) + ) in + to_clamp +(*Hash_set.fold all_conflicts ~init:to_clamp ~f:Set.remove*) + +let restricted_clamp superset = + let entries = Superset.entries_of_isg superset in + let conflicts = Superset.Occlusion.find_all_conflicts superset in + let to_clamp = ref Addr.Set.empty in + Hash_set.iter entries ~f:(fun entry -> + let b = ref false in + let pre v = + if Addr.(v = entry) then + b := false + else if not (!b) then + if Set.mem conflicts v then + b := true + else to_clamp := Set.add (!to_clamp) v + in Traverse.with_ancestors_at ~post:(fun _ -> ()) ~pre superset entry; + ); + !to_clamp + +let extended_clamp superset = + let to_clamp = find_free_insns superset in + Set.fold to_clamp ~init:Addr.Set.empty ~f:(fun to_clamp clamp -> + let _, to_clamp = + Superset.ISG.dfs_fold superset + ~pre:(fun (struck,to_clamp) addr -> + if struck then (struck,to_clamp) else + let conflicts = + Superset.Occlusion.conflicts_within_insn_at + superset addr in + let no_conflicts = Set.length conflicts = 0 in + (*let conflicts = Superset.Occlusion.parent_conflict_at + insn_risg insn_map addr in + let no_conflicts = Set.length conflicts = 0 + && no_conflicts in*) + if no_conflicts then (struck, Set.(add to_clamp addr)) + else (true, to_clamp) + ) ~post:(fun x _ -> x) (false, to_clamp) clamp + in to_clamp + ) + +let extract_loop_addrs superset = + let loop_addrs = Superset.ISG.raw_loops superset in + List.fold_left ~init:Addr.Map.empty loop_addrs + ~f:(fun addrs loop -> + if List.length loop >= 2 then + Option.value ~default:addrs + Option.(map List.(hd loop) ~f:(fun addr -> + Map.set addrs ~key:addr ~data:loop)) + else addrs + ) + +let extract_filtered_loop_addrs superset = + let loop_addrs = extract_loop_addrs superset in + Map.filteri loop_addrs ~f:(fun ~key ~data -> + List.length data > 20) + +let extract_constants superset = + let imgmem = + Memmap.to_sequence @@ Superset.Inspection.get_memmap superset in + let addrs = + Seq.bind imgmem + ~f:(fun (segment,_) -> + let words_of_mem mem = + let rec yield_next addr = + let width = + Addr.bitwidth addr in + let s = Size.of_int_exn width in + let open Seq.Generator in + match Memory.view ~word_size:s ~from:addr mem with + | Ok next -> + yield next >>= fun () -> yield_next (Addr.succ addr) + | _ -> return () in + Sequence.Generator.run (yield_next Memory.(min_addr mem)) + in words_of_mem segment + ) in + Seq.fold ~init:Addr.Map.empty addrs + ~f:(fun constants m -> + let constant = Memory.(m ^ (min_addr m)) in + match constant with + | Ok constant -> + if Superset.Inspection.contains_addr superset constant + && Superset.Core.(mem superset constant) then + Map.set constants ~key:Memory.(min_addr m) ~data:constant + else constants + | _ -> constants + ) + +let stddev_of hs average pmap = + let deviation,deg_free = + Hash_set.fold ~init:(0.0,0) hs ~f:(fun (deviation,deg_free) addr -> + if Map.mem pmap addr then + let d = (Option.(value_exn Map.(find pmap addr)) -. average) in + let d = d *. d in + (deviation +. d, (deg_free+1)) + else (deviation, (deg_free)) + ) in + sqrt(deviation /. float_of_int (deg_free -1)) + +(* pre is called from descendant to ancestor order, so we want to + check for usage and put that into a map, and then for define on + post visitation, when coming back down from ancestors back to + descendants (as execution would move). *) +let pre_ssa superset lift factors var_use addr = + match Superset.Core.lookup superset addr with + | Some (mem, insn) -> ( + try + let bil = lift (mem, insn) in + Option.value_map ~default:() bil ~f:(fun (bil) -> + let use_vars = Abstract_ssa.use_ssa bil in + Set.iter use_vars ~f:(fun use_var -> + var_use := Map.set !var_use ~key:use_var ~data:addr + ) + ) + with _ -> () + ) + | None -> () + +let pre_freevarssa superset lift factors var_use addr = + match Superset.Core.lookup superset addr with + | Some (mem, insn) -> ( + try + let bil = lift (mem, insn) in + Option.value_map ~default:() bil ~f:(fun (bil) -> + let use_vars = Abstract_ssa.use_freevars bil in + Set.iter use_vars ~f:(fun use_var -> + var_use := Map.set !var_use ~key:use_var ~data:addr + ) + ) + with _ -> () + ) + | None -> () + +let post_ssa_with superset lift var_use addr f = + match Superset.Core.lookup superset addr with + | Some (mem, insn) -> ( + try + let bil = lift (mem, insn) in + Option.value_map ~default:() bil ~f:(fun (bil) -> + let use_vars = Abstract_ssa.use_ssa bil in + Set.iter use_vars ~f:(fun use_var -> + var_use := Map.remove !var_use use_var; + ); + let var_defs = Abstract_ssa.def_ssa bil in + Set.iter var_defs ~f:(fun var_def -> + match Map.find !var_use var_def with + | Some(waddr) -> + if not Addr.(waddr = addr) then ( + f waddr addr + ) + | None -> () + ); + Set.iter var_defs ~f:(fun write_reg -> + var_use := Map.remove !var_use write_reg + ) + ) + with _ -> () + ) + | None -> () + +let post_freevarssa_with superset lift var_use addr f = + match Superset.Core.lookup superset addr with + | Some (mem, insn) -> ( + try + let bil = lift (mem, insn) in + Option.value_map ~default:() bil ~f:(fun (bil) -> + let use_vars = Abstract_ssa.use_freevars bil in + let var_defs = Abstract_ssa.def_freevars bil in + Set.iter var_defs ~f:(fun var_def -> + match Map.find !var_use var_def with + | Some(waddr) -> + if not Set.(mem use_vars var_def) then ( + f waddr addr + ) + | None -> () + ); + Set.iter use_vars ~f:(fun use_var -> + var_use := Map.remove !var_use use_var; + ); + Set.iter var_defs ~f:(fun write_reg -> + var_use := Map.remove !var_use write_reg + ) + ) + with _ -> () + ) + | None -> () + +let extract_ssa_to_map superset = + let var_use = ref Exp.Map.empty in + let defuse_map = ref Addr.Map.empty in + let add_to_map def use = + defuse_map := Map.set !defuse_map ~key:def ~data:use in + let lift (mem, insn) = + Superset.Core.lift_insn superset ( (mem, insn)) in + let pre = pre_ssa superset lift () var_use in + let post addr = post_ssa_with superset lift var_use + addr add_to_map in + let entries = Superset.entries_of_isg superset in + Hash_set.iter entries ~f:(fun addr -> + Traverse.with_ancestors_at superset addr ~post ~pre; + var_use := Exp.Map.empty + ); + !defuse_map + +let extract_freevarssa_to_map superset = + let var_use = ref Var.Map.empty in + let defuse_map = ref Addr.Map.empty in + let add_to_map def use = + defuse_map := Map.set !defuse_map ~key:def ~data:use in + let lift (mem, insn) = + Superset.Core.lift_insn superset ((mem, insn)) in + let pre = pre_freevarssa superset lift () var_use in + let post addr = post_freevarssa_with superset lift var_use + addr add_to_map in + let entries = Superset.entries_of_isg superset in + Hash_set.iter entries ~f:(fun addr -> + Traverse.with_ancestors_at superset addr ~post ~pre; + var_use := Var.Map.empty + ); + !defuse_map + +let extract_cross_section_jmps superset = + let segments = Superset.Inspection.get_memmap superset in + let cross_section_edges = Superset.ISG.fold_edges superset + (fun src dst csedges -> + let collect_minaddrs addr = + let seg = Memmap.lookup segments addr in + Seq.fold seg ~init:Addr.Set.empty ~f:(fun s1 (mem,_) -> + Set.add s1 @@ Memory.min_addr mem + ) in + let s1 = collect_minaddrs src in + let s2 = collect_minaddrs dst in + if not (Set.(length @@ inter s1 s2) >= 1) then + let ft1 = Superset.is_fall_through superset src dst in + let ft2 = Superset.is_fall_through superset dst src in + if (ft1 || ft2) then ( + (*Superset_risg.G.remove_edge insn_risg src dst;*) + Map.set csedges ~key:src ~data:dst + ) else csedges + else csedges + ) Addr.Map.empty in + cross_section_edges + +let extract_trim_clamped superset = + let to_clamp = find_free_insns superset in + let visited = Addr.Hash_set.create () in + let datas = Addr.Hash_set.create () in + Set.iter to_clamp ~f:(fun c -> + if not Hash_set.(mem visited c) then + if Superset.Core.mem superset c then ( + Traverse.mark_descendent_bodies_at + ~visited ~datas superset c + ) + ); + Hash_set.iter datas ~f:(fun d -> + if Hash_set.(mem visited d) || Set.(mem to_clamp d) then + Superset.Core.clear_bad superset d + ); + clear_each superset visited; + superset + +let time ?(name="") f x = + let t = Stdlib.Sys.time() in + let fx = f x in + let s = sprintf "%s execution time: %fs\n" name (Stdlib.Sys.time() -. t) in + print_endline s; + fx + + + +let extract_trim_limited_clamped superset = + let visited = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:0 superset in + let f s = tag_callsites visited ~callsites s in + let superset = time ~name:"tagging callsites: " f superset in + let () = Superset.Core.clear_all_bad superset in + let superset = time ~name:"extract_trim_clamped " + extract_trim_clamped superset in + clear_each superset visited; + superset + +let fixpoint_descendants superset extractf depth = + let rec fix_descendants cur_features d = + if d >= depth then + cur_features + else + let visited = Addr.Hash_set.create () in + let subset_features = Addr.Hash_set.create () in + Hash_set.iter cur_features ~f:(fun cur -> + if not Hash_set.(mem visited cur) then + Traverse.with_descendents_at superset + ~pre:(fun v -> + if Hash_set.(mem cur_features v) + && not Addr.(cur = v) then + Hash_set.add subset_features v + ) ~visited cur + else Hash_set.add subset_features cur + ); + fix_descendants subset_features (d+1) + in + let cur_features = extractf superset in + fix_descendants cur_features 0 + +let fixpoint_map superset feature_pmap = + let visited = Addr.Hash_set.create () in + let entries = Superset.entries_of_isg superset in + Hash_set.fold ~init:feature_pmap entries ~f:(fun feature_pmap cur -> + if not Hash_set.(mem visited cur) then + let prev = ref [] in + let feature_pmap = ref feature_pmap in + Traverse.with_descendents_at ~pre:(fun v -> + match Map.find !feature_pmap v with + | None -> () + | Some(p) -> + prev := List.append p !prev; + feature_pmap := Map.set !feature_pmap ~key:v ~data:!prev; + ) ~visited superset cur; + !feature_pmap + else feature_pmap + ) + +let fixpoint_grammar superset depth = + let extractf superset = + Superset.get_branches superset in + fixpoint_descendants superset extractf depth + +let fixpoint_ssa superset depth = + let extractf superset = + let ssa_map = extract_ssa_to_map superset in + let ssa = Addr.Hash_set.create () in + List.iter Map.(data ssa_map) ~f:Hash_set.(add ssa); + ssa in + fixpoint_descendants superset extractf depth + +let fixpoint_freevarssa superset depth = + let extractf superset = + let freevars_map = extract_freevarssa_to_map superset in + let freevars = Addr.Hash_set.create () in + List.iter Map.(data freevars_map) ~f:Hash_set.(add freevars); + freevars in + fixpoint_descendants superset extractf depth + +let fixpoint_tails superset = + let extractf superset = + let conflicts = Superset.Occlusion.find_all_conflicts superset in + let tails_map = + Decision_trees.tails_of_conflicts superset conflicts in + let tails = Addr.Hash_set.create () in + List.iter Map.(keys tails_map) ~f:Hash_set.(add tails); + tails + in + fixpoint_descendants superset extractf 4 + +let allfeatures = + "RestrictedClamped" :: + "ExtendedClamped" :: + "ClassicGrammar" :: + "LinearGrammar" :: + "UnfilteredGrammar" :: + "FalseBranchMap" :: + "FilteredFalseBranchMap" :: + "UnfilteredSCC" :: + "FreeVarSSA" :: + "FixpointGrammar" :: + "FixpointSSA" :: + "FixpointFreevarSSA" :: + "FixpointTails" :: + default_features + +let get_branches superset = + let branches = Superset.get_branches superset in + transform branches + +let branch_map_of_branches superset branches = + let name = Superset.Inspection.filename superset in + let name = Option.value_exn name in + let true_positives = Metrics.true_positives superset name in + let branches = + Hash_set.fold true_positives ~init:branches ~f:Set.remove in + Set.fold branches ~init:Addr.Map.empty ~f:(fun fpbranchmap fpbranch -> + let target = + List.find_exn Superset.ISG.(descendants superset fpbranch) + ~f:Superset.(is_fall_through superset fpbranch) in + Map.set fpbranchmap ~key:fpbranch ~data:target + ) +let extract_fp_branches superset = + let branches = get_branches superset in + branch_map_of_branches superset branches +let extract_fp_branches superset = + let branches = get_branches superset in + branch_map_of_branches superset branches +let linear_grammar superset = + let entries = Superset.entries_of_isg superset in + transform Grammar.(linear_branch_sweep superset entries) +let classic_grammar superset = + transform Grammar.(identify_branches superset) +let extract_loops_to_set superset = + let loops = Superset.ISG.raw_loops superset in + let loops = List.filter loops ~f:(fun l -> List.length l >= 2) in + Grammar.addrs_of_loops loops + +let extract_filter_loops superset = + Grammar.addrs_of_filtered_loops superset + +let extract_loops_with_break superset = + let loop_addrs = extract_loop_addrs superset in + Map.fold ~init:Addr.Set.empty loop_addrs ~f:(fun ~key ~data loops -> + let loop = List.fold ~init:Addr.Set.empty data ~f:Set.add in + let has_break = List.exists data + ~f:(fun addr -> + let targets = Superset.ISG.descendants superset addr in + List.exists targets + ~f:(fun x -> not Set.(mem loop x)) + ) in + if has_break then Set.union loops loop else loops + ) + +let extract_constants_to_set superset = + let constants = extract_constants superset in + Map.fold constants ~init:Addr.Set.empty ~f:(fun ~key ~data consts -> + Set.add consts data + ) +let extract_exitless superset = + let returned = Addr.Hash_set.create () in + let entries = Superset.entries_of_isg superset in + Hash_set.iter entries ~f:(fun entry -> + Traverse.with_ancestors_at superset + ?post:None ~pre:(Hash_set.add returned) entry + ); + Superset.Core.fold superset ~f:(fun ~key ~data exitless -> + let v = key in + if not (Hash_set.mem returned v) + then Set.add exitless v else exitless + ) ~init:Addr.Set.empty + +let collect_descendants superset ?insn_isg ?visited ?datas targets = + let visited = Option.value visited ~default:(Addr.Hash_set.create ()) in + let datas = Option.value datas ~default:(Addr.Hash_set.create ()) in + Hash_set.iter targets ~f:(fun v -> + if not Hash_set.(mem visited v) then + Traverse.mark_descendent_bodies_at ~visited ~datas superset v + ) + +let extract_img_entry superset = + let e = Addr.Set.empty in + match Superset.Inspection.get_main_entry superset with + | Some mentry -> + let s = sprintf "entry: %s" + Addr.(to_string mentry) in + print_endline s; + Set.add e mentry + | None -> e + +let extract_trim_callsites superset = + let visited = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:2 superset in + let protection = get_callsites ~threshold:0 superset in + collect_descendants superset ~visited protection; + Superset.Core.clear_all_bad superset; + let superset = tag_callsites visited ~callsites superset in + clear_each superset visited; + superset +let extract_trim_loops_with_break superset = + (*let loops = extract_loops_with_break superset in*) + superset +let extract_trim_entry superset = + let imgentry = extract_img_entry superset in + Set.iter imgentry ~f:Traverse.(mark_descendent_bodies_at superset); + superset +let extract_trim_noexit superset = + let exitless = extract_exitless superset in + Set.iter exitless ~f:Superset.Core.(mark_bad superset); + superset +let extract_trim_fixpoint_grammar superset = + let gdesc = fixpoint_grammar superset 10 in + let visited = Addr.Hash_set.create () in + let datas = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:0 superset in + let superset = tag_callsites visited ~callsites superset in + Superset.Core.clear_all_bad superset; + collect_descendants ~visited superset gdesc; + Hash_set.iter datas ~f:(fun d -> + if Hash_set.(mem visited d) || Hash_set.(mem gdesc d) then + Superset.Core.clear_bad superset d + ); + clear_each superset visited; + clear_each superset gdesc; + superset +let extract_trim_fixpoint_ssa superset = + let gdesc = fixpoint_ssa superset 6 in + let visited = Addr.Hash_set.create () in + let datas = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:0 superset in + (*collect_descendants ~visited ~insn_isg superset callsites;*) + let superset = tag_callsites visited ~callsites superset in + Superset.Core.clear_all_bad superset; + collect_descendants ~visited superset gdesc; + Hash_set.iter datas ~f:(fun d -> + if Hash_set.(mem visited d) || Hash_set.(mem gdesc d) then + Superset.Core.clear_bad superset d + ); + clear_each superset visited; + superset +let extract_trim_fixpoint_freevarssa superset = + let gdesc = fixpoint_freevarssa superset 6 in + let visited = Addr.Hash_set.create () in + let datas = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:0 superset in + (*collect_descendants ~visited ~insn_isg superset callsites;*) + let superset = tag_callsites visited ~callsites superset in + Superset.Core.clear_all_bad superset; + collect_descendants ~visited superset gdesc; + Hash_set.iter datas ~f:(fun d -> + if Hash_set.(mem visited d) || Hash_set.(mem gdesc d) then + Superset.Core.clear_bad superset d + ); + clear_each superset visited; + superset +let extract_trim_fixpoint_tails superset = + let tdesc = fixpoint_tails superset in + let visited = Addr.Hash_set.create () in + let datas = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:0 superset in + let superset = tag_callsites visited ~callsites superset in + Superset.Core.clear_all_bad superset; + Hash_set.iter tdesc ~f:(fun v -> + if not Hash_set.(mem visited v) then + Traverse.mark_descendent_bodies_at ~visited ~datas superset v + ); + Hash_set.iter datas ~f:(fun d -> + if Hash_set.(mem visited d) || Hash_set.(mem tdesc d) then + Superset.Core.clear_bad superset d + ); + clear_each superset visited; + superset + +let discard_edges superset = + Superset.ISG.fold_edges superset + (fun child parent superset -> + if not Superset.(is_fall_through superset parent child) then ( + match Superset.Core.lookup superset parent with + | None -> superset + | Some (mem, insn) -> + match insn with + | Some(insn) -> + let insn = Insn.of_basic insn in + if Insn.(is Insn.call insn) then + Superset.ISG.unlink superset child parent + else superset + | None -> superset + ) else superset + ) superset + (*let edges = Superset.get_non_fall_through_edges superset in*) + +type extractor = (Superset.t -> Addr.Set.t) +type ('b) mapextractor = (Superset.t -> 'b Addr.Map.t) +type setfilter = (Superset.t -> Addr.Set.t -> Addr.Set.t) +type ('b) mapfilter = (Superset.t -> 'b Addr.Map.t -> 'b Addr.Map.t) +type setexfilt = extractor * setfilter +type ('a, 'b) mapexfilt = ('b) mapextractor * ('b) mapfilter +let unfiltered _ = Fn.id + +let _exfiltset = [ + ("FixpointGrammar", + ((fun x -> transform (fixpoint_grammar x 0)), unfiltered)); + ("FixpointTails", + ((fun x -> transform (fixpoint_tails x)), unfiltered)); + ("FixpointFreevarSSA", + ((fun x -> transform (fixpoint_freevarssa x 0)), unfiltered)); + ("LinearGrammar", (linear_grammar, unfiltered)); + ("UnfilteredGrammar", (get_branches, unfiltered)); + ("ClassicGrammar", (classic_grammar, unfiltered)); + ("Callsites3", + ((fun x -> transform (get_callsites + ~threshold:6 x)), unfiltered)); + ("Clamped", (find_free_insns, unfiltered)); + ("RestrictedClamped", (restricted_clamp, unfiltered)); + ("ExtendedClamped", (extended_clamp, unfiltered)); + ("UnfilteredSCC", (extract_loops_to_set,unfiltered)); + ("LoopsWithBreak", (extract_loops_with_break,unfiltered)); + ("SCC", (extract_filter_loops,unfiltered)); + ("NoExit", (extract_exitless, unfiltered)); + ("Constant", (extract_constants_to_set,unfiltered)); + ("ImgEntry", (extract_img_entry, unfiltered)); +] +let exfiltset :(setexfilt) String.Map.t + = List.fold ~init:String.Map.empty _exfiltset + ~f:(fun exfiltset (name, f) -> + String.Map.set exfiltset ~key:name ~data:f + ) + +let _exfiltmap = [ + ("SSA", (extract_ssa_to_map, unfiltered)); + ("FalseBranchMap", (extract_fp_branches, unfiltered)); + ("FreeVarSSA", (extract_freevarssa_to_map, unfiltered)); + ("SSA", (extract_ssa_to_map, unfiltered)); +] +let exfiltmap : ((unit, Addr.t) mapexfilt) String.Map.t + = List.fold ~init:String.Map.empty _exfiltmap + ~f:(fun exfiltmap (name, f) -> + String.Map.set exfiltmap ~key:name ~data:f + ) + +let featureflist = + [("Callsites3", extract_trim_callsites); + ("DiscardEdges", discard_edges); + ("LoopsWithBreak", extract_trim_loops_with_break); + ("ImgEntry",extract_trim_entry); + (*("SCC", extract_tag_loops)*) + ("NoExit", extract_trim_noexit); + ("TrimLimitedClamped" ,extract_trim_limited_clamped); + ("TrimFixpointGrammar", extract_trim_fixpoint_grammar); + ("TrimFixpointSSA", extract_trim_fixpoint_ssa); + ("TrimFixpointFreevarSSA", extract_trim_fixpoint_freevarssa); + ("TrimFixpointTails", extract_trim_fixpoint_tails); + ] +let featuremap = List.fold featureflist ~init:String.Map.empty + ~f:(fun featuremap (name, f) -> + Map.set featuremap ~key:name ~data:f + ) + +let apply_featureset featureset superset = + let superset = List.fold ~init:(superset) featureset ~f:(fun (superset) feature -> + match Map.(find featuremap feature) with + | None -> superset + | Some (f) -> + print_endline feature; + let superset = f superset in + let superset = Trim.run superset in + superset + ) in + superset + +let fdists = String.Map.empty +let fdists = String.Map.set fdists ~key:"FixpointGrammar" ~data:5 +let fdists = String.Map.set fdists ~key:"FixpointFreevarSSA" ~data:3 + +let make_featurepmap featureset superset = + List.fold ~f:(fun (feature_pmap) feature -> + let p = Map.find fdists feature in + let p = Option.value p ~default:2 in + match Map.(find exfiltset feature) with + | None -> feature_pmap + | Some (extract,filter) -> + print_endline feature; + let fset = extract superset in + Set.fold fset ~init:feature_pmap + ~f:(fun feature_pmap x -> + Map.update feature_pmap x ~f:(function + | Some l -> (p, x, feature) :: l + | None -> [(p, x, feature)] + ) + ) + ) ~init:Addr.Map.empty featureset + +let total_of_features l = + List.fold ~init:0 ~f:(fun x (y,_,_) -> x + y) l + +let apply_featurepmap featureset ?(threshold=50) superset = + let feature_pmap = make_featurepmap featureset superset in + let feature_pmap = fixpoint_map superset feature_pmap in + let feature_pmap = + Map.map feature_pmap ~f:(total_of_features) in + let feature_pmap = + Map.filter feature_pmap ~f:(fun total -> total > threshold) in + let visited = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:0 superset in + let superset = tag_callsites visited ~callsites superset in + Superset.Core.clear_all_bad superset; + List.iter Map.(keys feature_pmap) ~f:(fun addr -> + Traverse.mark_descendent_bodies_at superset ~visited addr + ); + clear_each superset visited; + superset +(*Trim.trim superset*) diff --git a/lib/bap_superset_disasm/features.mli b/lib/bap_superset_disasm/features.mli new file mode 100644 index 000000000..e69de29bb diff --git a/lib/bap_superset_disasm/fixpoint.ml b/lib/bap_superset_disasm/fixpoint.ml new file mode 100644 index 000000000..9d99e8867 --- /dev/null +++ b/lib/bap_superset_disasm/fixpoint.ml @@ -0,0 +1,28 @@ +open Core +open Bap.Std + +let iterate rounds f superset = + let (superset) = f superset in + let rec do_analysis round superset = + if round = rounds then superset else + let (superset) = f superset in + let superset = Trim.run superset in + do_analysis (round+1) superset in + do_analysis 1 superset + +let protect superset f = + let visited = Addr.Hash_set.create () in + let callsites = Heuristics.get_callsites ~threshold:0 superset in + let superset = Heuristics.tag_callsites visited ~callsites superset in + let superset = f superset in + Superset.Core.clear_each superset visited; + Trim.run superset + +let converge superset heuristics feature_pmap = + let superset = Trim.run superset in + let cache = Addr.Hash_set.create () in + List.iter Map.(keys feature_pmap) ~f:(fun addr -> + Traverse.mark_descendent_bodies_at superset ~visited:cache addr + ); + superset + diff --git a/lib/bap_superset_disasm/fixpoint.mli b/lib/bap_superset_disasm/fixpoint.mli new file mode 100644 index 000000000..a1abcca03 --- /dev/null +++ b/lib/bap_superset_disasm/fixpoint.mli @@ -0,0 +1,6 @@ +open Bap.Std + + +val iterate : int -> (Superset_impl.t -> Superset_impl.t) -> Superset_impl.t -> Superset_impl.t +val protect : Superset_impl.t -> (Superset_impl.t -> Superset_impl.t) -> Superset_impl.t +val converge : Superset_impl.t -> 'a -> 'b Addr.Map.t -> Superset_impl.t diff --git a/lib/bap_superset_disasm/grammar.ml b/lib/bap_superset_disasm/grammar.ml new file mode 100644 index 000000000..2d53f2c4c --- /dev/null +++ b/lib/bap_superset_disasm/grammar.ml @@ -0,0 +1,202 @@ +open Core +open Bap.Std + +(** In superset disassembly, branches can occur from within the + bodies of instructions originally intended by the + compiler. Therefore, identify branches tries to visit starting + from entries, and see what set of branches include some child + that is a descendent from which the traversal originated. *) +let identify_branches superset = + let deferred = ref Addr.Map.empty in + let entries = Superset.entries_of_isg superset in + (* need to create a sequence of non-fall through edges *) + let insns = Addr.Hash_set.create () in + let branches = Addr.Hash_set.create () in + let tag_branches addr = + if Superset.is_branch superset addr then + let inbound = Superset.ISG.descendants superset addr in + List.iter inbound ~f:(fun child -> + (* check for edges between instructions that are not + fall through, but for which *) + if Hash_set.mem insns child then + let ft = Superset.fall_through_of superset addr in + if not Addr.(ft = child) && + not Addr.(addr = child) then + deferred := Map.set !deferred ~key:ft ~data:(child, addr) + ); + in + let confirm_branches addr = + match Map.find !deferred addr with + | Some (child, branch) -> + if Hash_set.mem insns child then + Hash_set.add branches branch + | None -> () + in + let pre addr = + Hash_set.add insns addr; + tag_branches addr + in + let post addr = + Hash_set.remove insns addr in + Traverse.visit + ~pre ~post superset entries; + let pre addr = + Hash_set.add insns addr; + confirm_branches addr + in + Traverse.visit + ~pre ~post superset entries; + branches + +let increment_map_at m ?(x=1) addr = + m := Map.update !m addr + ~f:(fun hits -> Option.value_map hits ~default:1 + ~f:(fun hits -> hits +x)); + Option.value ~default:x Map.(find !m addr) + +(** This searches through the set of blocks starting from entries for + branches that got hit at least twice. The idea is to respect the + diamond structure of control flow, which is that both sides + around a conditional (diamond) must be constructed in the cfg + statically in order for the final target to be sound + w.r.t. assembler rules. Using this technique, can filter false + positive branches. *) +let linear_branch_sweep superset entries = + let jmp_hit_cnt = ref Addr.Map.empty in + let update_hit_count = increment_map_at jmp_hit_cnt in + let pre jmps targets addr = + if Set.mem targets addr then ( + ignore (update_hit_count addr); + ); + match Map.find jmps addr with + | Some(branch) -> + ignore (update_hit_count branch); + | None -> (); + in + let post _ _ _ = () in + let _ = Traverse.visit_by_block superset ~pre ~post entries in + let final_jmps = Addr.Hash_set.create () in + Map.iteri !jmp_hit_cnt ~f:(fun ~key ~data -> + let jmp_addr = key in + let cnt = data in + if cnt = 2 then + Hash_set.add final_jmps jmp_addr; + ); + final_jmps + +(** The objective here is to tag grammar structures while traversing + topologically in such a manner that we can converge the + probability of recognizing an intended sequence by the + compiler. After we've hit some recognition threshold, we begin + traversing forward from some activation point whereby we trim + occlusive instructions. To recognize grammars, we have several + means: one, loops are strongly connected components, and if + sequences must branch at some point only to reify at a common + point, expressing a path by which they can finally rejoin. *) +let tag_by_traversal ?(threshold=8) superset = + let visited = Addr.Hash_set.create () in + (*let callsites = Superset.get_callsites ~threshold:6 superset in + let superset = tag_callsites visited ~callsites superset in + let superset = Invariants.tag_layer_violations superset in + let superset = Invariants.tag_branch_violations superset in*) + let entries = Superset.entries_of_isg superset in + let branches = Superset.get_branches superset in + (*let branches = identify_branches superset in*) + (*let branches = linear_branch_sweep superset entries in*) + let cur_total = ref 0 in + let positives = ref [] in + let entry = ref None in + let tps = Addr.Hash_set.create () in + (* In the case that our current starting point, entry, is none, set *) + (* it to being the address of the lambda parameter, addr. Then, set *) + (* the current total number of recognized grammar items to zero, *) + (* as well as the positives since we're starting over *) + let pre addr = + if Option.is_none !entry then ( + entry := Some(addr); + cur_total := 0; + positives := []; + ); + if Hash_set.mem branches addr then ( + cur_total := !cur_total + 1; + positives := addr :: !positives; + if !cur_total >= threshold then ( + let open Option in + ignore (List.nth !positives threshold >>| + (fun convergent_point -> + Hash_set.add tps convergent_point)); + ) + ) in + let post addr = + entry := Option.value_map !entry ~default:!entry + ~f:(fun e -> if Addr.(e = addr) then None else Some(e)); + if Hash_set.mem branches addr then ( + cur_total := !cur_total - 1; + match !positives with + | _ :: remaining -> positives := remaining + | [] -> (); + ) in + Traverse.visit ~visited + ~pre ~post superset entries; + Hash_set.iter tps ~f:(fun tp -> + if not (Hash_set.mem visited tp) then ( + Traverse.with_descendents_at superset tp ~pre:(fun tp -> + let mark_bad addr = + if Superset.ISG.mem_vertex superset addr then ( + Superset.Core.mark_bad superset addr + ) in + Superset.Occlusion.with_data_of_insn superset tp ~f:mark_bad; + Hash_set.add visited tp; + ) ; + ) + ); + Hash_set.iter visited + ~f:(fun tp -> Superset.Core.clear_bad superset tp); + superset + + +let parents_of_insns superset component = + Set.fold component ~init:Addr.Set.empty ~f:(fun potential_parents addr -> + List.fold (Superset.ISG.ancestors superset addr) + ~init:potential_parents + ~f:(fun potential_parents ancestor -> + if not Set.(mem component ancestor) then + Set.add potential_parents ancestor + else potential_parents + ) + ) + +let addrs_of_loops loops = + List.fold_left loops ~init:Addr.Set.empty + ~f:(fun keep loop -> + Addr.Set.(union keep (of_list loop)) + ) + +let filter_loops ?(min_size=20) loops = + let loops = + List.filter loops ~f:(fun l -> List.length l > min_size) in + addrs_of_loops loops + +let addrs_of_filtered_loops ?(min_size=20) superset = + filter_loops ~min_size @@ Superset.ISG.raw_loops superset + +(** In the body of a loop, instructions fall through eventually to + themselves, which amounts to effectively a trigger of an + invariant. But the level at which invariants operate is too fine + grained to see the consequence propagated from conflicts that are + potentially in loops that are many instructions long. This + function cleanses the bodies of instructions that occur in loops + of a minimum size. *) +let tag_loop_contradictions ?(min_size=20) superset = + let keep = addrs_of_filtered_loops ~min_size superset in + (* Here we have to be careful; we only want to find instructions + that occur within a loop that produce a self-contradiction *) + let parents = parents_of_insns superset keep in + let to_remove = + Superset.Occlusion.conflicts_within_insns superset keep in + let to_remove = Set.inter to_remove parents in + let to_remove = Set.diff to_remove keep in + Set.iter to_remove ~f:(Superset.Core.mark_bad superset); + superset + +let default_tags = [tag_loop_contradictions] diff --git a/lib/bap_superset_disasm/grammar.mli b/lib/bap_superset_disasm/grammar.mli new file mode 100644 index 000000000..3a066fea2 --- /dev/null +++ b/lib/bap_superset_disasm/grammar.mli @@ -0,0 +1,8 @@ +open Bap.Std + +val tag_loop_contradictions : ?min_size:int -> Superset_impl.t -> Superset_impl.t +val tag_by_traversal : ?threshold:int -> Superset_impl.t -> Superset_impl.t +val linear_branch_sweep : Superset_impl.t -> Addr.Hash_set.t -> Addr.Hash_set.t +val identify_branches : Superset_impl.t -> Addr.Hash_set.t +val addrs_of_loops : addr list list -> Addr.Set.t +val addrs_of_filtered_loops : ?min_size:int -> Superset_impl.t -> Addr.Set.t diff --git a/lib/bap_superset_disasm/heuristics.ml b/lib/bap_superset_disasm/heuristics.ml new file mode 100644 index 000000000..079eb2ef7 --- /dev/null +++ b/lib/bap_superset_disasm/heuristics.ml @@ -0,0 +1,471 @@ +open Core +open Bap.Std + +module Dis = Disasm_expert + +module type Heurism = sig + type t + val name : string + val impl : Superset.t -> t +end + +module HeurismSet(H : Heurism) = struct + open Bap_knowledge + open Bap_core_theory + + module Cache = struct + open H + let package = "superset-heuristics" + let addrs_t = + Knowledge.Domain.optional + ~inspect:Addr.Set.sexp_of_t ~equal:Addr.Set.equal "addr.set" + + let addrs_persistent = + Knowledge.Persistent.of_binable + (module struct type t = Addr.Set.t option [@@deriving bin_io] end) + + let attr ty persistent desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent ~public:true ~desc + let locations = + attr addrs_t addrs_persistent + ("addresses of all sites of " ^ name ^ " heuristic") + end +end + +let defaults = [ + "ImgEntry"; + (*"NoExit";*) + (*"LoopsWithBreak";*) + "BranchViolations"; + (*"LayerViolations";*) + "TrimLimitedClamped"; + "Callsites3"; + (*"TrimFixpointGrammar"; + "TrimFixpointTails";*) + (*"Clamped"; + "SCC"; + "LoopGrammar"; + "CallsiteLineage"; + "SSA";*) + (*"FreeVarSSA";*) + (*"Grammar";*) + (*"Constant";*) +] +let defaults = List.rev defaults + +let transform = Hash_set.fold ~init:Addr.Set.empty ~f:Set.add + +(** A callsite is a location which is shared as the target of a call + by several other locations in the binary. *) +let get_callsites ?(threshold=6) superset = + let callsites = Addr.Hash_set.create () in + Superset.ISG.iter_vertex superset + (fun v -> + let callers = Superset.ISG.ancestors superset v in + let num_callers = + List.fold callers ~init:0 ~f:(fun total caller -> + if not (Superset.is_fall_through superset caller v) then + total + 1 + else total) in + if num_callers > threshold then ( + Hash_set.add callsites v; + ) + ) ; + callsites + +(** Adds to the set visited the set of reachable descendents of a + callsite of a given sufficient threshold number of external + callers *) +let tag_callsites visited ?callsites superset = + let callsites = Option.value callsites + ~default:(get_callsites ~threshold:6 superset) in + Hash_set.iter callsites ~f:(fun callsite -> + Traverse.with_descendents_at ~visited + ?post:None ?pre:None superset callsite; + ); + superset + +let find_free_insns superset = + let mem = Superset.Core.mem superset in + let all_conflicts = Addr.Hash_set.create () in + let to_clamp = + Superset.Core.fold superset ~init:([]) + ~f:(fun ~key ~data to_clamp -> + let (addr,(memory,_)) = key, data in + let len = Memory.length memory in + let conflicts = Superset.Occlusion.range_seq_of_conflicts + ~mem addr len in + let no_conflicts = Seq.is_empty conflicts in + Seq.iter conflicts ~f:(fun c -> Hash_set.add all_conflicts c); + if no_conflicts && not Hash_set.(mem all_conflicts addr) then + addr :: to_clamp + else ( + to_clamp + ) + ) in + to_clamp + +let restricted_clamp superset = + let entries = Superset.entries_of_isg superset in + let conflicts = Superset.Occlusion.find_all_conflicts superset in + let to_clamp = ref Addr.Set.empty in + Hash_set.iter entries ~f:(fun entry -> + let b = ref false in + let pre v = + if Addr.(v = entry) then + b := false + else if not (!b) then + if Set.mem conflicts v then + b := true + else to_clamp := Set.add (!to_clamp) v + in Traverse.with_ancestors_at ~post:(fun _ -> ()) ~pre superset entry; + ); + !to_clamp + +let extended_clamp superset = + let to_clamp = find_free_insns superset in + List.fold to_clamp ~init:Addr.Set.empty ~f:(fun to_clamp clamp -> + let _, to_clamp = + Superset.ISG.dfs_fold superset + ~pre:(fun (struck,to_clamp) addr -> + if struck then (struck,to_clamp) else + let conflicts = + Superset.Occlusion.conflicts_within_insn_at + superset addr in + let no_conflicts = Set.length conflicts = 0 in + (*let conflicts = Superset.Occlusion.parent_conflict_at + insn_risg insn_map addr in + let no_conflicts = Set.length conflicts = 0 + && no_conflicts in*) + if no_conflicts then (struck, Set.(add to_clamp addr)) + else (true, to_clamp) + ) ~post:(fun x _ -> x) (false, to_clamp) clamp + in to_clamp + ) + +let extract_loop_addrs superset = + let loop_addrs = Superset.ISG.raw_loops superset in + List.fold_left ~init:Addr.Map.empty loop_addrs + ~f:(fun addrs loop -> + if List.length loop >= 2 then + Option.value ~default:addrs + Option.(map List.(hd loop) ~f:(fun addr -> + Map.set addrs ~key:addr ~data:loop)) + else addrs + ) + +let extract_filtered_loop_addrs superset = + let loop_addrs = extract_loop_addrs superset in + Map.filteri loop_addrs ~f:(fun ~key ~data -> + List.length data > 20) + +let extract_constants superset = + let imgmem = + Memmap.to_sequence @@ Superset.Inspection.get_memmap superset in + let addrs = + Seq.bind imgmem + ~f:(fun (segment,_) -> + let words_of_mem mem = + let rec yield_next addr = + let width = Addr.bitwidth addr in + let s = Size.of_int_exn width in + let open Seq.Generator in + match Memory.view ~word_size:s ~from:addr mem with + | Ok next -> + yield next >>= fun () -> yield_next (Addr.succ addr) + | _ -> return () in + Sequence.Generator.run (yield_next Memory.(min_addr mem)) + in words_of_mem segment + ) in + Seq.fold ~init:Addr.Map.empty addrs + ~f:(fun constants m -> + let constant = Memory.(m ^ (min_addr m)) in + match constant with + | Ok constant -> + if Superset.Inspection.contains_addr superset constant + && Superset.Core.(mem superset constant) then + Map.set constants ~key:Memory.(min_addr m) ~data:constant + else constants + | _ -> constants + ) + +let extract_cross_section_jmps superset = + let segments = Superset.Inspection.get_memmap superset in + let cross_section_edges = Superset.ISG.fold_edges superset + (fun src dst csedges -> + let collect_minaddrs addr = + let seg = Memmap.lookup segments addr in + Seq.fold seg ~init:Addr.Set.empty ~f:(fun s1 (mem,_) -> + Set.add s1 @@ Memory.min_addr mem + ) in + let s1 = collect_minaddrs src in + let s2 = collect_minaddrs dst in + if not (Set.(length @@ inter s1 s2) >= 1) then + let ft1 = Superset.is_fall_through superset src dst in + let ft2 = Superset.is_fall_through superset dst src in + if (ft1 || ft2) then ( + (*Superset_risg.G.remove_edge insn_risg src dst;*) + Map.set csedges ~key:src ~data:dst + ) else csedges + else csedges + ) Addr.Map.empty in + cross_section_edges + + +let extract_trim_clamped superset = + let to_clamp = find_free_insns superset in + let visited = Addr.Hash_set.create () in + let datas = Addr.Hash_set.create () in + List.iter to_clamp ~f:(fun c -> + if not Hash_set.(mem visited c) then + if Superset.Core.mem superset c then ( + Traverse.mark_descendent_bodies_at + ~visited ~datas superset c + ) + ); + Superset.Core.clear_each superset visited; + List.iter to_clamp ~f:(Superset.Core.clear_bad superset); + superset + +let extract_trim_limited_clamped superset = + let protection = Addr.Hash_set.create () in + let superset = + if Hash_set.length protection = 0 then ( + let callsites = get_callsites ~threshold:0 superset in + tag_callsites protection ~callsites superset + ) else superset in + Superset.Core.clear_all_bad superset; + let superset = extract_trim_clamped superset in + Superset.Core.clear_each superset protection; superset + +let fixpoint_descendants superset extractf depth = + let rec fix_descendants cur_features d = + if d >= depth then + cur_features + else + let visited = Addr.Hash_set.create () in + let subset_features = Addr.Hash_set.create () in + Hash_set.iter cur_features ~f:(fun cur -> + if not Hash_set.(mem visited cur) then + Traverse.with_descendents_at superset + ~pre:(fun v -> + if Hash_set.(mem cur_features v) + && not Addr.(cur = v) then + Hash_set.add subset_features v + ) ~visited cur + else Hash_set.add subset_features cur + ); + fix_descendants subset_features (d+1) + in + let cur_features = extractf superset in + fix_descendants cur_features 0 + +let fixpoint_map superset feature_pmap = + let visited = Addr.Hash_set.create () in + let entries = Superset.frond_of_isg superset in + Hash_set.fold ~init:feature_pmap entries ~f:(fun feature_pmap cur -> + if not Hash_set.(mem visited cur) then + let prev = ref [] in + let feature_pmap = ref feature_pmap in + Traverse.with_descendents_at ~pre:(fun v -> + match Map.find !feature_pmap v with + | None -> () + | Some(p) -> + prev := List.append p !prev; + feature_pmap := Map.set !feature_pmap ~key:v ~data:!prev; + ) ~visited superset cur; + !feature_pmap + else feature_pmap + ) + +let fixpoint_grammar superset depth = + let extractf superset = + Superset.get_branches superset in + fixpoint_descendants superset extractf depth + +(* TODO all features is not all features *) +let allfeatures = + "RestrictedClamped" :: + "ExtendedClamped" :: + "ClassicGrammar" :: + "LinearGrammar" :: + "UnfilteredGrammar" :: + "FalseBranchMap" :: + "FilteredFalseBranchMap" :: + "UnfilteredSCC" :: + "FreeVarSSA" :: + "FixpointGrammar" :: + "FixpointSSA" :: + "FixpointFreevarSSA" :: + "FixpointTails" :: + defaults + +let get_branches superset = + let branches = Superset.get_branches superset in + transform branches + +let linear_grammar superset = + let entries = Superset.entries_of_isg superset in + transform Grammar.(linear_branch_sweep superset entries) + +let classic_grammar superset = + transform Grammar.(identify_branches superset) + +let extract_loops_to_set superset = + let loops = Superset.ISG.raw_loops superset in + let loops = List.filter loops ~f:(fun l -> List.length l >= 2) in + Grammar.addrs_of_loops loops + +let extract_filter_loops superset = + Grammar.addrs_of_filtered_loops superset + +let extract_loops_with_break superset = + let loop_addrs = extract_loop_addrs superset in + Map.fold ~init:Addr.Set.empty loop_addrs ~f:(fun ~key ~data loops -> + let loop = List.fold ~init:Addr.Set.empty data ~f:Set.add in + let has_break = List.exists data + ~f:(fun addr -> + let targets = Superset.ISG.descendants superset addr in + List.exists targets + ~f:(fun x -> not Set.(mem loop x)) + ) in + if has_break then Set.union loops loop else loops + ) + +let extract_exitless superset = + let returned = Addr.Hash_set.create () in + let entries = Superset.entries_of_isg superset in + Hash_set.iter entries ~f:(fun entry -> + Traverse.with_ancestors_at superset + ?post:None ~pre:(Hash_set.add returned) entry + ); + Superset.Core.fold superset ~f:(fun ~key ~data exitless -> + let v = key in + if not (Hash_set.mem returned v) + then Set.add exitless v else exitless + ) ~init:Addr.Set.empty + +let extract_constants_to_set superset = + let constants = extract_constants superset in + Map.fold constants ~init:Addr.Set.empty ~f:(fun ~key ~data consts -> + Set.add consts data + ) + +let collect_descendants superset ?visited ?datas targets = + let visited = Option.value visited ~default:(Addr.Hash_set.create ()) in + let datas = Option.value datas ~default:(Addr.Hash_set.create ()) in + Hash_set.iter targets ~f:(fun v -> + if not Hash_set.(mem visited v) then + Traverse.mark_descendent_bodies_at ~visited ~datas superset v + ) + +let extract_img_entry superset = + let e = Addr.Set.empty in + match Superset.Inspection.get_main_entry superset with + | Some mentry -> Set.add e mentry + | None -> e + +let extract_trim_callsites superset = + let visited = Addr.Hash_set.create () in + let callsites = get_callsites ~threshold:2 superset in + let protection = get_callsites ~threshold:0 superset in + collect_descendants superset ~visited protection; + Superset.Core.clear_all_bad superset; + let superset = tag_callsites visited ~callsites superset in + Superset.Core.clear_each superset visited; + superset + +let extract_trim_entry superset = + let imgentry = extract_img_entry superset in + Set.iter imgentry ~f:Traverse.(mark_descendent_bodies_at superset); + superset + +let extract_trim_noexit superset = + let exitless = extract_exitless superset in + Set.iter exitless ~f:Superset.Core.(mark_bad superset); + superset + +type extractor = (Superset.t -> Addr.Set.t) +type ('b) mapextractor = (Superset.t -> 'b Addr.Map.t) +type setfilter = (Superset.t -> Addr.Set.t -> Addr.Set.t) +type ('b) mapfilter = (Superset.t -> 'b Addr.Map.t -> 'b Addr.Map.t) +type setexfilt = extractor * setfilter +type ('a, 'b) mapexfilt = ('b) mapextractor * ('b) mapfilter +let unfiltered _ = Fn.id + +let _exfiltset = [ + ("FixpointGrammar", + ((fun x -> transform (fixpoint_grammar x 0)), unfiltered)); + ("Liveness", (Liveness.compute_liveness,unfiltered)); + ("LinearGrammar", (linear_grammar, unfiltered)); + ("UnfilteredGrammar", (get_branches, unfiltered)); + ("ClassicGrammar", (classic_grammar, unfiltered)); + ("Callsites3", + ((fun x -> transform (get_callsites + ~threshold:6 x)), unfiltered)); + ("Clamped", + ((fun s -> Addr.Set.of_list @@ find_free_insns s), unfiltered)); + ("RestrictedClamped", (restricted_clamp, unfiltered)); + ("ExtendedClamped", (extended_clamp, unfiltered)); + ("UnfilteredSCC", (extract_loops_to_set,unfiltered)); + ("LoopsWithBreak", (extract_loops_with_break,unfiltered)); + ("SCC", (extract_filter_loops,unfiltered)); + ("NoExit", (extract_exitless, unfiltered)); + ("Constant", (extract_constants_to_set,unfiltered)); + ("ImgEntry", (extract_img_entry, unfiltered)); +] +let exfiltset :(setexfilt) String.Map.t + = List.fold ~init:String.Map.empty _exfiltset + ~f:(fun exfiltset (name, f) -> + String.Map.set exfiltset ~key:name ~data:f + ) + +let featureflist = + [("Callsites3", extract_trim_callsites); + ("ImgEntry",extract_trim_entry); + (*("SCC", extract_tag_loops)*) + ("NoExit", extract_trim_noexit); + ("TrimLimitedClamped" ,extract_trim_limited_clamped); + ] +let featuremap = List.fold featureflist ~init:String.Map.empty + ~f:(fun featuremap (name, f) -> + Map.set featuremap ~key:name ~data:f + ) + +let with_featureset ~f ~init featureset superset = + let superset = List.fold ~init featureset ~f:(fun accu fname -> + match Map.(find featuremap fname) with + | None -> accu + | Some (feature) -> + f fname feature accu + ) in + superset + +let fdists = String.Map.empty +let fdists = String.Map.set fdists ~key:"FixpointGrammar" ~data:1 +let fdists = String.Map.set fdists ~key:"Liveness" ~data:1 + +let make_featurepmap featureset superset = + List.fold ~f:(fun (feature_pmap) feature -> + let p = Map.find fdists feature in + let p = Option.value p ~default:2 in + match Map.(find exfiltset feature) with + | None -> feature_pmap + | Some (extract,filter) -> + let fset = extract superset in + Set.fold fset ~init:feature_pmap + ~f:(fun feature_pmap x -> + Map.update feature_pmap x ~f:(function + | Some l -> (p, x, feature) :: l + | None -> [(p, x, feature)] + ) + ) + ) ~init:Addr.Map.empty featureset + +let with_featurepmap featureset superset ~f = + let feature_pmap = make_featurepmap featureset superset in + let feature_pmap = fixpoint_map superset feature_pmap in + f feature_pmap featureset superset + diff --git a/lib/bap_superset_disasm/heuristics.mli b/lib/bap_superset_disasm/heuristics.mli new file mode 100644 index 000000000..07d41544f --- /dev/null +++ b/lib/bap_superset_disasm/heuristics.mli @@ -0,0 +1,6 @@ +open Bap.Std + +val get_callsites : ?threshold:(int) -> Superset_impl.t -> Addr.Hash_set.t +val tag_callsites : Addr.Hash_set.t -> ?callsites:Addr.Hash_set.t -> Superset_impl.t -> Superset_impl.t +val with_featureset : f:(string -> (Superset_impl.t -> Superset_impl.t) -> 'a -> 'a) -> init:'a -> string list -> 'b -> 'a +val with_featurepmap : string list -> Superset_impl.t -> f:((int * word * string) list Addr.Map.t -> string list -> Superset_impl.t -> unit) -> unit diff --git a/lib/bap_superset_disasm/invariants.ml b/lib/bap_superset_disasm/invariants.ml new file mode 100644 index 000000000..4f9a91db1 --- /dev/null +++ b/lib/bap_superset_disasm/invariants.ml @@ -0,0 +1,103 @@ +open Bap.Std +open Core + +module Dis = Disasm_expert.Basic + +(** Build the static successors of the current instruction, and tag + the superset with the function f in a micro-context. *) +let tag_with ~f (mem, insn) superset = + let targets = Superset.Inspection.static_successors superset mem insn in + f superset mem insn targets + +(** Looks for control flow edges to addresses that are not statically + known. *) +let tag_target_not_in_mem superset mem insn targets = + List.iter targets + ~f:(fun (target,_) -> + match target with + | Some(target) -> + if not (Superset.Inspection.contains_addr superset target) then + Superset.Core.mark_bad superset target + | None -> () + ); + superset + +(** If a jmp to NULL occurs, then this is triggered. *) +let tag_target_is_bad superset mem insn targets = + let width = Addr.bitwidth @@ Memory.min_addr mem in + let z = Addr.zero width in + List.iter targets + ~f:(fun (target,_) -> + match target with + | Some(target) -> + if Addr.(target = z) then + Superset.Core.mark_bad superset target + | None -> () + ); + superset + +(** If a jmp specifies the body of the instruction itself, this is + triggered. *) +let tag_target_in_body superset mem insn targets = + let src = Memory.min_addr mem in + List.iter targets + ~f:(fun (target,_) -> + match target with + | Some(target) -> + if (Memory.contains mem target) && + not Addr.(src = target) then + Superset.Core.mark_bad superset src + | None -> () + ); + superset + +(** Applies the tag together with the visitor and smaller functions + under the hood. *) + +(** If the instruction could not be disassembled or lifted, then tag + it. *) +let tag_non_insn superset mem insn targets = + let src = Memory.min_addr mem in + if Option.is_none insn then ( + Superset.Core.mark_bad superset src + ); + superset + +(** Used for the maintenance and construction of the superset. *) +let tag_success superset mem insn targets = + let src = Memory.min_addr mem in + List.fold targets ~init:superset ~f:(fun superset (target,_) -> + match target with + | Some (target) -> + Superset.ISG.link superset target src + | None -> superset) + +let default_tags = ["Tag non insn", tag_non_insn; + "Tag target not in mem", tag_target_not_in_mem; + "Tag target is bad", tag_target_is_bad; + "Tag target in body", tag_target_in_body; + (*tag_success;*)] + +let default_funcs = List.map default_tags ~f:snd + +(** Tag an individual instruction of a superset with a list of + invariants. *) +let tag ?invariants = + let invariants = Option.value invariants ~default:default_funcs in + let f superset mem insn targets = + List.fold_left invariants ~init:superset ~f:(fun superset f -> + (f superset mem insn targets)) in + tag_with ~f + + +(** Tag with a list of invariants over an entire superset. *) +let tag_superset ?invariants superset = + let invariants = Option.value invariants ~default:default_funcs in + let f superset mem insn targets = + List.fold ~init:superset invariants + ~f:(fun superset invariant -> + invariant superset mem insn targets) in + Superset.Core.fold ~init:superset superset ~f:(fun ~key ~data superset -> + let mem, insn = data in + tag_with ~f (mem, insn) superset + ) diff --git a/lib/bap_superset_disasm/invariants.mli b/lib/bap_superset_disasm/invariants.mli new file mode 100644 index 000000000..85a7eea13 --- /dev/null +++ b/lib/bap_superset_disasm/invariants.mli @@ -0,0 +1,12 @@ +open Bap.Std + +module Dis = Disasm_expert.Basic + +val tag_success : Superset_impl.t -> mem -> Dis.full_insn option -> Brancher.dests -> Superset_impl.t + +val tag : ?invariants:((Superset_impl.t -> mem -> Dis.full_insn option -> Brancher.dests -> Superset_impl.t) list) -> + Bap.Std.mem * Superset.Dis.full_insn option -> + Superset_impl.t -> Superset_impl.t + +val default_tags : (string * (Superset_impl.t -> mem -> Dis.full_insn option -> Brancher.dests -> Superset_impl.t)) list + diff --git a/lib/bap_superset_disasm/liveness.ml b/lib/bap_superset_disasm/liveness.ml new file mode 100644 index 000000000..4c6a1d803 --- /dev/null +++ b/lib/bap_superset_disasm/liveness.ml @@ -0,0 +1,56 @@ +open Bap.Std +open Core +open Graphlib.Std + +let stmt_def_freevars = + object(self) + inherit [Var.Set.t] Stmt.visitor + method! enter_move def use accu = + Set.add accu def + end + +type rev_ssa = { + defs : Var.Set.t; + uses : Var.Set.t; + } + +let transitions superset = + Superset.ISG.fold_vertex superset (fun addr fs -> + match Superset.Core.lift_at superset addr with + | Some bil -> + Addr.Map.add_exn fs ~key:addr ~data:{ + defs = stmt_def_freevars#run bil Var.Set.empty; + uses = Bil.free_vars bil; + } + | None -> fs + ) Addr.Map.empty + +let (++) = Set.union and (--) = Set.diff + +let compute_liveness superset = + let start = Addr.of_int ~width:1 0 in + let _exit = Addr.of_int ~width:1 1 in + let entries = Superset.entries_of_isg superset in + let superset = Hash_set.fold ~init:superset entries + ~f:(fun s e -> Superset.ISG.link s _exit e) in + let frond = Superset.frond_of_isg superset in + let superset = Hash_set.fold frond ~init:superset + ~f:(fun s e -> Superset.ISG.link s e start) in + let init = Solution.create Addr.Map.empty Var.Set.empty in + let tran = transitions superset in + let soln = Superset.ISG.fixpoint superset ~init ~start ~rev:false + ~merge:Var.Set.union ~equal:Var.Set.equal ?step:None ?steps:None + ~f:(fun n vars -> + if Addr.equal n _exit || Addr.equal n start then vars + else + match Map.find tran n with + | Some {defs; uses} -> + vars -- defs ++ uses + | None -> vars + ) in + let superset = Superset.Core.remove superset _exit in + let _ = Superset.Core.remove superset start in + let liveness_pairs = Solution.enum soln in + Seq.fold liveness_pairs ~init:Addr.Set.empty ~f:(fun s (addr,_) -> + Addr.Set.add s addr) + diff --git a/lib/bap_superset_disasm/liveness.mli b/lib/bap_superset_disasm/liveness.mli new file mode 100644 index 000000000..c648803b7 --- /dev/null +++ b/lib/bap_superset_disasm/liveness.mli @@ -0,0 +1,3 @@ +open Bap.Std + +val compute_liveness : Superset_impl.t -> Addr.Set.t diff --git a/lib/bap_superset_disasm/metrics.ml b/lib/bap_superset_disasm/metrics.ml new file mode 100644 index 000000000..b03659cbb --- /dev/null +++ b/lib/bap_superset_disasm/metrics.ml @@ -0,0 +1,308 @@ +open Bap.Std +open Core +open Or_error + +module Linear = Disasm_expert.Linear + +let read arch ic : (string * addr * addr) list = + let sym_of_sexp x = [%of_sexp:string * int64 * int64] x in + let addr_of_int64 x = + let width = Arch.addr_size arch |> Size.in_bits in + Addr.of_int64 ~width x in + List.(Sexp.input_sexps ic >>| sym_of_sexp >>| (fun (s, es, ef) -> + s, addr_of_int64 es, addr_of_int64 ef)) + +let read_addrs ic : addr list = + List.t_of_sexp Addr.t_of_sexp @@ Sexp.input_sexp ic + +let ground_truth_of_unstripped_bin bin : addr seq Or_error.t = + let name = Filename.basename bin in + let tmp = Filename.temp_dir_name ^ "/bw_" ^ name ^ ".symbol" in + let cmd = sprintf "bap-byteweight dump -i symbols %S > %S" + bin tmp in + if Stdlib.Sys.command cmd = 0 + then return (Seq.of_list @@ In_channel.with_file tmp ~f:read_addrs) + else errorf + "failed to fetch symbols from unstripped binary, command `%s' + failed" cmd + +let true_positives_of_ground_truth superset ground_truth = + let true_positives = Addr.Hash_set.create () in + Set.iter ground_truth ~f:(fun addr -> + if Superset.ISG.mem_vertex superset addr then + Traverse.with_descendents_at + ~visited:true_positives + superset addr; + ); + true_positives + +let true_positives superset f = + let function_starts = ground_truth_of_unstripped_bin f |> ok_exn + in + let ground_truth = + Addr.Set.of_list @@ Seq.to_list function_starts in + true_positives_of_ground_truth superset ground_truth + +module Cache = struct + open Bap_knowledge + open Bap_core_theory + + let package = "superset-disasm-metrics" + let sym_label = + KB.Symbol.intern "superset_analysis" Theory.Program.cls + + let bool_t = Knowledge.Domain.optional + ~inspect:sexp_of_bool ~equal:Bool.equal "bool" + let bool_persistent = + Knowledge.Persistent.of_binable + (module struct type t = bool option [@@deriving bin_io] end) + + let int_t = Knowledge.Domain.optional + ~inspect:sexp_of_int ~equal:Int.equal "int" + + let int_persistent = + Knowledge.Persistent.of_binable + (module struct type t = int option [@@deriving bin_io] end) + + let addrs_t = + Knowledge.Domain.optional + ~inspect:Addr.Set.sexp_of_t ~equal:Addr.Set.equal "addr.set" + + let addrs_persistent = + Knowledge.Persistent.of_binable + (module struct type t = Addr.Set.t option [@@deriving bin_io] end) + + let attr ty persistent name desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent ~public:true ~desc + + let string_persistent = + Knowledge.Persistent.of_binable + (module struct type t = string [@@deriving bin_io] end) + + let ground_truth_source = + let open Knowledge.Domain in + attr string string_persistent "ground_truth_source" + "Binary containing debug information" + + let function_entrances = + attr addrs_t addrs_persistent "function_entrances" + "List of compiler intended function entrances" + + let ground_truth = + attr addrs_t addrs_persistent "ground_truth" + "Statically reachable descendents of function entrances" + + let size = + attr int_t int_persistent "code_size" + "Total number of byte that the code memory segments occupy" + + let time = + attr int_t int_persistent "processing_time" + "Time required to process this binary" + + let occlusive_space = + attr int_t int_persistent "occlusive_space" + "Number of addresses are in the bodies (addrs) of ground truth" + + let reduced_occlusion = + attr int_t int_persistent "reduced_occlusion" + "Of the occlusive space, how many are occupied" + + let false_negatives = + attr int_t int_persistent "false_negatives" + "Number of compiler intended instructions missing" + + let false_positives = + attr int_t int_persistent "false_positives" + "Number of compiler intended instructions missing" + + let true_positives = + attr int_t int_persistent "true_positives" + "Number of retained compiler intended instructions" + + let clean_functions = + attr addrs_t addrs_persistent "clean_functions" + "Functions with completely empty occlusive space" + + let occluded_entrances = + attr addrs_t addrs_persistent "occluded_entrances" + "Functions with at least one address that is obstructed" +end + +let set_ground_truth superset = + let open Bap_knowledge in + let open Bap_core_theory in + let open KB.Syntax in + KB.promise Cache.function_entrances (fun label -> + KB.collect Cache.ground_truth_source label >>= fun bin -> + (* list of compiler intended entrances *) + let function_addrs = ground_truth_of_unstripped_bin bin + |> ok_exn in + let function_addrs = + Addr.Set.of_list @@ Seq.to_list function_addrs in + KB.return (Some function_addrs) + ); + + KB.promise Cache.ground_truth (fun label -> + (* List of compiler intended addresses *) + KB.collect Cache.function_entrances label >>= + fun function_addrs -> + match function_addrs with + | None -> KB.return None + | Some function_addrs -> + let visited = Addr.Hash_set.create () in + Set.iter function_addrs ~f:(fun x -> + Traverse.with_descendents_at ~visited superset x + ); + let ground_truth = Hash_set.fold visited ~init: + Addr.Set.empty ~f:Set.add in + KB.return (Some ground_truth) + ) + +let compute_metrics superset = + let open Bap_knowledge in + let open Bap_core_theory in + let open KB.Syntax in + KB.promise Cache.size (fun label -> + KB.return (Some (Superset.Inspection.total_bytes superset)) + ); + + KB.promise Cache.occlusive_space (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | None -> KB.return None + | Some ground_truth -> + KB.return @@ + Some (Set.fold ground_truth ~init:0 + ~f:(fun occ addr -> + occ + (Superset.Inspection.len_at superset addr) + )) + ); + + (* per feature metrics *) + KB.promise Cache.reduced_occlusion (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | None -> KB.return None + | Some ground_truth -> + KB.return @@ + Some (Set.fold ground_truth ~init:0 + ~f:(fun ro addr -> + let conflicts = + Superset.Occlusion.conflict_seq_at + superset addr in + ro + (Seq.length conflicts) + )) + + ); + + KB.promise Cache.clean_functions (fun label -> + KB.collect Cache.function_entrances label >>= + fun function_entrances -> + match function_entrances with + | None -> KB.return None + | Some (function_entrances) -> + let ro_at x = + let ro = ref false in + let pre x = + let c = Superset.Occlusion.conflict_seq_at superset x in + ro := Seq.exists c ~f:(fun _ -> true); in + Traverse.with_descendents_at superset ~pre x; !ro in + let init = Addr.Set.empty in + KB.return @@ + (Some + (Set.fold function_entrances ~init ~f:(fun clean x -> + if ro_at x then clean else Set.add clean x + )) + ) + ); + + KB.promise Cache.occluded_entrances (fun label -> + KB.collect Cache.function_entrances label >>= function + | None -> KB.return None + | Some funcs -> + let occluded_starts = + Set.fold funcs ~init:Addr.Set.empty ~f:(fun occluded start -> + let behind = Addr.(start -- 20) in + let range = Superset.Core.seq_of_addr_range behind 21 in + let range = Seq.filter range ~f:(fun v -> not Addr.(v = start)) in + let addr = + Seq.find range ~f:(fun current -> + Seq.exists + (Superset.Occlusion.conflict_seq_at superset current) + ~f:(fun conflict -> Addr.((not (conflict = current)) + && conflict = start) + )) in + Option.value_map addr ~default:occluded ~f:Set.(add occluded) + ) in + KB.return (Some occluded_starts) + ); + + KB.promise Cache.true_positives (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | None -> KB.return None + | Some ground_truth -> + KB.return + (Some (Set.fold ground_truth ~init:0 ~f:(fun tp_cnt x -> + if Superset.Core.mem superset x then + tp_cnt + 1 + else tp_cnt + )) + ) + ); + + KB.promise Cache.false_negatives (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | Some ground_truth -> + let fn_cnt = + Set.fold ground_truth ~init:0 ~f:(fun cnt x -> + if not (Superset.Core.mem superset x) then + cnt + 1 + else cnt + ) in + KB.return @@ Some fn_cnt + | None -> KB.return None + ); + + KB.promise Cache.false_positives (fun label -> + KB.collect Cache.ground_truth label >>= fun ground_truth -> + match ground_truth with + | Some ground_truth -> + let false_positives = + Superset.Core.fold superset ~init:0 + ~f:(fun ~key ~data c -> + if not Set.(mem ground_truth key) then c+1 + else c) in + KB.return (Some false_positives) + | None -> KB.return None + ) + +type t = { + size : int option; + time : int option; + occ : int option; + occ_space : int option; + fe : int option; + clean : int option; + fns : int option; + fps : int option; + tps : int option; + } [@@deriving sexp] + +let get_summary () = { + size = Toplevel.eval Cache.size Cache.sym_label; + time = Toplevel.eval Cache.time Cache.sym_label; + fe = Option.map ~f:Set.length @@ + Toplevel.eval Cache.function_entrances Cache.sym_label; + occ_space = Toplevel.eval Cache.occlusive_space Cache.sym_label; + occ = Toplevel.eval Cache.reduced_occlusion Cache.sym_label; + fns = Toplevel.eval Cache.false_negatives Cache.sym_label; + fps = Toplevel.eval Cache.false_positives Cache.sym_label; + tps = Toplevel.eval Cache.true_positives Cache.sym_label; + clean = Option.map ~f:Set.length @@ + Toplevel.eval Cache.clean_functions Cache.sym_label; + } diff --git a/lib/bap_superset_disasm/metrics.mli b/lib/bap_superset_disasm/metrics.mli new file mode 100644 index 000000000..dedc27b14 --- /dev/null +++ b/lib/bap_superset_disasm/metrics.mli @@ -0,0 +1,52 @@ +open Core +open Bap.Std + +module Cache : sig + open Bap_knowledge + open Bap_core_theory + open Theory + + val sym_label : program Knowledge.obj KB.t + val ground_truth_source : (program, string) Knowledge.slot + + val function_entrances : (program, Addr.Set.t option) Knowledge.slot + + val ground_truth : (program, Addr.Set.t option) Knowledge.slot + + val size : (program, int option) Knowledge.slot + + val time : (program, int option) Knowledge.slot + + val occlusive_space : (program, int option) Knowledge.slot + + val reduced_occlusion : (program, int option) Knowledge.slot + + val false_negatives : (program, int option) Knowledge.slot + + val false_positives : (program, int option) Knowledge.slot + + val true_positives : (program, int option) Knowledge.slot + + val clean_functions : (program, Addr.Set.t option) Knowledge.slot + +end + +val set_ground_truth : Superset.t -> unit +val compute_metrics : Superset.t -> unit +val true_positives : Superset.t -> string -> Addr.Hash_set.t +type t = { + size : int option; + time : int option; + occ : int option; + occ_space : int option; + fe : int option; + clean : int option; + fns : int option; + fps : int option; + tps : int option; + } + +val t_of_sexp : Sexp.t -> t +val sexp_of_t : t -> Sexp.t + +val get_summary : unit -> t diff --git a/lib/bap_superset_disasm/report.ml b/lib/bap_superset_disasm/report.ml new file mode 100644 index 000000000..8613b7a92 --- /dev/null +++ b/lib/bap_superset_disasm/report.ml @@ -0,0 +1,103 @@ +open Core +open Bap.Std + +let () = Random.self_init () + +module Distribution = struct + type t = { + (* The ith elem says the number (int) of insns with i + accumulated total heurism instances *) + fp_at : int array; + tp_at : int array; + (* The ith list elem (x, y list) says the number (x) of true + positives that have i many heurisms in their favor and how + many false positives (y) they occlude with j number of heurisms. *) + fp_competitors_at : (int * int array) array; + } [@@deriving sexp,bin_io] + + let empty threshold = + let len = threshold + 1 in + { + fp_at = Array.create ~len 0; + tp_at = Array.create ~len 0; + fp_competitors_at = Array.create ~len (0,Array.create ~len 0); + } + + let add dist tps inst = + let (p,addr,heurism) = inst in + let p = min p @@ ((Array.length dist.fp_at) - 1) in + let insert c = + let cur = Array.get c p in + Array.set c p (cur+1) in + let _ = + if Set.mem tps addr then + insert dist.tp_at + else insert dist.fp_at in + dist +end + +let equal _ _ = false + +module Cache = struct + open Bap_knowledge + open Bap_core_theory + module SMap = struct + type t = Distribution.t String.Map.t [@@deriving bin_io,sexp] + let compare = String.Map.compare String.compare + let empty = String.Map.empty + end + + let distributions = + let package = "superset-disasm-reports" in + let map_persistent = + Knowledge.Persistent.of_binable (module SMap) in + let attr ty persistent name desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent + ~public:true + ~desc in + let open Knowledge.Domain in + let smap_domain = + mapping (module String) ~equal:(equal) "string_map" + in + attr smap_domain map_persistent "reports" + "The reports for a given analysis" + +end + +let collect_distributions superset threshold pmap = + let open Bap_knowledge in + let open Bap_core_theory in + KB.promise Cache.distributions (fun o -> + let open KB.Syntax in + KB.collect Metrics.Cache.ground_truth o >>= fun tps -> + let default = Cache.SMap.empty in + KB.return @@ Option.value_map ~default tps ~f:(fun tps -> + let init = String.Map.empty in + let pmap = + Map.mapi pmap ~f:(fun ~key ~data -> + List.fold data ~init:String.Map.empty + ~f:(fun dist (p,addr,ftname) -> + String.Map.update dist ftname ~f:(fun total -> + (Option.value total ~default:0) + p + ) + ) + ) in + let reports = + List.fold (Addr.Map.to_alist pmap) ~init ~f:(fun reports (p_at,fttot) -> + String.Map.fold fttot ~init:reports + ~f:(fun ~key ~data reports -> + let name = key in + let p = data in + String.Map.update reports name ~f:(fun dist -> + let dist = + Option.value dist + ~default:(Distribution.empty threshold) + in Distribution.add dist tps (p,p_at,name) + ) + ) + ) in + reports + ) + ) diff --git a/lib/bap_superset_disasm/report.mli b/lib/bap_superset_disasm/report.mli new file mode 100644 index 000000000..cad2a7726 --- /dev/null +++ b/lib/bap_superset_disasm/report.mli @@ -0,0 +1,3 @@ +open Bap.Std + +val collect_distributions : Superset_impl.t -> int -> (int * word * string) list Addr.Map.t -> unit diff --git a/lib/bap_superset_disasm/superset.ml b/lib/bap_superset_disasm/superset.ml new file mode 100644 index 000000000..206fbd9f2 --- /dev/null +++ b/lib/bap_superset_disasm/superset.ml @@ -0,0 +1,653 @@ +open Bap.Std +open Regular.Std +open Core +open Or_error +open Graphlib.Std +open Bap_knowledge +open Bap_core_theory + +module Dis = Disasm_expert.Basic +open Superset_impl +type elem = Superset_impl.elem +type t = Superset_impl.t + +(* private accessors *) +let get_graph superset = superset.insn_risg +let get_insns superset = superset.insns + +(* private modifiers *) +let add_to_map superset mem insn = + let insns = get_insns superset in + let addr = (Memory.min_addr mem) in + Addr.Table.set insns ~key:addr ~data:(mem, insn); + superset + +module OG = Graphlib.To_ocamlgraph(G) + +let add_to_graph superset mem insn = + let addr = Memory.min_addr mem in + let insn_risg = G.Node.insert addr superset.insn_risg in + { superset with insn_risg } + +module Cache = struct + let package = "superset-disasm" + let sym_label = + KB.Symbol.intern "superset" Theory.Program.cls + + let superset_graph_t = + let sexp_of_edge (s,d) = + Tuple2.sexp_of_t Addr.sexp_of_t Addr.sexp_of_t (s,d) in + let equal (s1,d1) (s2,d2) = Addr.equal s1 s2 && Addr.equal d1 d2 in + let inspect = List.sexp_of_t sexp_of_edge in + Knowledge.Domain.optional + ~inspect ~equal:(List.equal equal) "edges" + + let superset_graph_persistent = + Knowledge.Persistent.of_binable + (module struct type t = (addr * addr) list option [@@deriving bin_io] end) + + let superset_graph = + let attr ty persistent name desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent + ~public:true + ~desc in + attr superset_graph_t superset_graph_persistent "superset_graph" + "Graph, including all edges and single nodes." +end + +module Core = struct + let add superset mem insn = + let superset = add_to_graph superset mem insn in + let superset = add_to_map superset mem insn in + superset + + let remove superset addr = + let insn_risg = OG.remove_vertex superset.insn_risg addr in + Addr.Table.remove superset.insns addr; + Addr.Table.remove superset.lifted addr; + { superset with insn_risg; } + + let empty arch = + let brancher = Brancher.of_bil arch in + let module Target = (val target_of_arch arch) in + let lifter = Target.lift in + { + arch; + filename = None; + main_entry = None; + sections = Memmap.empty; + brancher; + endianness= None; + lifter; + insns = Addr.Table.create (); + lifted = Addr.Table.create (); + insn_risg = Graphlib.create (module G) (); + bad = Addr.Hash_set.create (); + keep = Addr.Hash_set.create (); + } + + let lookup superset addr = + Addr.Table.find superset.insns addr + + let mem superset addr = + OG.mem_vertex superset.insn_risg addr + + let fold superset = + Addr.Table.fold superset.insns + + let clear_bad superset addr = + Hash_set.remove superset.bad addr + + let clear_each superset s = + Hash_set.iter s ~f:(fun v -> + clear_bad superset v + ) + + let clear_all_bad superset = + Hash_set.clear superset.bad + + let mark_bad superset addr = + if OG.mem_vertex superset.insn_risg addr then + Hash_set.add superset.bad addr + + let copy_bad superset = + Hash_set.copy superset.bad + + let next_chunk mem ~addr = + let next_addr = Addr.succ addr in + Memory.view ~from:next_addr mem + + let seq_of_addr_range addr len = + let open Seq.Generator in + let rec gen_next_addr cur_addr = + if Addr.(cur_addr >= (addr ++ len)) then + return () + else + yield cur_addr >>= fun () -> + let next_addr = Addr.succ cur_addr in + gen_next_addr next_addr + in run (gen_next_addr Addr.(succ addr)) + + (** Builds a sequence disassembly sequence at every byte offset of + the memory mem. *) + let run_seq dis mem = + let start_addr = Memory.min_addr mem in + let len = Memory.length mem in + let addrs = seq_of_addr_range start_addr len in + Seq.filter_map addrs ~f:(fun addr -> + let m = Memory.view ~from:addr mem in + match m with + | Error _ -> None + | Ok m -> ( + match Dis.insn_of_mem dis m with + | Ok (m, insn, _) -> Some (m, insn) + | Error _ -> Some (m, None) + ) + ) + + (** Fold over the memory at every byte offset with function f *) + let run dis ~accu ~f mem = + Seq.fold ~init:accu ~f:(fun x y -> f y x) (run_seq dis mem) + + (** This builds the disasm type, and runs it on the memory. *) + let disasm ?(backend="llvm") ~addrs ~accu ~f arch memry = + Or_error.map + (Dis.with_disasm ~backend (Arch.to_string arch) + ~f:(fun d -> + let rec next state (addrs,accu) = + match Seq.next addrs with + | None -> Dis.stop state (addrs,accu) + | Some(addr,addrs) -> + match Memory.view ~from:addr memry with + | Error _ -> next state (addrs,accu) + | Ok jtgt -> Dis.jump state jtgt (addrs,accu) in + let invalid state m (addrs,accu) = + let accu = f (m, None) accu in + next state (addrs,accu) in + let hit state m insn (addrs,accu) = + let accu = f (m, (Some insn)) accu in + next state (addrs,accu) in + Ok(Dis.run ~backlog:1 ~stop_on:[`Valid] ~invalid + ~hit d ~init:(addrs,accu) ~return:Fn.id memry) + )) ~f:(fun (_,accu) -> accu) + + let disasm_all ?(backend="llvm") ~accu ~f arch memry = + let addrs = seq_of_addr_range + (Memory.min_addr memry) (Memory.length memry) in + disasm ~backend ~addrs ~accu ~f arch memry + + let lift_at superset addr = + match Addr.Table.find superset.lifted addr with + | Some (bil) -> Some (bil) + | None -> ( + match lookup superset addr with + | Some (mem, insn) -> ( + let bil = + Option.value_map insn ~default:[] ~f:(fun insn -> + match (superset.lifter mem insn) with + | Ok bil -> + Addr.Table.set superset.lifted ~key:addr ~data:bil; + bil + | _ -> [] + ) in + Some (bil) + ) + | None -> None + ) + + let lift_insn superset (mem,insn) = + let addr = Memory.(min_addr mem) in + lift_at superset addr + + (** Perform superset disassembly on mem and add the results. *) + let update_with_mem ?backend ?addrs ?f superset mem = + let f = Option.value f ~default:(fun (m, i) a -> a) in + let f (mem, insn) superset = + let superset = add superset mem insn in + f (mem, insn) superset in + let default = seq_of_addr_range + (Memory.min_addr mem) (Memory.length mem) in + let addrs = Option.value addrs ~default in + disasm ?backend ~accu:superset ~f ~addrs superset.arch mem |> ok_exn + + let is_unbalanced superset = + Addr.Table.length superset.insns <> OG.nb_vertex superset.insn_risg + + (** This function is required by the differences between propagating + data removal maximally and maintaining associations between the + address and the instruction decoded thereof. It strives to make the + results of trimming and tagging consistent between the map and + graph, in order to keep a consistent view. But this may not produce + the one to one needed, since there are scenarios in which there can + be mismatch in some scenarios that are possible in binaries. Using + the tagging and other library together with fixpoint is sufficient + to rule that out, however. *) + let rebalance superset = + let insns = get_insns superset in + let insn_risg = get_graph superset in + if is_unbalanced superset then + OG.iter_vertex (fun vert -> + if not Addr.Table.(mem insns vert) + && Memmap.contains superset.sections vert then ( + mark_bad superset vert; + ) + ) insn_risg; + let insns = + if Hash_set.length superset.bad > 0 || + is_unbalanced superset then + Addr.Table.filteri ~f:(fun ~key ~data -> + let vert = key in + OG.(mem_vertex insn_risg vert) + ) insns + else superset.insns in + let lifted = + if is_unbalanced superset then + let f ~key ~data = OG.mem_vertex insn_risg key in + Addr.Table.filteri superset.lifted ~f + else superset.lifted in + { superset with insn_risg; insns; lifted; } + +end + + +module ISG = struct + let ancestors superset = + OG.succ superset.insn_risg + + let descendants superset = + OG.pred superset.insn_risg + + let iter_vertex superset f = + OG.iter_vertex f superset.insn_risg + + let fold_vertex superset f = + OG.fold_vertex f superset.insn_risg + + let fold_edges superset f = + let insn_risg = superset.insn_risg in + OG.fold_edges f insn_risg + + let to_list superset = + let init = [] in + let f s d acc = (s,d) :: acc in + let l = fold_edges superset f init in + let f v acc = + let g = superset.insn_risg in + if OG.in_degree g v = 0 && OG.out_degree g v = 0 then + (v,v) :: acc + else acc in + fold_vertex superset f l + + let link superset v1 v2 = + let e = G.Edge.create v1 v2 () in + let insn_risg = G.Edge.insert e superset.insn_risg in + { superset with insn_risg } + + let unlink superset v1 v2 = + let insn_risg = OG.remove_edge superset.insn_risg v1 v2 in + { superset with insn_risg } + + let mem_vertex superset = OG.mem_vertex superset.insn_risg + + let check_connected superset e1 e2 = + match OG.find_all_edges + superset.insn_risg e1 e2 with + | [] -> false | _ -> true + + let raw_loops superset = + StrongComponents.scc_list superset.insn_risg + + let dfs_fold ?visited superset = + fold_component ?visited superset.insn_risg + + let dfs ?(terminator=(fun _ -> true)) + ?visited ?(pre=fun _ -> ()) ?(post=fun _ -> ()) explore superset v = + let visited = Option.value visited + ~default:(Addr.Hash_set.create ()) in + let rec visit v = + Hash_set.add visited v; + pre v; + List.iter (explore superset v) + ~f:(fun w -> + if (not (Hash_set.mem visited w)) && (terminator w) then + visit w) ; + post v + in if Core.mem superset v then visit v + + let fixpoint ?steps ?start ?rev ?step superset = + Graphlib.fixpoint ?steps ?start ?rev ?step + (module G) superset.insn_risg + + let print_dot ?colorings superset = + (*if not (colorings = String.Map.empty) then*) + let colorings = Option.value colorings ~default:String.Map.empty in + let fout = + Out_channel.create @@ Option.value_exn superset.filename + ^ ".dot" in + let superset_isg = Oper.mirror superset.insn_risg in + let insns = superset.insns in + let module Layout = + Make(struct + let instance = (superset_isg, colorings, insns) + end) in + Layout.Dot.output_graph fout (superset_isg, colorings, insns) + + let format_isg ?format superset = + let format = Option.value format ~default:Format.std_formatter in + let format = + fold_edges superset (fun src dst format -> + Format.pp_print_string format @@ Addr.to_string src; + Format.pp_print_string format @@ "-"; + Format.pp_print_string format @@ Addr.to_string dst; + Format.pp_print_string format @@ "\n"; + format + ) format in + Format.pp_print_string format @@ "\n"; + iter_vertex superset (fun v -> + Format.pp_print_string format @@ Addr.to_string v; + Format.pp_print_string format @@ "\n"; + ) + + let isg_to_string superset = + let format = Format.str_formatter in + format_isg ~format superset; + Format.flush_str_formatter () + + let parse_isg bin = + let str = In_channel.read_all bin in + let gstr = String.split_lines str in (* heh *) + let init = false,Graphlib.create (module G) () in + let _,g = List.fold ~init gstr ~f:(fun (status,g) line -> + if not status then ( + if String.equal line "" then true,g else ( + let l = String.split line ~on:'-' in + let l = List.map l ~f:Addr.of_string in + match l with + | [src;dst] -> + let e = G.Edge.create src dst () in + status,G.Edge.insert e g + | _ -> status,g + ) + ) else ( + status, OG.add_vertex g @@ Addr.of_string line + ) + ) in g + + let filter superset subgraph = + let insn_risg = superset.insn_risg in + let g = Graphlib.create (module G) () in + let g = + Hash_set.fold subgraph ~init:g ~f:(fun g addr -> + let g = OG.add_vertex g addr in + let g = OG.fold_succ + (fun s g -> + if Hash_set.mem subgraph s then + OG.add_edge g addr s + else g + ) insn_risg addr g in + let g = OG.fold_pred + (fun s g -> + if Hash_set.mem subgraph s then + OG.add_edge g s addr + else g + ) insn_risg addr g in g + ) in + { superset with insn_risg =g; } + +end + +module Inspection = struct + + let contains_addr superset addr = + Memmap.contains superset.sections addr + + let total_bytes superset = + Seq.fold (Memmap.to_sequence superset.sections) ~init:0 + ~f:(fun total (mem,_) -> (total + (Memory.length mem))) + + let count superset = OG.nb_vertex superset.insn_risg + + let count_unbalanced superset = Addr.Table.length superset.insns + + let unbalanced_diff superset = + let kys = (Addr.Table.keys superset.insns) in + let mapaddrs = + List.fold kys ~init:Addr.Set.empty ~f:Set.add in + let gaddrs = OG.fold_vertex (fun x s -> Set.add s x) + superset.insn_risg Addr.Set.empty in + Set.diff mapaddrs gaddrs, Set.diff gaddrs mapaddrs + + let get_memmap superset = superset.sections + + let get_main_entry superset = superset.main_entry + + let filename superset = superset.filename + + let static_successors superset mem insn = + let brancher = superset.brancher in + let addr = Memory.min_addr mem in + if Addr.Table.mem superset.lifted addr then + let l = ISG.descendants superset addr in + List.map l ~f:(fun dst -> (Some dst, `Fall)) + else + match insn with + | None -> [None, `Fall] + | Some insn -> + try + Brancher.resolve brancher mem insn + with _ -> ( + print_endline @@ + "Target resolve failed on memory at " ^ Memory.to_string mem; + [None, `Fall] (*KB.return []*) + ) + + let len_at superset at = + let insns = get_insns superset in + match Addr.Table.find insns at with + | None -> 0 + | Some(mem, _) -> Memory.length mem + + let num_bad superset = + Hash_set.length superset.bad + + let is_bad_at superset at = Hash_set.mem superset.bad at + + let get_segments superset = superset.sections + + let get_endianness superset = superset.endianness + + let get_arch superset = superset.arch + +end + +module Occlusion = struct + + let range_seq_of_conflicts ~mem addr len = + let range_seq = Core.seq_of_addr_range addr len in + Seq.filter range_seq ~f:mem + + let conflict_seq_at superset addr = + let insns = superset.insns in + let check_mem = Addr.Table.(mem insns) in + match Addr.Table.find insns addr with + | Some(mem, _) -> + let len = Memory.length mem in + range_seq_of_conflicts ~mem:check_mem addr len + | None -> Seq.empty + + let with_data_of_insn superset at ~f = + let len = Inspection.len_at superset at in + let body = Core.seq_of_addr_range at len in + Seq.iter body ~f + + let conflicts_within_insn_at superset ?mem ?conflicts addr = + let default = (OG.mem_vertex superset.insn_risg) in + let mem = Option.value mem ~default in + let conflicts = Option.value conflicts ~default:Addr.Set.empty in + match Core.lookup superset addr with + | Some ((m, _)) -> + let len = (Memory.length m) in + let rng = range_seq_of_conflicts ~mem addr len in + let conflicts = + if not (Seq.is_empty rng) then + Set.add conflicts addr + else conflicts in + Seq.fold rng ~init:conflicts ~f:Set.add + | None -> conflicts + + let find_all_conflicts ?mem superset = + let insns = superset.insns in + List.fold Addr.Table.(keys insns) ~init:Addr.Set.empty + ~f:(fun conflicts addr -> + conflicts_within_insn_at superset ?mem ~conflicts addr + ) + + let conflicts_within_insns superset keep = + Set.fold keep ~init:Addr.Set.empty + ~f:(fun conflicts addr -> + conflicts_within_insn_at superset + ~conflicts addr + ) + + (* It is possible that scenarios admit an instruction that is not + the parent of a shared child that contains this addr *) + let parent_conflict_at superset addr = + let children = ISG.descendants superset addr in + List.fold children ~init:Addr.Set.empty ~f:(fun cparents child -> + let parents = ISG.ancestors superset child in + List.fold parents ~init:cparents ~f:(fun cparents parent -> + if not Addr.(parent = addr) then + match Core.lookup superset parent with + | Some(mem, _) -> + let len = Memory.length mem in + if Addr.(parent < addr) && Addr.(addr < (parent ++ len)) then + Set.add cparents parent + else cparents + | None -> cparents + else cparents + ) + ) + +end + +(** An address is an entry for an isg if it could be the return + ** terminating instruction at the end of a function. *) +let is_entry superset addr = + let insn_isg = superset.insn_risg in + OG.in_degree insn_isg addr = 0 && + OG.out_degree insn_isg addr > 0 + +let entries_of_isg superset = + let insn_isg = superset.insn_risg in + OG.fold_vertex (fun addr accu -> + if is_entry superset addr then + (Hash_set.add accu addr; accu) + else accu) + insn_isg (Addr.Hash_set.create ()) + +(** A frond point is a point that is distant most of a terminating + ** instruction, meaning it may be the first instruction of a + ** function. However, these could actually occur either within or + ** beyond the body of instruction sequence intended. *) +let is_frond_point superset addr = + let insn_isg = superset.insn_risg in + OG.in_degree insn_isg addr > 0 && + OG.out_degree insn_isg addr = 0 + +let frond_of_isg superset = + let insn_isg = superset.insn_risg in + OG.fold_vertex (fun addr accu -> + if is_frond_point superset addr then + (Hash_set.add accu addr; accu) + else accu) + insn_isg (Addr.Hash_set.create ()) + +let mergers superset = + let insn_risg= superset.insn_risg in + OG.fold_vertex (fun addr mergers -> + if OG.out_degree insn_risg addr > 1 then + Addr.Set.add mergers addr + else mergers) insn_risg Addr.Set.empty + +let is_branch superset addr = + OG.in_degree superset.insn_risg addr >= 2 + +let get_branches superset = + let branches = Addr.Hash_set.create () in + ISG.iter_vertex superset (fun vert -> + if is_branch superset vert then + Hash_set.add branches vert; + ); + branches + +(* This is a traversal + val with_bad : + t -> + ?visited:'b Addr.Hash_set.t_ -> + pre:('c -> addr -> 'd) -> + post:('d -> addr -> 'c) -> 'c -> 'c +*) +let with_bad superset ?visited ~pre ~post accu = + let visited = + match visited with + | None -> (Addr.Hash_set.create ()) + | Some visited -> visited in + Hash_set.fold ~init:accu superset.bad ~f:(fun accu b -> + if OG.mem_vertex superset.insn_risg b then + ISG.dfs_fold superset ~visited + ~pre ~post accu b + else accu + ) + +let fall_through_of superset addr = + let len = Inspection.len_at superset addr in + Addr.(addr ++ len) + +let is_fall_through superset parent child = + let ft = fall_through_of superset parent in + Addr.(child = ft) + +let get_non_fall_through_edges superset = + ISG.fold_edges superset + (fun child parent jmps -> + if is_fall_through superset parent child then + Map.set jmps ~key:child ~data:parent + else jmps + ) Addr.Map.empty + +let get_callers superset addr = + let g = (get_graph superset) in + if OG.mem_vertex g addr && + OG.out_degree g addr > 0 then + let callers = OG.succ g addr in + List.filter callers ~f:(fun caller -> + not (is_fall_through superset caller addr)) + else [] + +let with_img ~accu img ~f = + let segments = Table.to_sequence @@ Image.segments img in + Seq.fold segments ~init:accu ~f:(fun accu (mem, segment) -> + if Image.Segment.is_executable segment then + f ~accu mem + else accu + ) + +let superset_of_img ?f ?addrs ~backend img = + let arch = Image.arch img in + let segments = Image.memory img in + let main_entry = Image.entry_point img in + let filename = Image.filename img in + let f = Option.value f ~default:(fun (m, i) a -> a) in + let superset = + of_components ~main_entry ?filename ~segments arch in + with_img ~accu:superset img + ~f:(fun ~accu mem -> + Core.update_with_mem ~backend ?addrs accu mem ~f + ) + +let superset_disasm_of_file ?(backend="llvm") ?f ?addrs binary = + let img, errs = Image.create ~backend binary |> ok_exn in + superset_of_img ~backend img ?addrs ?f + diff --git a/lib/bap_superset_disasm/superset.mli b/lib/bap_superset_disasm/superset.mli new file mode 100644 index 000000000..a8ebf29df --- /dev/null +++ b/lib/bap_superset_disasm/superset.mli @@ -0,0 +1,325 @@ +open Bap.Std +open Core + +module Dis = Disasm_expert.Basic +type elem = mem * Dis.full_insn option + +type t = Superset_impl.t + +module ISG : sig + open Graphlib.Std + + (** Returns a list of those addresses for which the argument + address could be the immediate next instruction of. *) + val ancestors : t -> addr -> addr list + + (** Returns a list of those addresses for which the argument + address could potentially lead to. *) + val descendants : t -> addr -> addr list + + val mem_vertex : t -> addr -> bool + + val iter_vertex : t -> (addr -> unit) -> unit + + val fold_vertex : t -> (addr -> 'a -> 'a) -> 'a -> 'a + + val fold_edges : t -> (addr -> addr -> 'a -> 'a) -> 'a -> 'a + + val check_connected : t -> addr -> addr -> bool + + (** Adds an associated directed link from src to dst, tracking + addresses if they are not already. *) + val link : t -> addr -> addr -> t + + (** Removes a link between two addresses, but not stop tracking + those addresses even if they each have no more links *) + val unlink : t -> addr -> addr -> t + + (** Uses strongly connected components to determine loop lists, but + does no filtering. *) + val raw_loops : t -> addr list list + + val dfs_fold : + ?visited:Addr.Hash_set.t -> t -> pre:('a -> addr -> 'a) -> + post:('a -> addr -> 'a) -> 'a -> addr -> 'a + + val dfs : ?terminator:(addr -> bool) -> ?visited:Addr.Hash_set.t -> + ?pre:(addr -> unit) -> ?post:(addr -> unit) -> + (t -> addr -> addr list) -> t -> addr -> unit + + val to_list : t -> (addr * addr) list + + val fixpoint : ?steps:int -> + ?start:addr -> + ?rev:bool -> + ?step:(int -> addr -> 'a -> 'a -> 'a) -> + t -> + init:(addr, 'a) Solution.t -> + equal:('a -> 'a -> bool) -> + merge:('a -> 'a -> 'a) -> + f:(addr -> 'a -> 'a) -> (addr, 'a) Solution.t + + (** Print the graph to file for a given superset *) + val print_dot : ?colorings:Addr.Hash_set.t String.Map.t -> t -> unit + + (** For all items in the address hash set, remove them from the + superset. This is a raw removal, so it does not mark bad and + traverse to perform maximal removals. *) + val filter : t -> Addr.Hash_set.t -> t + + (** Prints the isg via a formatter in gml format. *) + val format_isg : ?format:Format.formatter -> t -> unit + + (** Prints the isg to a string buffer in gml format. *) + val isg_to_string : t -> string +end + +module Core : sig + + (** Insert the memory and disassembled instruction into the superset *) + val add : t -> mem -> Dis.full_insn option -> t + (** Stops tracking an address. Can unbalance the internal + structure, requiring further balanace and trim calls *) + val remove : t -> addr -> t + val empty : arch -> t + + (** This primary core function is the core of disassembly, and + simply reads each byte consecutively in memory by address + successor. *) + val run_seq : + ('a, 'b) Dis.t -> + mem -> + (mem * (Dis.asm, Dis.kinds) Dis.insn option) seq + + val seq_of_addr_range : addr -> int -> addr seq + + (** This primary core function is the core of disassembly, and simply + reads each byte consecutively in memory by address successor. It + is alike to run_seq, but it hides the sequence part, and accepts + a parameter lambda. *) + val run : + ('a, 'b) Dis.t -> + accu:'c -> + f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> 'c -> 'c) -> + mem -> 'c + + (** This function is the fundamental superset disassembly, and + disassembles at the addresses given by the supplied sequence. *) + val disasm : + ?backend:string -> addrs:addr seq -> accu:'a -> + f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> 'a -> 'a) -> + Arch.t -> mem -> 'a Or_error.t + + (** This function is the core of disassembly, and simply + reads each byte consecutively in memory by address successor. It + builds the disassembler and runs the superset behind the + scenes. One can accumulate with any arbitrary type. Later + accessories tuck the results into a custom superset + representation with graph specializations suited to the + invariants, heuristics and performance that are vital to good + operation. *) + val disasm_all : + ?backend:string -> accu:'a -> + f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> 'a -> 'a) -> + Arch.t -> mem -> 'a Or_error.t + + (** Lift a single disassembled memory and instruction pair *) + val lift_insn : + t -> (mem * Dis.full_insn option) -> (bil) option + + (** Given an address, lift a single instruction at that address *) + val lift_at : + t -> (addr) -> bil option + + (** The primary disassembler design interface. Implementing a + disassembler from the ground up almost certainly uses this as + it loads the memory images into the superset. *) + val update_with_mem : + ?backend:string -> ?addrs:addr seq -> + ?f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> t -> t) -> + t -> mem -> t + + (** Marking an address bad means that it is temporarily maintained + until a later phase in which it is removed, together with as + many other other instructions that might have accessed it as + possible. *) + val mark_bad : t -> addr -> unit + + (** Internally, performance is important, and it may be the case + that after many bad instructions are marked and removed, that + there is some mismatch between the internal tracking that is + done. So, this library imposes that after a trim that this be + called. *) + val rebalance : t -> t + + (** This removes bad entries from the tracker without pruning them + from the superset. If this is called before trimming the + superset, then the bad instructions that were marked are no + longer distinguishable as previously. *) + val clear_bad : t -> addr -> unit + + val clear_each : t -> Addr.Hash_set.t -> unit + + (** Removes all addresses from being tracked as bad, without + removing them from the superset. *) + val clear_all_bad : t -> unit + + (** Returns a copy of the set of addresses that have been marked + bad *) + val copy_bad : t -> Addr.Hash_set.t + + val lookup : t -> addr -> (mem * Dis.full_insn option) option + + (** Accumulate over each current disassembled instruction in the + current superset. *) + val fold : t -> init:'a -> f:(key:addr -> data:elem -> 'a -> 'a) -> 'a + val mem : t -> addr -> bool +end + +module Inspection : sig + (** Returns if the addr is still in the container representing the + superset. Note, an addr may be marked as bad for removal, but isn't + removed until the trim module traverses for cleaning. Marking for + removal can be reversed by calling clear_bad. *) + val contains_addr : t -> addr -> bool + + val get_endianness : t -> endian option + + val get_arch : t -> arch + + (** Mark and track the argument address for removal upon trimming. *) + val num_bad : t -> int + + (** Current number of disassembled instructions in the superset *) + val count : t -> int + + (** Returns information reporting unbalanced the superset has + become, which ideally resides at zero. *) + val count_unbalanced : t -> int + + (** Returns a tuple representing differences of addresses not in + each of either the disassembly set and the graph. *) + val unbalanced_diff : t -> (Addr.Set.t * Addr.Set.t) + + (** Returns true if the address is currently marked for removal *) + val is_bad_at : t -> addr -> bool + + (** Returns the length of the instruction at addr *) + val len_at : t -> addr -> int + + (** Returns the total overall size of memory in bytes that has been + processed by the superset *) + val total_bytes : t -> int + + (** A carefully written function that visits the address in the body + of any instruction that is longer than one byte. So, addr + 1, + addr + 2, ... addr + n. *) + val static_successors : t -> mem -> Dis.full_insn option -> + Brancher.dests + + val get_memmap : t -> value memmap + + (** Returns the entry of the image that was originally loaded into + this superset, if any was used or if it was discovered when + loaded. *) + val get_main_entry : t -> addr option + + (** Returns the filename, if any was used, to compute this superset. *) + val filename : t -> string option +end + +module Cache : sig + open Bap_knowledge + open Bap_core_theory + open Theory + + val package : string + val sym_label : program Knowledge.obj KB.t + val superset_graph_t : + (addr * addr) list option Knowledge.domain + val superset_graph_persistent : + (addr * addr) list option Knowledge.persistent + val superset_graph : + (program, (addr * addr) list option) Knowledge.slot + +end + +module Occlusion : sig + (** For each address within the set, search within the body of the + corresponding disassembled instruction, looking for conflicts + (shingles). Add all such addresses, including the original + instruction if any occurred in it's body. Only exclude an + address from the output set if it didn't occur in the body of + another, and if no instruction conflicted. *) + val conflicts_within_insns : t -> Addr.Set.t -> Addr.Set.t + + (** For a given address, produce a set of all addresses + within the body of the disassembly that possess a conflict. *) + val conflicts_within_insn_at : + t -> ?mem:(addr -> bool) -> ?conflicts:Addr.Set.t -> addr -> Addr.Set.t + + (** For a given superset, look at every single instruction. Defers + to conflicts_within_insns in implementation. *) + val find_all_conflicts : ?mem:(addr -> bool) -> t -> Addr.Set.t + + (** A sequence view of conflicts within a given disassembled + instruction at a given address for a given length. *) + val range_seq_of_conflicts : mem:(addr -> bool) -> addr -> int -> addr seq + + (** At a given address, return all addresses within its body for + which there exists another conflicting instruction. *) + val conflict_seq_at : t -> addr -> addr seq + + (** Compute at a given address given with those addresses reside + within the body of the disassembly. *) + val with_data_of_insn : + t -> addr -> f:(addr -> unit) -> unit +end + +(** The instruction immediately after argument addr. *) +val fall_through_of : t -> addr -> addr +(** A helper function meant to tuck away the representation + underneath that tracks bad addresses. *) +val is_fall_through : + t -> addr -> addr -> bool + +(** Entry is a connotation that denotes no other instruction leads + into this one. *) +val is_entry : t -> addr -> bool + +(** Return all addresses in a set for which is_entry returns true *) +val entries_of_isg : t -> Addr.Hash_set.t + +(** A frond point is a point that is distant most of a terminating + ** instruction, meaning it may be the first instruction of a + ** function. However, these could actually occur either within or + ** beyond the body of instruction sequence intended. *) +val is_frond_point : t -> addr -> bool + +(** Return all addresses in a set for which is_front_point is true *) +val frond_of_isg : t -> Addr.Hash_set.t + +(** Return the set of addreses for which more than one other + instruction targets it. *) +val mergers : t -> Addr.Set.t + +(** Checks for the existence of successors other than fall through. *) +val is_branch : t -> addr -> bool + +(** Returns a set of the addresses for which is_branch is true *) +val get_branches : t -> Addr.Hash_set.t + +(** DFS traverses the set of addresses that are currently marked as + bad, applying functions pre and post, from the bad to all + ancestors. *) +val with_bad : + t -> ?visited:Addr.Hash_set.t -> pre:('b -> addr -> 'c) -> + post:('c -> addr -> 'b) -> 'b -> 'b + +(** Take the raw superset of a given file name of a compiled object of + any kind that Image can parse. *) +val superset_disasm_of_file : + ?backend:string -> + ?f:(mem * (Dis.asm, Dis.kinds) Dis.insn option -> t -> t) -> + ?addrs:addr seq -> string -> t + diff --git a/lib/bap_superset_disasm/superset_impl.ml b/lib/bap_superset_disasm/superset_impl.ml new file mode 100644 index 000000000..c2b886689 --- /dev/null +++ b/lib/bap_superset_disasm/superset_impl.ml @@ -0,0 +1,241 @@ +open Bap.Std +open Regular.Std +open Core +open Or_error +open Graphlib.Std + +module Dis = Disasm_expert.Basic + +type elem = mem * (Dis.full_insn option) + +module G = Graphlib.Make(Addr)(Unit) + +type t = { + arch : arch; + filename : string option; + main_entry : addr option; + sections : value memmap; + brancher : Brancher.t; + endianness : endian option; + lifter : lifter; + insns : (mem * (Dis.full_insn option)) Addr.Table.t; + lifted : bil Addr.Table.t; + insn_risg : G.t; + bad : Addr.Hash_set.t; + keep : Addr.Hash_set.t; + (* marked data *) + (* visited *) + (* union_find *) +} + +let of_components + ?main_entry ?insns ?insn_risg ?lifted ?segments ?endianness ? + filename arch = + let insn_risg = + match insn_risg with + | Some insn_risg -> insn_risg + | None -> Graphlib.create (module G) () in + let segments = Option.value segments ~default:Memmap.empty in + let insns = Option.value insns ~default:(Addr.Table.create ()) in + let lifted = Option.value lifted ~default:(Addr.Table.create ()) in + let module Target = (val target_of_arch arch) in + let lifter = Target.lift in + { + arch = arch; + filename; + sections = segments; + brancher = Brancher.of_bil arch; + endianness = None; + lifter = lifter; + main_entry; + insns; + insn_risg; + lifted; + bad = Addr.Hash_set.create (); + keep = Addr.Hash_set.create (); + } + +module M = struct + module G = struct + include G + module V = G.Node + type vertex = V.t + module E = G.Edge + end + include Graphlib.To_ocamlgraph(G) + let empty = G.empty + let iter_vertex f g = + Seq.iter (G.nodes g) ~f + let iter_succ f g v = + Seq.iter (G.Node.succs v g) ~f + let fold_succ f g v init = + Seq.fold ~init (G.Node.succs v g) ~f:(fun v g -> f g v) + let fold_vertex f g init = + Seq.fold ~init (G.nodes g) ~f:(fun v g -> f g v) + let is_directed = true + + let iter_edges_e f g = + Seq.iter (G.edges g) ~f + let iter_edges f g = + Seq.iter (G.edges g) ~f:(fun e -> + f (G.Edge.src e) (G.Edge.dst e) + ) + let remove_edge e g = + G.Edge.remove e g + let remove_edge_e v1 v2 g = + let e = G.Edge.create v1 v2 () in + remove_edge e g + let add_edge e g = + G.Edge.insert e g + let add_edge_e v1 v2 g = + let e = G.Edge.create v1 v2 () in + add_edge e g + let remove_vertex v g = + G.Node.remove v g + let add_vertex v g = G.Node.insert v g + let copy g = + let gcpy = Graphlib.create (module G) () in + Seq.fold ~init:gcpy G.(edges g) ~f:(fun gcpy e -> + add_edge e gcpy + ) +end + +module Topological = + Graph.Topological.Make(M) +module StrongComponents = Graph.Components.Make(M) +(*module DiscreteComponents = Components.Undirected(G)*) +module P = Graph.Builder.P(Graphlib.To_ocamlgraph(G)) +module Oper = Graph.Oper.Make(P) +module Dfs = Graph.Traverse.Dfs(M) + +type colored_superset = G.t * Addr.Hash_set.t String.Map.t + * elem Addr.Table.t + +module Make(T : sig val instance : colored_superset end) = struct + open T + module Dottable = struct + type t = colored_superset + + module V = struct + type t = M.V.t + end + + module E = struct + type t = M.E.t + let src = M.E.src + let dst = M.E.dst + end + + let iter_vertex f (g, _, _) = + M.iter_vertex f g + + let iter_edges_e f (g, _, _) = + M.iter_edges_e f g + + let graph_attributes _ = [ + `Fontsize 14; + ] + let default_vertex_attributes gr = [ + `Shape `Box; + (*`Height 1.0*.Memory.(length mem);*) + `Fontsize 14; + `Fontcolor 0x666699; + `Fontname "Monospace"; + `Width 1.0 + ] + + let red = 0xff0000 + let green = 0x009900 + let yellow = 0xffff00 + let blue = 0x0000ff + let orange = 0xff6600 + let purple = 0x660066 + let brown = 0x663300 + let cyan = 0x0099cc + + let vertex_name name = + let fmt = Format.str_formatter in + Addr.(pp_generic ~prefix:`none ~suffix:`none ~format:`dec + fmt name); + Format.flush_str_formatter () + + let vertex_attributes v = + let default_attrs = + [ + `Label ((vertex_name v)); + ] in + let g, colors, insns = instance in + let contains name = + match Map.find colors name with + | Some(s) -> + Hash_set.mem s v + | None -> false in + let find_update default_attrs name color = + if contains name then + `Color color :: default_attrs + else default_attrs in + let default_attrs = + find_update default_attrs "False Negatives" red in + let default_attrs = + find_update default_attrs "True Positives" green in + let default_attrs = + find_update default_attrs "False Positives" yellow in + let default_attrs = + match List.hd default_attrs with + | Some (`Color _) -> + default_attrs + | _ -> `Color 0X660000 :: default_attrs in + match Addr.Table.find insns v with + | Some(mem,insn) -> + let len = float_of_int Memory.(length mem) in + `Height (1.0 *. len) :: + default_attrs + | None -> default_attrs + + + let get_subgraph _ = None + let default_edge_attributes _ = [ + `Penwidth 1.0; + `Arrowsize 0.5; + `Headport `N; + `Tailport `S; + `Labelfloat true; + ] + + let edge_attributes e = + (*let (src,dst) = M.E.src e,M.E.dst e in*) + (*let color,weight = match kind,arity with + | `Fall,`Many -> 0x660000, 4 + | `Fall,`Mono -> 0x000066, 8 + | `Cond,_ -> 0x006600, 2 + | `Jump,_ -> 0x000066, 2 in*) + [ + (*`Color color;*) + (*`Weight weight;*) + ] + end + module Dot = Graph.Graphviz.Dot(Dottable) +end + +let fold_component ?visited ~pre ~post g accu addr = + let visited = Option.value visited + ~default:(Addr.Hash_set.create ()) in + let s = Stack.create () in + (* invariant: [h] contains exactly the vertices which have been pushed *) + let push v = + if not (Hash_set.mem visited v) then begin + Hash_set.add visited v; + Stack.push s v + end + in + push addr; + let rec loop acc = + match Stack.pop s with + | Some v -> + let acc = pre acc v in + M.iter_succ push g v; + loop @@ post acc v + | None -> acc + in + loop accu + diff --git a/lib/bap_superset_disasm/traverse.ml b/lib/bap_superset_disasm/traverse.ml new file mode 100644 index 000000000..2fa17bd97 --- /dev/null +++ b/lib/bap_superset_disasm/traverse.ml @@ -0,0 +1,82 @@ +open Core +open Bap.Std +open Superset + +(** Abstract dfs from a given starting point. *) +let iter_component ?(terminator=(fun _ -> true)) + ?visited ?(pre=fun _ -> ()) ?(post=fun _ -> ()) = + ISG.dfs ~terminator ?visited ~pre ~post ISG.ancestors + +(** This function starts at a given address and traverses toward + every statically visible descendant. It is used to maximally + propagate a given function application. *) +let with_descendents_at ?visited ?post ?pre superset addr = + ISG.dfs ?visited ?post ?pre ISG.descendants superset addr + +(** This function starts at a given address and traverses toward + every statically visible ancestor. It is used to maximally + propagate a given function application. *) +let with_ancestors_at ?visited ?post ?pre superset addr = + ISG.dfs ?visited ?post ?pre ISG.ancestors superset addr + +(** From the starting point specified, this reviews all descendants + and marks their bodies as bad. *) +let mark_descendent_bodies_at ?visited ?datas superset addr = + let datas = Option.value datas + ~default:(Addr.Hash_set.create ()) in + let mark_bad = Core.mark_bad superset in + with_descendents_at ?visited superset addr + ~pre:(fun v -> + Occlusion.with_data_of_insn superset v ~f:mark_bad; + Occlusion.with_data_of_insn superset v ~f:(Hash_set.add datas); + ) + +(** A clean wrapper around raw superset that does some management of + visited nodes for efficiency behind the scenes. *) +let visit ?visited ~pre ~post superset entries = + let visited = Option.value visited + ~default:(Addr.Hash_set.create ()) in + let pre addr = + Hash_set.add visited addr; + pre addr in + Hash_set.iter entries ~f:(fun addr -> + if not (Hash_set.mem visited addr) then + with_ancestors_at superset ~visited ~pre ~post addr + ) + + +(** This traversal unlinks all non-branch jumps, collects every + entry, which now includes newly unlinked blocks in order to + provide a separation over traversal. *) +let visit_by_block superset + ?(pre=(fun _ _ _ -> ())) ?(post=(fun _ _ _ -> ())) entries = + let (jmps,targets) = Superset.ISG.fold_edges superset + (fun src target (jmps,targets) -> + let is_branch = Superset.is_branch superset target in + let is_jmp_edge = not (Superset.is_fall_through superset src target) in + if is_branch && is_jmp_edge then + (Map.set jmps ~key:src ~data:target, Set.add targets target) + else (jmps, targets) + ) (Addr.Map.empty,Addr.Set.empty) in + let superset = + Map.fold jmps ~init:superset ~f:(fun ~key ~data superset -> + Superset.ISG.unlink superset key data; + ) in + let entries = Superset.entries_of_isg superset in + let visited = Addr.Hash_set.create () in + let rec visit v = + Hash_set.add visited v; + pre jmps targets v; + let f w = + if not (Hash_set.mem visited w) then + visit w + else pre jmps targets w in + let ancs = Superset.ISG.ancestors superset v in + List.iter ancs ~f; + post jmps targets v; + in + Hash_set.iter entries ~f:visit; + Map.fold jmps ~init:superset ~f:(fun ~key ~data superset -> + Superset.ISG.link superset key data; + ) + diff --git a/lib/bap_superset_disasm/traverse.mli b/lib/bap_superset_disasm/traverse.mli new file mode 100644 index 000000000..46bdd77cd --- /dev/null +++ b/lib/bap_superset_disasm/traverse.mli @@ -0,0 +1,16 @@ +open Bap.Std + +val mark_descendent_bodies_at : ?visited:Bap.Std.Addr.Hash_set.t -> + ?datas:Addr.Hash_set.t -> + Superset_impl.t -> addr -> unit +val with_descendents_at : ?visited:Addr.Hash_set.t -> + ?post:(addr -> unit) -> + ?pre:(addr -> unit) -> Superset_impl.t -> addr -> unit +val with_ancestors_at : ?visited:Addr.Hash_set.t -> + ?post:(addr -> unit) -> + ?pre:(addr -> unit) -> Superset_impl.t -> addr -> unit +val visit : ?visited:Addr.Hash_set.t -> pre:(addr -> unit) -> + post:(addr -> unit) -> + Superset_impl.t -> Addr.Hash_set.t -> unit +val visit_by_block : Superset_impl.t -> ?pre:(addr Addr.Map.t -> Addr.Set.t -> addr -> unit) -> + ?post:(addr Addr.Map.t -> Addr.Set.t -> addr -> unit) -> Addr.Hash_set.t -> Superset_impl.t diff --git a/lib/bap_superset_disasm/trim.ml b/lib/bap_superset_disasm/trim.ml new file mode 100644 index 000000000..6b3fadc2f --- /dev/null +++ b/lib/bap_superset_disasm/trim.ml @@ -0,0 +1,27 @@ +open Core +open Bap.Std + +let run superset = + let accu = () in + let check_pre _ accu _ = accu in + let check_post _ accu _ = accu in + let check_elim _ _ _ = true in + let mark _ _ _ = () in + let post superset accu addr = + let module G = Superset.ISG in + if check_elim superset accu addr then ( + mark superset accu addr; + ); + check_post superset accu addr in + let visited = Addr.Hash_set.create () in + (*let superset = Superset.Core.rebalance superset in*) + let post = post superset in + let pre = check_pre superset in + let _ = Superset.with_bad superset ~visited ~pre ~post accu in + let superset = + Hash_set.fold visited ~init:superset ~f:(fun superset addr -> + Superset.Core.remove superset addr + ) in + Hash_set.clear visited; + Superset.Core.clear_all_bad superset; + superset diff --git a/lib/bap_superset_disasm/trim.mli b/lib/bap_superset_disasm/trim.mli new file mode 100644 index 000000000..464b6bf6d --- /dev/null +++ b/lib/bap_superset_disasm/trim.mli @@ -0,0 +1 @@ +val run : Superset_impl.t -> Superset_impl.t diff --git a/lib_test/bap_superset_disasm/dune b/lib_test/bap_superset_disasm/dune new file mode 100644 index 000000000..fd7a7faa6 --- /dev/null +++ b/lib_test/bap_superset_disasm/dune @@ -0,0 +1,5 @@ +(library + (name test_disasm) + (preprocess (pps ppx_bap)) + (wrapped false) + (libraries bap core_kernel ounit2 str)) diff --git a/lib_test/bap_superset_disasm/test_superset_disasm.ml b/lib_test/bap_superset_disasm/test_superset_disasm.ml new file mode 100644 index 000000000..1be894db7 --- /dev/null +++ b/lib_test/bap_superset_disasm/test_superset_disasm.ml @@ -0,0 +1,823 @@ +open OUnit2 +open Core_kernel +open Bap.Std +open Or_error +open Superset +open Bap_plugins.Std +open Graphlib.Std +open Bap_future.Std + +let requires = ["llvm"; "lifter"; "disassemble"; "disassembler"; + "semantics"] +let () = match Bap_main.init ~requires () with + | Ok () -> () + | Error err -> + let open Bap_main in + Bap_main.Extension.Error.pp Format.std_formatter err; + exit 1 + +let create_memory arch min_addr data = + let data = Bigstring.of_string data in + Memory.create (Arch.endian arch) min_addr data + +let arch = `x86 + +let segments = Table.empty + +let width = 8*(Arch.size_in_bytes arch);; +let zero = Addr.(of_int ~width 0) + +module G = Graphlib.Make(Addr)(Unit) +module Topological = Superset_impl.Topological + +let add_edge g v1 v2 = + let e = G.Edge.create v1 v2 () in + G.Edge.insert e g + +let mem_edge g v1 v2 = + let e = G.Edge.create v1 v2 () in + G.Edge.mem e g + +let init () = + let insn_isg = Graphlib.create (module G) () in + let insn_map = Addr.Map.empty in + insn_map, insn_isg + +let min_addr = 1 +let addr_size= Size.in_bits @@ Arch.addr_size arch +let min_addr = Addr.of_int ~width:addr_size min_addr + +let make_params ?(mina=min_addr) bytes = + let memory = create_memory arch min_addr bytes |> ok_exn in + memory, arch + +let check_results sizes expected_results = + let sizes = Seq.to_list sizes in + List.iter2_exn sizes expected_results + ~f:(fun actual_size expected_size -> + assert_equal ~msg:((List.to_string ~f:string_of_int sizes) + ^ (List.to_string ~f:string_of_int + expected_results)) actual_size + expected_size) + +let superset_to_length_list superset = + List.map superset + ~f:(fun (mem, insn) -> (Memory.length mem)) + +(* This test affirms that both the order and the inner sequences of a set of bytes + will be interpreted appropriately by the superset conservative disassembler *) +let test_hits_every_byte test_ctxt = + let memory, arch = make_params "\x2d\xdd\xc3\x54\x55" in + let raw_superset = Superset.Core.disasm_all + ~accu:[] ~f:List.cons arch memory |> ok_exn in + let sizes = superset_to_length_list raw_superset in + let expected_results = List.rev [ 5; 2; 1; 1; 1; ] in + check_results (Seq.of_list sizes) expected_results + +let of_mem arch mem = + let superset = Superset.Core.empty arch in + let f = (Invariants.tag ~invariants:[Invariants.tag_success]) in + Superset.Core.update_with_mem superset mem ~f + +let get_bads superset mem = + let maddr = Memory.min_addr mem in + let l = Memory.length mem in + let bds = Superset.Core.seq_of_addr_range maddr l in + Seq.filter bds + ~f:(Superset.Inspection.is_bad_at superset) + +let str_of_bads superset mem = + let bds = get_bads superset mem in + let bds = Seq.to_list bds in + List.to_string ~f:Addr.to_string bds + +let debug_msg superset mem = + let msg = Superset.ISG.isg_to_string superset in + let bads = str_of_bads superset mem in + let msg = sprintf "%s\n%s" + msg bads in + (*let pattern = ": " in + let msi = + String.substr_index mems ~pattern |> Option.value_exn in + let start = (msi+(String.length pattern)) in + let finish = String.((length mems)) - start in + let ms = String.sub mems start finish in*) + let cnt = Superset.Inspection.count superset in + let unb = Superset.Inspection.count_unbalanced superset in + sprintf "%s\ncount (graph): %d, unbalanced (map): %d" msg cnt unb + +(* TODO make default initialization from bytes much shorter *) + +let test_trim test_ctxt = + let bytes = "\x2d\xdd\xc3\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + let superset = + Invariants.tag_superset superset in + let superset = Trim.run superset in + let superset = Superset.Core.rebalance superset in + let dbg = debug_msg superset mem in + let bads = str_of_bads superset mem in + let explanation = + sprintf "Expect one instruction, got %d. bad: %s" + (Superset.Inspection.count superset) bads in + let msg = sprintf "%s\n%s" + dbg explanation in + (* Only the return opcode ( 0xc3 ) can survive trimming *) + (* After refactoring, it may be that some targets that fall outside + the memory bounds are still in the graph, but this only accounts + for one edge, so it is negligible. *) + assert_equal ~msg 1 @@ Superset.Inspection.count superset + +let test_can_lift test_ctxt = + let bytes = "\x2d\xdd\xc3\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + try + Superset.Core.fold superset ~init:() ~f:(fun ~key ~data () -> + let (mem, insn) = data in + let bil = Superset.Core.lift_insn superset (mem, insn) in + let msg = + sprintf "couldn't lift at %s" (Memory.to_string mem) in + match bil with + | Some _ -> () + | _ -> + assert_bool msg false + ) + with _ -> () + +let test_brancher test_ctxt = + let bytes = "\x77\x77" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + match Superset.Core.lookup superset Memory.(min_addr mem) with + | Some (mem, insn) -> + let ss = + Superset.Inspection.static_successors superset mem insn in + let msg = sprintf "Should be two static successors here" in + assert_bool msg (List.(length ss) > 1) + | None -> assert_bool "should be an instruction at 0" false + +let test_lift test_ctxt = + let bytes = "\x77\x77" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + match Superset.Core.lookup superset Memory.(min_addr mem) with + | Some (mem, insn) -> + let lifted = + Superset.Core.lift_insn superset (mem, insn) in + let msg = sprintf "Should be able to lift" in + assert_bool msg Option.(is_some lifted); + let r = Option.value_map ~default:false lifted ~f:(fun (bil) -> + List.length bil > 0) in + assert_bool "Should support this!" r + | None -> assert_bool "should be an instruction at 0" false + +(* TODO want a bil language oriented way to specify the construction of a superset *) +let dis_with_invariants ?superset bytes invariants = + let mem, arch = make_params bytes in + let default = + of_mem arch mem in + let superset = Option.value superset ~default in + let f = (Invariants.tag ~invariants) in + let superset = Superset.Core.update_with_mem + superset mem ~f in + let msg = debug_msg superset mem in + let msg = sprintf "Should be bad at %s\n%s" + Addr.(to_string min_addr) msg in + let superset = Trim.run superset in + let superset = Superset.Core.rebalance superset in + assert_bool msg @@ not @@ Superset.Core.mem superset min_addr; + let msg = debug_msg superset mem in + let offset_one = + Superset.Core.mem superset min_addr in + assert_bool msg (not offset_one) + +let test_tag_non_insn test_ctxt = + dis_with_invariants "\x0f\xff" + [Invariants.tag_success; Invariants.tag_non_insn] + +let test_tag_target_is_bad test_ctxt = () +(* TODO don't want to use this until have functor on superset + * interface allowing to set the semantics directly for test *) + (*let bytes = "\x77\xfe" in + dis_with_invariants bytes + [Invariants.tag_success; Invariants.tag_target_is_bad]*) + +let test_target_in_body test_ctxt = + let bytes = "\x77\xFF" in + dis_with_invariants bytes + [Invariants.tag_success; Invariants.tag_target_in_body] + +let test_target_not_in_mem test_ctxt = + let bytes = "\x77\x77" in + let invariants = [Invariants.tag_success; + Invariants.tag_target_not_in_mem] in + dis_with_invariants bytes invariants + +let test_static_successors_includes_fall_through test_ctxt = + let bytes = "\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + let maddr = Memory.(min_addr mem) in + match Superset.Core.lookup superset maddr with + | Some (mem, insn) -> + let tgts = + Superset.Inspection.static_successors superset mem insn in + let b = List.fold ~init:false tgts ~f:(fun status (addro, e) -> + match addro with + | Some addr -> status || Addr.(addr = (succ maddr)) + | None -> status + ) in + assert_bool "static successors doesn't contain fall through" b + | None -> assert_bool "insn expected here" false + + +let test_successor_calculation test_ctxt = + let bytes = "\x2d\xdd\xc3\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + let mn_addr = Memory.(min_addr mem) in + let mx_addr = Memory.(max_addr mem) in + match Superset.Core.lookup superset mn_addr with + | Some (mem, insn) -> + let tgts = + Superset.Inspection.static_successors superset mem insn in + let b = List.fold ~init:false tgts ~f:(fun status (addro, e) -> + match addro with + | Some addr -> + status || + (Addr.(addr > (mx_addr)) && (not Memory.(contains mem addr))) + | None -> status + ) in + let b = b && (List.length tgts > 0) in + assert_bool "static successors doesn't contain fall through" b + | None -> assert_bool "insn expected here" false + +let test_superset_contains_addr test_ctxt = + let bytes = "\x2d\xdd\xc3\x54\x55" in + let mem, arch = make_params bytes in + let superset = of_mem arch mem in + let mn_addr = Memory.(min_addr mem) in + match Superset.Core.lookup superset mn_addr with + | Some (mem, insn) -> + let tgts = + Superset.Inspection.static_successors superset mem insn in + let b = List.fold ~init:false tgts ~f:(fun status (addro, e) -> + match addro with + | Some addr -> + status || + (not (Superset.Inspection.contains_addr superset addr)) + | None -> status + ) in + let b = b && (List.length tgts > 0) in + assert_bool "static successors doesn't contain fall through" b + | None -> assert_bool "insn expected here" false + +let test_trims_invalid_jump test_ctxt = + let bytes = "\x55\x54\xE9\xFC\xFF\xFF\xFF" in + let memory, arch = make_params bytes in + let superset = of_mem arch memory in + let superset = Superset.Core.update_with_mem + superset memory ~f:Invariants.tag in + let superset = Trim.run superset in + let superset = Superset.Core.rebalance superset in + let expected_results = [ ] in + assert_equal ~msg:"lengths unequal" + (Superset.Inspection.count superset) + (List.length expected_results) + +let test_addr_map test_ctxt = + let min_addr = Addr.of_int ~width:addr_size 0 in + let insn_map = Addr.Map.empty in + let insn_map = Addr.Map.set insn_map ~key:min_addr ~data:() in + let insn_map = Addr.Map.set insn_map ~key:min_addr ~data:() in + let msg = "expected length to be one" in + assert_bool msg ((Addr.Map.length insn_map) = 1) + +let test_insn_isg test_ctxt = + let insn_risg = Graphlib.create (module G) () in + let addr = Addr.of_int ~width:addr_size 0 in + let insn_risg = G.Node.insert addr insn_risg in + let insn_risg = G.Node.insert addr insn_risg in + let msg = "expected length to be one" in + assert_bool msg ((G.number_of_nodes insn_risg) = 1) + +let test_consistent_superset test_ctxt = + let memory, arch = make_params "\x55\x54\xE9\xFC\xFF\xFF\xFF" in + let superset = of_mem arch memory in + let m_neg_g, g_neg_m = Superset.Inspection.unbalanced_diff superset in + let msg = "insn in map but not in after shingled of superset" in + assert_bool msg (Set.is_empty m_neg_g); + let msg = "insn in isg but not in map after shingled of superset" in + assert_bool msg (Set.is_empty g_neg_m) + +let construct_loop insn_map insn_isg start finish = + if Addr.(finish > start) then ( + (* Where we would otherwise connect the nodes from the tail + condition back up to the loop body entry, here all the edges are + reversed in the spirit of the disassembled insn_isg. *) + let insn_isg = add_edge insn_isg start finish in + let junk_data = String.of_char ' ' in + let start_mem = create_memory arch start junk_data |> ok_exn in + let insn_map = Addr.Map.set insn_map ~key:start ~data:(start_mem, None) in + let finish_mem = create_memory arch finish junk_data |> ok_exn in + let insn_map = Addr.Map.set insn_map ~key:finish ~data:(finish_mem, None) in + let one = (Addr.of_int 1 ~width) in + let two = (Addr.of_int 2 ~width) in + let rec construct_loop_body insn_isg insn_map start finish = + if not (Addr.equal start finish) then + let dist = Addr.max Addr.((finish - start)/two) one in + let step = Addr.(start + dist) in + (* Add edge from some intermediate point between the start and + the finish going from the calculated step toward the parameter + start, decreasing the distance between the outermost start + and finish. As this function executes, it creates log(dist) + nodes, each going from finish to step (reverse as it would + be a flow in a real binary) before finally reaching the + start *) + let insn_isg = add_edge insn_isg step start in + (* Because the algorithm at this point relies on the graph + and map entirely, it doesn't matter the contents of the + memory. *) + let junk_data = String.make (Addr.to_int dist |> ok_exn) ' ' in + let insn_map = Addr.Map.set insn_map ~key:start + ~data:(create_memory arch start junk_data |> ok_exn, None) in + construct_loop_body insn_isg insn_map step finish + else insn_map, insn_isg in + construct_loop_body insn_isg insn_map start finish + ) else insn_map, insn_isg + +let construct_entry_conflict insn_map insn_isg at conflict_len = + let junk_data = String.make conflict_len ' ' in + let conflict = Addr.(at ++ conflict_len) in + let insn_isg = add_edge insn_isg at conflict in + let insn_isg = add_edge insn_isg + Addr.(at ++ 1) Addr.(conflict ++ 1) in + let insn_map = Addr.Map.set insn_map ~key:at + ~data:(create_memory arch at junk_data |> ok_exn, None) in + let insn_map = Addr.Map.set insn_map ~key:conflict + ~data:(create_memory arch conflict junk_data |> ok_exn, None) in + let insn_map = Addr.Map.set insn_map ~key:Addr.(conflict ++1) + ~data:(create_memory arch Addr.(conflict ++1) junk_data |> ok_exn, None) in + let insn_map = Addr.Map.set insn_map ~key:Addr.(at ++ 1) + ~data:(create_memory arch Addr.(at ++ 1) junk_data |> ok_exn, None) in + insn_map, insn_isg + +let construct_tail_conflict + insn_map insn_isg tail_addr conflict_count = + let orig = + if G.Node.mem tail_addr insn_isg && + G.Node.degree ~dir:`Out tail_addr insn_isg > 0 then + let orig = G.Node.succs tail_addr insn_isg in + let orig = Addr.Set.of_list @@ Seq.to_list orig in orig + else Addr.Set.empty in + let insn_map, tail_len = match Addr.Map.find insn_map tail_addr with + | Some (mem, _) -> insn_map, Memory.length mem + | None -> + let tail_len = 1 in + let tail_data = String.make tail_len ' ' in + let mem = create_memory arch tail_addr tail_data |> ok_exn in + let insn_map = Addr.Map.set insn_map ~key:tail_addr + ~data:(mem, None) in + insn_map, tail_len in + let rec make_tail_options insn_map insn_isg conflict_count = + if conflict_count > 0 then + let junk_data = String.make conflict_count ' ' in + let conflict_addr = Addr.(tail_addr -- conflict_count) in + let insn_isg = add_edge insn_isg tail_addr conflict_addr in + let insn_map = Addr.Map.set insn_map ~key:conflict_addr + ~data:(create_memory arch conflict_addr junk_data |> ok_exn, + None) in + make_tail_options insn_map insn_isg (conflict_count - 1) + else + insn_map, insn_isg in + let insn_map, insn_isg = + make_tail_options insn_map insn_isg conflict_count in + let opts = G.Node.succs tail_addr insn_isg in + let opts = Addr.Set.of_list @@ Seq.to_list opts in + let opts = Set.diff opts orig in + let msg = sprintf + "expected %d, got %d" conflict_count Set.(length opts) in + assert_equal ~msg true (Set.(length opts) = conflict_count); + insn_map, insn_isg + +let test_loop_scc test_ctxt = + let insn_map, insn_isg = init () in + let entry = Addr.(of_int ~width 1) in + let insn_map, insn_isg = + construct_loop insn_map insn_isg entry Addr.(entry ++ 20) in + let loop_points = Addr.Hash_set.create () in + Seq.iter (G.nodes insn_isg) + ~f:(fun vert -> Hash_set.add loop_points vert); + let superset = + Superset_impl.of_components ~insn_map ~insn_risg:insn_isg arch in + let scc = Superset.ISG.raw_loops superset in + let scc_points = Addr.Hash_set.create () in + List.iter scc ~f:(fun scc -> + List.iter scc ~f:(fun component_addr -> + Hash_set.add scc_points component_addr; + )); + let in_loop_not_scc = "Found addr in loop but not in scc" in + Hash_set.iter loop_points ~f:(fun loop_addr -> + assert_equal ~msg:in_loop_not_scc true + @@ Hash_set.mem scc_points loop_addr); + let in_scc_not_loop = "Found addr in scc not loop" in + Hash_set.iter scc_points ~f:(fun loop_addr -> + assert_equal ~msg:in_scc_not_loop true + @@ Hash_set.mem loop_points loop_addr) + +let test_scc test_ctxt = + let insn_map, insn_risg = init () in + let zero = Addr.(of_int ~width 0) in + let entry = Addr.(of_int ~width 1) in + let insn_risg = add_edge insn_risg zero entry in + let superset = + Superset_impl.of_components ~insn_map ~insn_risg arch in + let components = Grammar.addrs_of_filtered_loops superset in + assert_equal ~msg:"found non scc component" 0 (Set.length components) + +let test_find_conflicts test_ctxt = + let insn_map, insn_risg = init () in + let in_loop_addr = Addr.(of_int ~width 0x10) in + let num_conflicts = 6 in + let insn_map, insn_risg = + construct_tail_conflict insn_map insn_risg in_loop_addr + num_conflicts in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let conflicts = Superset.Occlusion.find_all_conflicts superset in + assert_equal Set.(length conflicts) num_conflicts + +let test_trim_scc test_ctxt = + let insn_map, insn_risg = init () in + let entry = Addr.(of_int ~width 1) in + let insn_map, insn_risg = + construct_loop insn_map insn_risg entry Addr.(entry ++ 20) in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let keep_entry = Superset.Core.mem superset entry in + let msg = sprintf "entry %s should be in the graph" + Addr.(to_string entry) in + assert_equal ~msg keep_entry true; + let in_loop_addr = Addr.(of_int ~width 0x10) in + let loop_points = Addr.Hash_set.create () in + Seq.iter (G.nodes insn_risg) ~f:(Hash_set.add loop_points); + let insn_map, insn_risg = + construct_tail_conflict insn_map insn_risg in_loop_addr 3 in + let conflicts_added = Addr.Hash_set.create () in + Seq.iter (G.nodes insn_risg) ~f:(fun vert -> + if not (Hash_set.mem loop_points vert) then + Hash_set.add conflicts_added vert); + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let components = Superset.ISG.raw_loops superset in + assert_bool "Should have a component" + (List.(length components) > 0); + let superset = + Grammar.tag_loop_contradictions ~min_size:1 superset in + assert_bool "should have marked conflict" + (0 < Superset.Inspection.(num_bad superset)); + let keep_entry = Superset.Inspection.is_bad_at superset entry in + let msg = sprintf "entry %s should not be marked bad" + Addr.(to_string entry) in + assert_equal ~msg keep_entry false; + let superset = Trim.run superset in + let keep_entry = Superset.Core.mem superset entry in + let msg = sprintf "entry %s should not be removed" + Addr.(to_string entry) in + assert_equal ~msg keep_entry true; + let conflicts_added_str = List.to_string ~f:Addr.to_string @@ + Hash_set.to_list conflicts_added in + let removed_msg = "of conflicts " ^ conflicts_added_str + ^ ", residual conflict present within " in + let isg_msg = Superset.ISG.isg_to_string superset in + let removed_msg = sprintf "%s\n%s" isg_msg removed_msg in + let (mlg, glm) = Superset.Inspection.unbalanced_diff superset in + let msg = sprintf "%s\nmlg: %d, glm: %d" removed_msg + Set.(length mlg) Set.(length glm) in + let set_to_string s = + List.to_string ~f:Addr.to_string @@ Set.to_list s in + let msg = sprintf "%s\nmap less graph: %s, graph less map: %s" + msg (set_to_string mlg) (set_to_string glm) in + assert_bool msg (Set.(length mlg)=0 && Set.(length glm)=0); + let loop_msg addr = + sprintf "%s\nloop addr %s should remain during tail trim" + (Superset.ISG.isg_to_string superset) + Addr.(to_string addr) in + Hash_set.iter loop_points ~f:(fun addr -> + assert_equal ~msg:(loop_msg addr) true @@ + Superset.Core.mem superset addr) + +(* Establishes, in the case of if structures, how topological *) +(* tranversal works - one time visit only *) +let test_topological_revisit ctxt = + let _, insn_risg = init () in + let width = 32 in + let start = Addr.of_int 0 ~width in + let stop = Addr.of_int 2 ~width in + let rec make_if insn_risg current stop = + if not Addr.(current = stop) then + let next = Addr.succ current in + let insn_risg = add_edge insn_risg current next in + make_if insn_risg next stop else insn_risg in + let insn_risg = make_if insn_risg start stop in + let insn_risg = add_edge insn_risg start stop in + let update_count addr visit_count = + match Map.find visit_count addr with + | Some (count) -> + let visit_count = Map.remove visit_count addr in + Map.set visit_count ~key:addr ~data:(count+1) + | None -> Map.set visit_count ~key:addr ~data:1 in + + let visit_count = Topological.fold + update_count insn_risg Addr.Map.empty in + Map.iteri visit_count ~f:(fun ~key ~data -> assert_equal ~ctxt data 1) + +let rec extend_back insn_map insn_isg ?(step=1) addr num = + let make_link len = + let dest = Addr.(addr -- len) in + let insn_isg = add_edge insn_isg addr dest in + let junk_data = String.make len ' ' in + let mem = create_memory arch dest junk_data |> ok_exn in + let insn_map = Map.set insn_map ~key:dest ~data:(mem, None) in + (insn_map, insn_isg) in + if not (num = 0) then + let (insn_map, insn_isg) = make_link step in + extend_back insn_map insn_isg Addr.(addr -- step) (num - 1) ~step + else + insn_map, insn_isg + +let make_extended_cross tail_addr = + let insn_map, insn_risg = init () in + let insn_map, insn_risg = + construct_tail_conflict insn_map insn_risg tail_addr 2 in + let layer_options = Seq.to_list G.Node.(succs tail_addr insn_risg) in + let insn_map, insn_risg = List.fold ~init:(insn_map, insn_risg) + layer_options ~f:(fun (insn_map, insn_risg) opt -> + extend_back insn_map insn_risg opt 1 ~step:2 + ) in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let extended_points = + Superset.entries_of_isg superset in + let _,insn_risg = Hash_set.fold ~init:(None,insn_risg) extended_points + ~f:(fun (current,insn_risg) next -> + let insn_risg = Option.value_map current ~f:(fun current -> + add_edge insn_risg current next + ) ~default:insn_risg in + Some(next),insn_risg + ) in + insn_map, insn_risg + +let construct_branch insn_map insn_risg branch_at incr = + let left = Addr.(branch_at ++ incr) in + let junk_data = String.make incr ' ' in + let left_mem = create_memory arch left junk_data |> ok_exn in + let insn_map = Map.set insn_map ~key:left ~data:(left_mem, None) in + let right = Addr.(left ++ incr) in + let right_mem = create_memory arch right junk_data |> ok_exn in + let insn_map = Map.set insn_map ~key:right ~data:(right_mem, None) in + let rejoin = Addr.(right ++ incr) in + let rejoin_mem = create_memory arch rejoin junk_data |> ok_exn in + let insn_map = Map.set insn_map ~key:rejoin ~data:(rejoin_mem, None) in + let insn_risg = add_edge insn_risg left branch_at in + let insn_risg = add_edge insn_risg right branch_at in + let insn_risg = add_edge insn_risg rejoin right in + let insn_risg = add_edge insn_risg rejoin left in + insn_map, insn_risg + +let test_branch_recognition test_ctxt = + let tail_addr = Addr.of_int ~width:addr_size 50 in + let insn_map, insn_risg = init () in + let insn_map, insn_risg = + construct_branch insn_map insn_risg tail_addr 2 in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let entries = Superset.entries_of_isg superset in + let msg = "expect at least one entry" in + assert_bool msg (Hash_set.(length entries) > 0); + let branches = Grammar.identify_branches superset in + let msg = sprintf + "expect branches to be detected! was %d" + Hash_set.(length branches) in + assert_bool msg (Hash_set.(length branches) = 1); + let msg = "expect exact branch addr to be detected!" in + assert_bool msg (Hash_set.(mem branches tail_addr)); + () + +let test_dfs_iter_order test_ctxt = + let insn_map, insn_risg = init () in + let start = Addr.of_int ~width:addr_size 40 in + let insn_risg = add_edge insn_risg start Addr.(succ start) in + let insn_risg = add_edge insn_risg start Addr.(start ++ 2) in + let visit_order = ref [] in + let superset = + Superset_impl.of_components ~insn_map ~insn_risg arch in + Traverse.with_ancestors_at + ~pre:(fun v -> visit_order := v :: !visit_order) + superset start; + visit_order := List.rev !visit_order; + let msg = sprintf "expected addr %s to be first, was %s" + Addr.(to_string start) + (List.to_string ~f:Addr.to_string !visit_order) in + match !visit_order with + | first :: _ -> + assert_equal ~msg first start + | _ -> assert_bool msg false + +(* conflicts should include both the instruction at a data address *) +(* and the instruction whose body it is inside. *) +let test_find_all_conflicts test_ctxt = + let insn_map, insn_risg = init () in + let tail_addr = Addr.of_int ~width:addr_size 50 in + let num_conflicts = 2 in + let insn_map, insn_risg = + construct_tail_conflict + insn_map insn_risg tail_addr num_conflicts in + let superset = Superset_impl.of_components + ~insn_map ~insn_risg arch in + let conflicts = Superset.Occlusion.find_all_conflicts superset in + let msg = sprintf "expect %d conflicts" num_conflicts in + assert_equal ~msg num_conflicts Set.(length conflicts) + + +(* Establish the idempotency or addition of edges. *) +let test_graph_edge_behavior test_ctxt = + let _, insn_risg = init () in + let start = Addr.of_int ~width:addr_size 50 in + let insn_risg = add_edge insn_risg start Addr.(succ start) in + let insn_risg = add_edge insn_risg start Addr.(succ start) in + let insn_risg = add_edge insn_risg start Addr.(succ start) in + let edges = Seq.filter (G.edges insn_risg) ~f:(fun e -> + Addr.((G.Edge.src e) = start) + && Addr.((G.Edge.dst e) = Addr.(succ start))) in + let msg = "expect single edge between nodes" in + assert_equal ~msg Seq.(length edges) 1 + +let test_streams test_ctxt = + let strm, sgnl = Stream.create () in + let called = ref false in + Stream.watch strm (fun id _ -> + called := true; + Stream.unsubscribe strm id + ); + Signal.send sgnl (); + assert_bool "stream did not receive signal called" !called + +let test_ssa test_ctxt = + let arch = Arch.(`x86_64) in + let make_chain bils = + let bils = List.rev bils in + let insn_map, insn_risg = init () in + let lifted = Addr.Table.create () in + let init = zero,insn_risg in + let _,insn_risg = + List.fold bils ~init ~f:(fun (accu,insn_risg) bil -> + Addr.Table.set lifted ~key:accu ~data:bil; + let s = Addr.succ accu in + let e = G.Edge.create accu s () in + s,G.Edge.insert e insn_risg + ) in + Superset_impl.of_components ~insn_map ~insn_risg ~lifted arch in + let find_ssa superset ~f = + let entries = Superset.entries_of_isg superset in + assert_bool "Expect >= 1 entry in superset" + ((Hash_set.length entries) > 0); + let fssa = Liveness.compute_liveness superset in + let ssa = Addr.Hash_set.create () in + List.iter Set.(to_list fssa) ~f:Hash_set.(add ssa); + f ssa in + let superset = + let open Bil in + let v = Var.create "v" @@ Bil.Types.Imm 32 in + let def = Bil.move v @@ Bil.Int zero in + let use = Bil.move v ((Bil.Var v) + (Bil.Int (Addr.succ zero))) in + make_chain [[def]; [use]] in + find_ssa superset ~f:(fun ssa_rax -> + assert_bool "Expect >= 1 ssa for move move" + ((Hash_set.length ssa_rax) > 0)); + let superset = + let open Bil in + let s = Size.(`r64) in + let mem = Bil.var @@ Var.create "mem" (Bil.Types.Imm 64) in + let exp = Bil.var @@ Var.create "v1" (Bil.Types.Imm 64) in + let addr = Var.create "addr" (Bil.Types.Imm 64) in + let mv_addr = Bil.move addr Bil.(exp + (Int zero)) in + let st = store ~mem ~addr:Bil.(Var addr) exp LittleEndian s; in + let stv = Var.create "st" @@ Bil.Types.Imm 32 in + let v2 = Var.create "v2" (Bil.Types.Imm 64) in + let move_v2 = Bil.(move v2 Bil.(exp + (Int zero))) in + let mem = Bil.var @@ Var.create "mem2" (Bil.Types.Imm 64) in + let ld = load ~mem ~addr:(Var v2) LittleEndian s in + let ldv = Var.create "ld" @@ Bil.Types.Imm 32 in + let st = Bil.move stv st in + let ld = Bil.move ldv ld in + make_chain [[mv_addr]; [st]; [move_v2]; [ld];] in + find_ssa superset ~f:(fun ssa_renamed -> + let actual = (Hash_set.length ssa_renamed) in + let msg = + sprintf + "Expect >= 2 ssa for recognition over renaming, was %d" + actual in + assert_bool msg (actual > 2)); + let superset = + let open Bil in + let s = Size.(`r64) in + let mem = Bil.var @@ Var.create "mem" (Bil.Types.Imm 64) in + let ld = load ~mem ~addr:(Int zero) LittleEndian s in + let ldv = Var.create "ld" @@ Bil.Types.Imm 32 in + let ld = [Bil.move ldv ld] in + let exp = Bil.var @@ Var.create "v1" (Bil.Types.Imm 64) in + let st = store ~mem ~addr:(Int zero) exp LittleEndian s; in + let stv = Var.create "st" @@ Bil.Types.Imm 32 in + let st = [Bil.move stv st] in + make_chain [ld; st] in + find_ssa superset ~f:(fun ssa_mem_mem -> + assert_bool "Expect >= 1 ssa for load store, address exp equal" + ((Hash_set.length ssa_mem_mem) > 0)); + let superset = + let open Bil in + let s = Size.(`r64) in + let mem = Bil.var @@ Var.create "mem" (Bil.Types.Imm 64) in + let ld = load ~mem ~addr:(Int zero) LittleEndian s in + let ldv = Var.create "ld" @@ Bil.Types.Imm 32 in + let ld = [Bil.move ldv ld] in + let exp = Bil.var @@ Var.create "v1" (Bil.Types.Imm 64) in + let st = store ~mem ~addr:(Int zero) exp LittleEndian s; in + let stv = Var.create "st" @@ Bil.Types.Imm 32 in + let st = [Bil.move stv st] in + make_chain [ld; st] in + find_ssa superset ~f:(fun ssa_mem_mem -> + assert_bool + "Expect >= 1 ssa for memory exp operation to same addr" + ((Hash_set.length ssa_mem_mem) > 0)); + let superset = + let open Bil in + let s = Size.(`r64) in + let mem = Bil.var @@ Var.create "x" (Bil.Types.Imm 64) in + let ld = load ~mem ~addr:(Int zero) LittleEndian s in + let ldv = Var.create "ld" @@ Bil.Types.Imm 32 in + let ld = [Bil.move ldv ld] in + let exp = Bil.((var ldv) + mem) in + let st = + store ~mem ~addr:(Int (Addr.succ zero)) exp LittleEndian s; in + let stv = Var.create "st" @@ Bil.Types.Imm 32 in + let st = [Bil.move stv st] in + make_chain [ld; st] in + find_ssa superset ~f:(fun ssa_load_store -> + assert_bool + "Expect >= 1 ssa for memory exp operation sharing variable" + ((Hash_set.length ssa_load_store) > 0)); + let superset = + let open Bil in + let v1 = Var.create "v2" @@ Bil.Types.Imm 32 in + let def1 = Bil.move v1 @@ Bil.Int zero in + let v2 = Var.create "v2" @@ Bil.Types.Imm 32 in + let def2 = Bil.move v2 @@ Bil.Var v1 in + let v3 = Var.create "v3" @@ Bil.Types.Imm 32 in + let use = Bil.move v3 ((Bil.Var v1) + (Bil.Var v2)) in + let step1 = [def1] in + let step2 = [def2] in + let step3 = [use] in + make_chain [step1; step2; step3; step3;] in + find_ssa superset ~f:(fun ssa_chain -> + assert_bool "Expect >= 2 ssa for register chain" + ((Hash_set.length ssa_chain) > 1)); + () + +let () = + let suite = + "suite">::: + [ + "test_hits_every_byte" >:: test_hits_every_byte; + "test_trim" >:: test_trim; + "test_trims_invalid_jump" >:: test_trims_invalid_jump; + "test_addr_map" >:: test_addr_map; + "test_insn_isg" >:: test_insn_isg; + "test_consistent_superset" >:: test_consistent_superset; + "test_loop_scc" >:: test_loop_scc; + "test_scc" >:: test_scc; + "test_find_conflicts" >:: test_find_conflicts; + "test_trim_scc" >:: test_trim_scc; + "test_topological_revisit" >:: test_topological_revisit; + "test_branch_recognition" >:: test_branch_recognition; + "test_dfs_iter_order" >:: test_dfs_iter_order; + "test_find_all_conflicts" >:: test_find_all_conflicts; + "test_graph_edge_behavior" >:: test_graph_edge_behavior; + "test_can_lift" >:: test_can_lift; + "test_static_successors_includes_fall_through" >:: + test_static_successors_includes_fall_through; + "test_brancher" >:: test_brancher; + "test_lift" >:: test_lift; + "test_successor_calculation" >:: test_successor_calculation; + "test_superset_contains_addr" >:: test_superset_contains_addr; + "test_target_not_in_mem" >:: test_target_not_in_mem; + "test_tag_non_insn" >:: test_tag_non_insn; + "test_tag_target_is_bad" >:: test_tag_target_is_bad; + "test_target_in_body" >:: test_target_in_body; + "test_streams" >:: test_streams; + "test_ssa" >:: test_ssa; + ] in + run_test_tt_main suite +;; diff --git a/oasis/superset-disasm b/oasis/superset-disasm new file mode 100644 index 000000000..9cc1a0ec1 --- /dev/null +++ b/oasis/superset-disasm @@ -0,0 +1,64 @@ +OASISFormat: 0.4 +Name: superset_disasm +Version: 0.1 +Synopsis: A superset disassembler that strives to make the minimum superset of disassembly possible, starting from every byte offset by default. +Authors: Kenneth Adam Miller +Maintainers: Kenneth Adam Miller +License: MIT +Plugins: META (0.4), DevFiles (0.4) +AlphaFeatures: ocamlbuild_more_args +BuildTools: ocamlbuild +XOCamlbuildExtraArgs: -j 5 +BuildDepends: + core_kernel, + bap, + ppx_inline_test, + graphlib, + landmarks, + bap-future, + zmq, + gnuplot + +Library superset_disassemblers + Path: src/ + Install: true + CompiledObject: best + BuildDepends: bap, core_kernel, graphlib, ppx_inline_test, bap-future #, landmarks.ppx + Modules: + Superset, + Metrics, + Cmdoptions, + Traverse, + Fixpoint, + Trim, + Invariants, + Grammar, + Liveness, + Heuristics, + Report, + Superset_impl, + Metadata + +Document "superset_disassemblers_api" + Type: ocamlbuild (0.4) + BuildTools: ocamldoc + Title: API reference for superset_disasm + XOCamlbuildPath: . + XOCamlbuildExtraArgs: + "-docflags '-colorize-code -short-functors -charset utf-8'" + XOCamlbuildLibraries: superset_disassemblers + +Executable plot_superset_cache + Path: src/ + Install: true + MainIs: plot_superset_cache.ml + CompiledObject: best + BuildDepends: core_kernel, bap, cmdliner, superset_disassemblers, findlib.dynload, gnuplot, bap-future, bap-knowledge #,landmarks.ppx + + +Executable test_superset_disasm + Path: unit_tests + MainIs: test_superset_disasm.ml + Install: false + BuildDepends: core_kernel, bap, cmdliner, oUnit, superset_disassemblers, graphlib, ppx_inline_test, bap-future, findlib.dynload + CompiledObject: best diff --git a/plugins/superset_disasm/build.sh b/plugins/superset_disasm/build.sh new file mode 100755 index 000000000..e8d9473c6 --- /dev/null +++ b/plugins/superset_disasm/build.sh @@ -0,0 +1,5 @@ +#pushd plugin/ +rm -rf _build +#-pkg superset_disassemblers +bapbuild -pkg findlib.dynload -pkg str -pkg zmq -pkg bap-primus -pkg bap-knowledge -pkg superset_disassemblers superset_disassembler.plugin +#popd diff --git a/plugins/superset_disasm/install.sh b/plugins/superset_disasm/install.sh new file mode 100755 index 000000000..3137aceb3 --- /dev/null +++ b/plugins/superset_disasm/install.sh @@ -0,0 +1 @@ +bapbundle install superset_disassembler.plugin diff --git a/plugins/superset_disasm/run.sh b/plugins/superset_disasm/run.sh new file mode 100755 index 000000000..5c808b014 --- /dev/null +++ b/plugins/superset_disasm/run.sh @@ -0,0 +1 @@ +bap --superset-disassembler-target $(which objdump) $(which objdump) --superset-disassembler-export=test diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml new file mode 100644 index 000000000..a11bc2f35 --- /dev/null +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -0,0 +1,616 @@ +open Core_kernel +open Bap.Std +open Regular.Std +open Bap_knowledge +open Bap_core_theory +open Monads.Std +open Cmdoptions +open Bap_main +open Bap_plugins.Std + +include Self() +module Dis = Disasm_expert.Basic + +let man = {| + # DESCRIPTION + + Superset disassembly is a disassembly method in which every single + byte offset within the executable region of a binary is initially + treated as being potentially compiler intended output. However, + after applying several rounds of heuristics the true positives, or + the actually intended instructions, can be distinguished from the + noise. It is an alternate disassembly method from linear sweep or + recursive descent, the two (probably most) populate mainstream + disassembly methods. This approach exchanges the possibility of + some small portion of the final output including some occlusive + unintended sequences being incorrectly kept (a superset) for the + probabilistic guarantee of having no misses of those that are + intended. + + Heuristics are broken into three main groups: invariants, analyses, + and features. Invariants are ideally lawful characteristics of + binary code, where disobedience is illegal for any well formed + assembler, and run with a limited scope/visibility of just + instructions. Analyses are typically processes that identify less + visible violations of well-formed assembler rules or other lawful + assembler characteristics that require global visibility. Heuristics + are data traits that may be dirty and require some iterative + convergence to recognize the subset within the initial superset that + can be guaranteeably cleansed. Once convergence occurs, the bodies + of lineages with sufficient evidence are cleansed of occlusion, and + any lineage that does not have enough features to support being kept + is dropped. + + # ARGUMENTS + + Fundamental arguments specific to the superset disassembler include: + rounds, loader, ground_truth_bin, ground_truth_file, target, + invariants, analyses, trim_method, tp_threshold, save_addrs, + save_gt, save_dot, collect_report + + # PASSES + + Passes are not run by the superset disassembler, but the output can + be fed into the regular disassembly pipeline by making use of the + cache. At that point, an analysis pass can be run, and it isn't + meaningful to try to run an analysis pass on the raw superset alone + because it does not reconstruct the full project type. + + # OUTPUT + + The resulting project data structure could be dumped using the + $(b,--dump) (or $(b,-d) for short) option, which accepts the desired + format and, optionally, the output file name. + + It is possible to specify the $(b,--dump) option multiple times, in + which case the project will be dumped in several formats. + + ``` + bap superset_disasm /bin/echo -dasm:out.asm + ``` + |} + +let superset_disasm options = + let module With_options = + With_options(struct + let options = options + end) in + let t = Sys.time() in + let open KB.Syntax in + With_options.with_options () >>= fun superset -> + KB.promise Metrics.Cache.time (fun o -> + KB.return (Some (int_of_float (Sys.time() -. t)))); + (* Provide the is_valid label as a check on whether a given + address is in the superset after trimming *) + KB.promise Theory.Label.is_valid @@ (fun label -> + (* (target is just the machine target) *) + Theory.Label.target label >>= fun tgt -> + (* For each address the in the knowledge base *) + (* Collect the is_valid's label address *) + KB.collect Theory.Label.addr label >>= fun addr -> + match addr with + | Some addr -> + let addr = (Word.code_addr tgt addr) in + (* And return whether it should be kept or not *) + KB.return @@ Some (Superset.Core.mem superset addr) + | None -> KB.return None + ); + KB.return () + +let features_used = [ + "disassembler"; + "lifter"; + "symbolizer"; + "brancher"; + "loader"; + "abi"; +] + +type failure = + | Expects_a_regular_file of string + | Incompatible_options of string * string + | Project of Error.t + | Unknown_format of string + | Unavailable_format_version of string + | Unknown_collator of string + | Unknown_analysis of string + | No_knowledge of string + +type Extension.Error.t += Fail of failure + +module Err = Monad.Result.Make(Extension.Error)(Monad.Ident) +open Err.Syntax + +let proj_error = Result.map_error ~f:(fun err -> Fail (Project err)) + +let knowledge_reader = Data.Read.create + ~of_bigstring:Knowledge.of_bigstring () + +let knowledge_writer = Data.Write.create + ~to_bigstring:Knowledge.to_bigstring () + +let knowledge_cache () = + Data.Cache.Service.request + knowledge_reader + knowledge_writer + +let load_cache_with_digest cache digest = + match Data.Cache.load cache digest with + | None -> false + | Some state -> + info "importing knowledge from cache"; + Toplevel.set state; + true + +let import_knowledge_from_cache digest = + let digest = digest ~namespace:"knowledge" in + info "looking for knowledge with digest %a" + Data.Cache.Digest.pp digest; + let cache = knowledge_cache () in + load_cache_with_digest cache digest + +let store_knowledge_in_cache digest = + let digest = digest ~namespace:"knowledge" in + info "caching knowledge with digest %a" + Data.Cache.Digest.pp digest; + let cache = knowledge_cache () in + Toplevel.current () |> + Data.Cache.save cache digest + +let load_knowledge digest = function + | None -> import_knowledge_from_cache digest + | Some path when not (Sys.file_exists path) -> + import_knowledge_from_cache digest + | Some path -> + info "importing knowledge from %S" path; + Toplevel.set @@ Knowledge.load path; + true + +let save_knowledge ~had_knowledge ~update digest = function + | None -> + store_knowledge_in_cache digest + | Some path when update -> + info "storing knowledge base to %S" path; + Knowledge.save (Toplevel.current ()) path + | Some _ -> () + +let outputs = + Extension.Command.parameters + ~doc:"Dumps the program to (defaults to stdout) \ + in the format (defaults to bir)." + ~as_flag:"bir" + ~aliases:["d"] + Extension.Type.("[[:]]" %: string) + "dump" + +let rw_file = Extension.Type.define + ~name:"" ~print:ident ~parse:ident + ~digest:(fun path -> + if Sys.file_exists path + then Caml.Digest.file path + else Caml.Digest.string "empty") + "" + +let update = + Extension.Command.flag "update" ~aliases:["u"] + ~doc: "Preserve the knowledge base, i.e., do not change it." + +let knowledge = + Extension.Command.parameter + ~doc:"Import the knowledge to the provided knowledge base. \ + If the $(b,--update) flag is set the knowledge base \ + will be also updated with the new information. If \ + $(b,--update) is set, the knowledge base might not \ + exist and it will be created" + ~aliases:["k"; "knowledge-base";] + (Extension.Type.some rw_file) "project" + +let input = Extension.Command.argument + ~doc:"The input file" Extension.Type.("FILE" %: string =? "a.out" ) + +let loader = + Extension.Command.parameter + ~doc:"Use the specified loader. + Use the loader `raw' to load unstructured files" + Extension.Type.(string =? "llvm") + "loader" + +let target = + let t = Extension.Type.define Theory.Target.unknown + ~name:"NAME" + ~digest:(fun t -> Caml.Digest.string@@Theory.Target.to_string t) + ~parse:(fun s -> match Theory.Target.lookup ~package:"bap" s with + | Some t -> t + | None -> + invalid_argf "unknown target %S, please see \ + `bap list targets' for the full list \ + of targets" s ()) + ~print:Theory.Target.to_string in + Extension.Command.parameter t "target" + ~doc:"Refines the target architecture of the binary. \ + See `bap list targets` for the full hierarchy of targets. \ + The specified target must be a refinement of the actual \ + target stored in the binary, otherwise an error is signaled." + +let validate_input file = + Result.ok_if_true (Sys.file_exists file) + ~error:(Fail (Expects_a_regular_file file)) + +let validate_knowledge update kb = match kb with + | None -> + Ok () + | Some path -> + let error = + Fail (No_knowledge "No initial knowledge to update") in + Result.ok_if_true (Sys.file_exists path || update) ~error + +let option_digest f = function + | None -> "none" + | Some s -> f s + +module Dump_formats = struct + let parse_fmt fmt = + match String.split ~on:'-' fmt with + | [fmt;ver] -> fmt, Some ver + | _ -> fmt,None + + let flatten (x,(y,z)) = x,y,z + + let split str = match String.split ~on:':' str with + | [fmt;dst] -> flatten (`file dst,parse_fmt fmt) + | _ -> flatten (`stdout,parse_fmt str) + + let parse_format str = + let (_,fmt,ver) as r = split str in + match Project.find_writer ?ver fmt with + | Some _ -> Ok r + | None -> match Project.find_writer fmt with + | None -> Error (Fail (Unknown_format fmt)) + | Some _ -> Error (Fail (Unavailable_format_version fmt)) + + let parse outputs = + Result.all @@ + List.map outputs ~f:parse_format +end + +let make_digest inputs = + let inputs = String.concat inputs in + fun ~namespace -> + let d = Data.Cache.Digest.create ~namespace in + Data.Cache.Digest.add d "%s" inputs + +let compute_digest target disasm = + make_digest [ + Caml.Digest.file target; + disasm; + ] + +let superset_digest options = + let open Cmdoptions in + compute_digest options.target options.disassembler + +let save_metadata options = + let digest = superset_digest options ~namespace:"knowledge" in + Metadata.with_digests (fun metadata -> + let c = Option.value metadata + ~default:Metadata.Cache_metadata.empty in + KB.promise Metadata.digests (fun o -> + let d = Data.Cache.Digest.(to_string digest) in + KB.return @@ (Some + (Metadata.Cache_metadata.set c + ~key:options.target ~data:d)) + ); + Metadata.save () + ) + +let create_and_process + input outputs loader update kb options = + (*let () = save_metadata options in*) + let digest = superset_digest options in + let had_knowledge = load_knowledge digest kb in + let () = Toplevel.exec @@ + if not had_knowledge then + superset_disasm options + else KB.return () in + (match options.ground_truth_bin with + | Some bin -> + KB.promise Metrics.Cache.ground_truth_source + (fun _ -> KB.return bin); + | None -> ()); + let ro = Metrics.Cache.reduced_occlusion in + let _ = Toplevel.eval ro Metrics.Cache.sym_label in + let _ = Toplevel.eval Metrics.Cache.size Metrics.Cache.sym_label in + store_knowledge_in_cache digest + (*save_knowledge ~had_knowledge ~update digest kb*) + +let rounds = + let doc = "Number of analysis cycles" in + Extension.Command.parameter ~doc Extension.Type.(int =? 2) "rounds" + +let tp_threshold = + let doc = "The threshold at which convergence occurs" in + Extension.Command.parameter ~doc Extension.Type.(float =? 0.99) + "threshold" + +let heuristics = + let doc = + "Specify the features used to converge upon the true positive set" in + Extension.Command.parameter ~doc + Extension.Type.(list string =? Heuristics.defaults) + "heuristics" + +let invariants = + let doc = "Specify the desired invariants to apply to the superset" in + let deflt = List.map Invariants.default_tags ~f:fst in + Extension.Command.parameter ~doc + Extension.Type.(list string =? deflt) "invariants" + +let ground_truth_bin = + let doc = ("Compare results against a ground truth constructed" ^ + " from debug symbols of an unstripped binary") in + Extension.Command.parameter + ~doc Extension.Type.("" %: (some string) =? None) + "ground_truth_bin" + +let analyses = + let deflt = ["Strongly Connected Component Data"] in + Extension.Command.parameter + Extension.Type.(list string =? deflt) "analyses" + +let save_dot = Extension.Command.flag "save_dot" + +let converge = + Extension.Command.flag "converge" + +let protect = + Extension.Command.flag "protect" + +let _superset_disassemble_command : unit = + let args = + let open Extension.Command in + args $input $outputs $loader $update $knowledge + $ground_truth_bin $invariants $analyses $tp_threshold $heuristics + $save_dot $rounds $converge $protect + in + Extension.Command.declare ~doc:man "superset_disasm" + ~requires:features_used args @@ + fun input outputs loader update kb + ground_truth_bin invariants analyses tp_threshold heuristics + save_dot rounds converge protect ctxt -> + let converge = not converge in + let protect = not protect in + let options = + Fields.create ~disassembler:loader + ~ground_truth_bin ~target:input ~save_dot ~tp_threshold + ~rounds ~heuristics ~analyses + ~converge ~protect ~invariants in + validate_knowledge update kb >>= fun () -> + validate_input input >>= fun () -> + Dump_formats.parse outputs >>= fun outputs -> + Ok (create_and_process input outputs loader + update kb options) + +let destination = + Extension.Command.parameter Extension.Type.string "destination" + +let cache_digest = + Extension.Command.parameter Extension.Type.string "cache_digest" + +type cache_msg = { + digest : string; + state : bigstring; + } [@@deriving sexp] + +exception Cache_not_present +let _send_cache : unit = + let args = + let open Extension.Command in + args $input $outputs $loader $update $knowledge + $destination $cache_digest + in + let man = + "Send a cache state to a designated address. Ex: tcp://host:port" + in + Extension.Command.declare ~doc:man "send_cache" + ~requires:features_used args @@ + fun input outputs loader update kb + destination cache_digest ctxt -> + let cache = knowledge_cache () in + let d = Data.Cache.Digest.of_string cache_digest in + let had = load_cache_with_digest cache d in + let () = + if had then + let state = Toplevel.current () in + let msg = { + digest = cache_digest; + state = Knowledge.to_bigstring state; + } in + let msg = Sexp.to_string @@ sexp_of_cache_msg msg in + let zmq_ctxt = Zmq.Context.create () in + let socket = Zmq.Socket.create zmq_ctxt Zmq.Socket.push in + let () = Zmq.Socket.connect socket destination in + Zmq.Socket.send socket msg + else + raise Cache_not_present in + Ok() + +let bind_addr = + Extension.Command.parameter Extension.Type.string "bind_addr" + +let perpetuate = + Extension.Command.flag "perpetuate" + +let _recv_cache : unit = + let args = + let open Extension.Command in + args $outputs $loader $update $knowledge + $bind_addr $perpetuate + in + let man = + "Receive a cache state on the given address bound to. Ex:" ^ + "tcp://*:" in + Extension.Command.declare ~doc:man "recv_cache" + ~requires:features_used args @@ + fun outputs loader update kb + bind_addr perpetuate ctxt -> + let zmq_ctxt = Zmq.Context.create () in + let socket = Zmq.Socket.create zmq_ctxt Zmq.Socket.pull in + let () = Zmq.Socket.bind socket bind_addr in + let ran = ref false in + Ok (while perpetuate || (not !ran) do + let s = cache_msg_of_sexp @@ Sexp.of_string + @@ Zmq.Socket.recv socket in + let { digest; state; } = s in + let state = Knowledge.of_bigstring state in + let cache = knowledge_cache () in + let d = Data.Cache.Digest.of_string digest in + Data.Cache.save cache d state; + ran := true; + done) + +let metrics = + let doc = + sprintf "%s%s%s%s" + "The format string specifying how to print the metrics" + ", which include: clean_functions, true_positives, " + "false_positives, false_negatives, reduced_occlusion, " + " occlusive_space, and function_entrances " in + Extension.Command.parameter ~doc + Extension.Type.(some (list string)) "metrics" + +let _distribution_command : unit = + let args = + let open Extension.Command in + args $input $outputs $loader $update $knowledge + $ground_truth_bin $invariants $analyses + $tp_threshold $heuristics $rounds + $converge $metrics in + let man = "Perform computational operations on the cache" in + Extension.Command.declare ~doc:man "superset_distribution" + ~requires:features_used args @@ + fun input outputs loader update kb + ground_truth_bin invariants + analyses tp_threshold heuristics rounds + converge metrics + ctxt -> + validate_knowledge update kb >>= fun () -> + validate_input input >>= fun () -> + Dump_formats.parse outputs >>= fun outputs -> + let options = + Fields.create ~disassembler:loader + ~ground_truth_bin ~target:input ~save_dot:false ~tp_threshold + ~rounds ~heuristics ~analyses ~converge:false ~protect:false + ~invariants in + let digest = superset_digest options in + let _ = load_knowledge digest kb in + let map_opt = + function | None -> "unknown" | Some v -> sprintf "%d" v in + let open KB.Syntax in + Toplevel.exec @@ + (match metrics with + | Some metrics -> + Metrics.Cache.sym_label >>= (fun label -> + let oc_space = Metrics.Cache.occlusive_space in + let ro = Metrics.Cache.reduced_occlusion in + let fns = Metrics.Cache.false_negatives in + let fps = Metrics.Cache.false_positives in + let tps = Metrics.Cache.true_positives in + let slots = [ oc_space ; ro; fns; fps; tps ] in + let slots = + List.map slots ~f:(fun slt -> + KB.collect slt label >>= fun d -> + KB.return @@ + ((KB.Name.show @@ KB.Slot.name slt),(map_opt d)) + ) in + let fe = Metrics.Cache.function_entrances in + let clean = Metrics.Cache.clean_functions in + KB.all @@ List.append slots @@ + List.map [fe; clean] ~f:(fun slt -> + KB.collect slt label >>= fun d -> + let d = Option.map d ~f:Set.length in + KB.return @@ + ((KB.Name.show @@ KB.Slot.name slt),(map_opt d)) + ) >>= fun slots -> + let metric_vals = + List.fold ~init:String.Map.empty slots + ~f:(fun m (name,v) -> String.Map.set m ~key:name ~data:v) in + let fmt,rem = List.hd metrics, List.tl metrics in + let s = + match fmt, rem with + | Some fmt, Some rem -> + let init = fmt,1 in + let s,_=List.fold rem ~init ~f:(fun (fmt,v) s -> + let r = Str.regexp @@ sprintf "%%%d" v in + let opts = + List.to_string (Map.keys metric_vals) + ~f:ident in + let default = + sprintf + "\"%s\" is not a metric, opts: %s" s opts in + let s = Map.find metric_vals s in + let s = Option.value s ~default in + Str.global_replace r s fmt,v+1 + ) in s + | _ -> "inproper arguments given to metrics" in + print_endline s; + KB.return () + ) + | None -> KB.return () + ); + Ok () + +let show_cache_digest = + Extension.Command.flag "show_cache_digest" + +let reset_cache = + Extension.Command.flag "reset_cache" + +let is_present = + Extension.Command.flag "verify_cache" + +let _cache_command : unit = + let args = + let open Extension.Command in + args $input $outputs $loader $update $knowledge + $show_cache_digest $reset_cache $is_present + in + let man = "Apply operations to the superset cache" in + Extension.Command.declare ~doc:man "superset_cache" + ~requires:features_used args @@ + fun input outputs loader update kb + show_cache_digest reset_cache verify_cache + ctxt -> + let get_raw_digest bin = + compute_digest bin loader ~namespace:"knowledge" in + let get_digest bin = + let d = get_raw_digest bin in + Data.Cache.Digest.to_string d in + let () = + if verify_cache then + let lines = In_channel.read_lines input in + List.iter lines ~f:(fun bin -> + let d = get_raw_digest bin in + let cache = knowledge_cache () in + let b = load_cache_with_digest cache d in + print_endline @@ sprintf "%s Present in cache: %b" bin b; + ) + else () in + let () = + if reset_cache then + let path = get_digest input in + let cachedir = Bap_main.Extension.Configuration.cachedir in + let l = [ ""; "/data/"; "/data2/"] in + List.iter l ~f:(fun s -> + let path = cachedir ^ "/" ^ s ^ "/" ^ path in + try + Sys.remove path; + with _ -> () + ); + else () in + let () = + if show_cache_digest then + let path = get_digest input in + printf "%s\n%!" path + else () in + Ok () + From eaa8a5435432baaf2bdada34d0ee59b921b9e717 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 2 May 2025 12:35:58 -0500 Subject: [PATCH 02/31] Update dune file to not reference plot_superset_cache --- lib/bap_superset_disasm/dune | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/bap_superset_disasm/dune b/lib/bap_superset_disasm/dune index 64b8899c9..72f12af17 100644 --- a/lib/bap_superset_disasm/dune +++ b/lib/bap_superset_disasm/dune @@ -14,6 +14,5 @@ zmq gnuplot ) -(modules_without_implementation plot_superset_cache) ) From eb1b7687b148787b6c0c58041eec9a249741b6a3 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 2 May 2025 13:31:16 -0500 Subject: [PATCH 03/31] Update with plugins/superset_disasm/dune --- plugins/superset_disasm/dune | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 plugins/superset_disasm/dune diff --git a/plugins/superset_disasm/dune b/plugins/superset_disasm/dune new file mode 100644 index 000000000..3253033cf --- /dev/null +++ b/plugins/superset_disasm/dune @@ -0,0 +1,12 @@ +(library + (name bap_superset_disasm_plugin) + (public_name bap-superset-disasm.plugin) + (libraries bap ) +) + +(plugin + (name superset_disasm) + (package bap-superset-disasm) + (libraries bap-superset-disasm.plugin) + +) From 69d5cd68e0f7e8d8f3de2b02aa55232edda7917b Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 2 May 2025 13:37:23 -0500 Subject: [PATCH 04/31] Update plugins/superset_disasm/dune with site stanza --- plugins/superset_disasm/dune | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/superset_disasm/dune b/plugins/superset_disasm/dune index 3253033cf..7c5596086 100644 --- a/plugins/superset_disasm/dune +++ b/plugins/superset_disasm/dune @@ -8,5 +8,5 @@ (name superset_disasm) (package bap-superset-disasm) (libraries bap-superset-disasm.plugin) - + (site (bap-common plugins)) ) From 281d2e67aeff8d1e12e42de2e48d89724fb4d2f5 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 2 May 2025 13:45:00 -0500 Subject: [PATCH 05/31] Update modules to import the correct dependencies --- plugins/superset_disasm/superset_disassembler.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml index a11bc2f35..e5c2e2625 100644 --- a/plugins/superset_disasm/superset_disassembler.ml +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -1,10 +1,10 @@ -open Core_kernel +open Core open Bap.Std open Regular.Std open Bap_knowledge open Bap_core_theory open Monads.Std -open Cmdoptions +open Bap_superset_disasm.Cmdoptions open Bap_main open Bap_plugins.Std From c2bf2672a59e40e47be7959ff1fcb2e3c56052e3 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 2 May 2025 16:37:54 -0500 Subject: [PATCH 06/31] Link bap-superset-disasm library to superset-disasm plugin --- plugins/superset_disasm/dune | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/superset_disasm/dune b/plugins/superset_disasm/dune index 7c5596086..ec7ad953f 100644 --- a/plugins/superset_disasm/dune +++ b/plugins/superset_disasm/dune @@ -1,12 +1,12 @@ (library (name bap_superset_disasm_plugin) (public_name bap-superset-disasm.plugin) - (libraries bap ) + (libraries bap bap-superset-disasm ) ) (plugin - (name superset_disasm) + (name superset-disasm) (package bap-superset-disasm) - (libraries bap-superset-disasm.plugin) + (libraries bap-superset-disasm bap-superset-disasm.plugin) (site (bap-common plugins)) ) From 56ec054401dc822c78ddc8eaa131369d3e1e8b42 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Wed, 7 May 2025 15:58:38 -0500 Subject: [PATCH 07/31] Add mli files for Abstract_ssa and Decision_trees, also add interface of bap_superset_disasm.ml --- lib/bap_superset_disasm/abstract_ssa.mli | 9 +++++++++ lib/bap_superset_disasm/bap_superset_disasm.ml | 16 ++++++++++++++++ lib/bap_superset_disasm/decision_trees.mli | 3 +++ 3 files changed, 28 insertions(+) create mode 100644 lib/bap_superset_disasm/abstract_ssa.mli create mode 100644 lib/bap_superset_disasm/bap_superset_disasm.ml create mode 100644 lib/bap_superset_disasm/decision_trees.mli diff --git a/lib/bap_superset_disasm/abstract_ssa.mli b/lib/bap_superset_disasm/abstract_ssa.mli new file mode 100644 index 000000000..1948a31f3 --- /dev/null +++ b/lib/bap_superset_disasm/abstract_ssa.mli @@ -0,0 +1,9 @@ +open Bap.Std + +val use_ssa : bil -> Exp.Set.t + +val def_ssa : bil -> Exp.Set.t + +val use_freevars : bil -> Var.Set.t + +val def_freevars : bil -> Var.Set.t diff --git a/lib/bap_superset_disasm/bap_superset_disasm.ml b/lib/bap_superset_disasm/bap_superset_disasm.ml new file mode 100644 index 000000000..a79956fec --- /dev/null +++ b/lib/bap_superset_disasm/bap_superset_disasm.ml @@ -0,0 +1,16 @@ +module Superset_impl = Superset_impl +module Superset = Superset +module Abstract_ssa = Abstract_ssa +module Decision_trees = Decision_trees +module Cmdoptions = Cmdoptions +module Features = Features +module Fixpoint = Fixpoint +module Grammar = Grammar +module Heuristics = Heuristics +module Invariants = Invariants +module Liveness = Liveness +module Metrics = Metrics +module Report = Report +module Traverse = Traverse +module Trim = Trim + diff --git a/lib/bap_superset_disasm/decision_trees.mli b/lib/bap_superset_disasm/decision_trees.mli new file mode 100644 index 000000000..1e233dc52 --- /dev/null +++ b/lib/bap_superset_disasm/decision_trees.mli @@ -0,0 +1,3 @@ +open Bap.Std + +val tails_of_conflicts : Superset_impl.t -> Addr.Set.t -> addr list Addr.Map.t From 4f98da206b0ef1e2cbbcd9e58604bd1ea3efb759 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 9 May 2025 16:03:29 -0500 Subject: [PATCH 08/31] Move cmdoptions from lib to plugin --- lib/bap_superset_disasm/bap_superset_disasm.ml | 1 - .../superset_disasm}/cmdoptions.ml | 0 2 files changed, 1 deletion(-) rename {lib/bap_superset_disasm => plugins/superset_disasm}/cmdoptions.ml (100%) diff --git a/lib/bap_superset_disasm/bap_superset_disasm.ml b/lib/bap_superset_disasm/bap_superset_disasm.ml index a79956fec..ec32ef2ca 100644 --- a/lib/bap_superset_disasm/bap_superset_disasm.ml +++ b/lib/bap_superset_disasm/bap_superset_disasm.ml @@ -2,7 +2,6 @@ module Superset_impl = Superset_impl module Superset = Superset module Abstract_ssa = Abstract_ssa module Decision_trees = Decision_trees -module Cmdoptions = Cmdoptions module Features = Features module Fixpoint = Fixpoint module Grammar = Grammar diff --git a/lib/bap_superset_disasm/cmdoptions.ml b/plugins/superset_disasm/cmdoptions.ml similarity index 100% rename from lib/bap_superset_disasm/cmdoptions.ml rename to plugins/superset_disasm/cmdoptions.ml From dd9f51e865226145198d38ed65e94e4f6539b1ff Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 9 May 2025 16:20:59 -0500 Subject: [PATCH 09/31] Correct up plugins/superset_disasm/dune, add metadata and plot_superset_cache --- plugins/superset_disasm/dune | 2 +- plugins/superset_disasm/metadata.ml | 105 ++++++++++++++++++ .../superset_disasm/plot_superset_cache.ml | 94 ++++++++++++++++ 3 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 plugins/superset_disasm/metadata.ml create mode 100644 plugins/superset_disasm/plot_superset_cache.ml diff --git a/plugins/superset_disasm/dune b/plugins/superset_disasm/dune index ec7ad953f..b73086b71 100644 --- a/plugins/superset_disasm/dune +++ b/plugins/superset_disasm/dune @@ -1,7 +1,7 @@ (library (name bap_superset_disasm_plugin) (public_name bap-superset-disasm.plugin) - (libraries bap bap-superset-disasm ) + (libraries bap bap_superset_disasm ) ) (plugin diff --git a/plugins/superset_disasm/metadata.ml b/plugins/superset_disasm/metadata.ml new file mode 100644 index 000000000..dc792446b --- /dev/null +++ b/plugins/superset_disasm/metadata.ml @@ -0,0 +1,105 @@ +open Core +open Bap.Std +open Regular.Std +open Bap_knowledge +open Bap_core_theory + +let package = "superset-cache-guide" + +let knowledge_reader = Data.Read.create + ~of_bigstring:Knowledge.of_bigstring () + +let knowledge_writer = Data.Write.create + ~to_bigstring:Knowledge.to_bigstring () + +let knowledge_cache () = + Data.Cache.Service.request + knowledge_reader + knowledge_writer + +module Cache_metadata = struct + type t = string String.Map.t [@@deriving sexp, bin_io] + let equal = String.Map.equal String.equal + let set = String.Map.set + let empty = String.Map.empty + let fold = String.Map.fold + let length = String.Map.length +end + +let cache_metadata_t = + Knowledge.Domain.optional + ~inspect:Cache_metadata.sexp_of_t ~equal:Cache_metadata.equal + "cache_metadata_t" + +let cache_persistent = + Knowledge.Persistent.of_binable + (module struct + type t = Cache_metadata.t option [@@deriving bin_io] end) + +let digests = + let attr ty persistent name desc = + let open Theory.Program in + Knowledge.Class.property ~package cls name ty + ~persistent ~public:true ~desc in + attr cache_metadata_t cache_persistent "cache_metadata_t" + "Information about what other items can be looked up in the cache" + +let load_cache_with_digest cache digest = + match Data.Cache.load cache digest with + | None -> false + | Some state -> + Toplevel.set state; + true + +let store_knowledge_in_cache digest = + let digest = digest ~namespace:"knowledge" in + let cache = knowledge_cache () in + Toplevel.current () |> + Data.Cache.save cache digest + +let import_knowledge_from_cache digest = + let cache = knowledge_cache () in + load_cache_with_digest cache digest + +let load_knowledge digest p = + let digest = digest ~namespace:"knowledge" in + match p with + | None -> import_knowledge_from_cache digest + | Some path when not (Sys.file_exists path) -> + import_knowledge_from_cache digest + | Some path -> + Toplevel.set @@ Knowledge.load path; + true + +let make_digest inputs = + let inputs = String.concat inputs in + fun ~namespace -> + let d = Data.Cache.Digest.create ~namespace in + Data.Cache.Digest.add d "%s" inputs + +let guide = KB.Symbol.intern "cache_map" Theory.Program.cls +let metadata_digest = + (make_digest [ "superset-cache-metadata" ]) + +let save () = + let _ = Toplevel.eval digests guide in + store_knowledge_in_cache metadata_digest + +(* Retrieve the metadata of all digests *) +let with_digests f = + let state = Toplevel.current () in + let _ = load_knowledge metadata_digest None in + let ds = Toplevel.eval digests guide in + let r = f ds in + Toplevel.set state; + r + +let cache_corpus_metrics ds = + match ds with + | Some ds -> + Cache_metadata.fold ds ~init:[] ~f:(fun ~key ~data l -> + let digest = Data.Cache.Digest.of_string data in + if import_knowledge_from_cache digest then + Metrics.get_summary () :: l else l + ) + | None -> [] diff --git a/plugins/superset_disasm/plot_superset_cache.ml b/plugins/superset_disasm/plot_superset_cache.ml new file mode 100644 index 000000000..70f592e59 --- /dev/null +++ b/plugins/superset_disasm/plot_superset_cache.ml @@ -0,0 +1,94 @@ +open Core +open Bap.Std +open Regular.Std +open Bap_knowledge +open Bap_core_theory +open Monads.Std + +let () = match Bap_main.init () with + | Ok () -> () + | Error err -> + let open Bap_main in + Bap_main.Extension.Error.pp Format.std_formatter err; + exit 1 + +let transform_summaries summaries = + let open Metrics in + let summaries = + List.filter_map summaries ~f:(fun s -> + match s.size, s.occ, s.occ_space, s.fe, s.clean, s.fns, s.fps, + s.tps, s.time with + | None, _, _, _, _, _, _, _, _ -> None + | _, None, _, _, _, _, _, _, _ -> None + | _, _, None, _, _, _, _, _, _ -> None + | _, _, _, None, _, _, _, _, _ -> None + | _, _, _, _, None, _, _, _, _ -> None + | _, _, _, _, _, None, _, _, _ -> None + | _, _, _, _, _, _, None, _, _ -> None + | _, _, _, _, _, _, _, None, _ -> None + | _, _, _, _, _, _, _, _, None -> None + | Some size, Some occ, Some occ_space, Some fe, Some clean, + Some fns, Some fps, Some tps, Some time -> + Some (size, occ, occ_space, fe, clean, fns, fps, tps, time) + ) in + List.fold summaries ~init:([],[],[],[],[],[],[],[],[]) + ~f:(fun (sizes,occ,occ_space,fe,clean,fns,fps,tps,time) s -> + let _size,_occ,_occ_space,_fe,_clean,_fns,_fps,_tps,_time = s in + _size :: sizes, _occ :: occ, _occ_space :: occ_space, + _fe :: fe,_clean :: clean,_fns :: fns,_fps :: fps,_tps :: tps, + _time :: time + ) + + (* Plots: + binary size to occlusive rate (occlusion by occ space) + occlusive space to occlusion + scatter plot occlusive count and number of occ functions + size and processing time + least value required for safe convergence + number of binaries and occ rate + *) +let make_plots summaries = + let open Metrics in + let sizes,occ,occ_space,fe,clean,fns,fps,tps,time = + transform_summaries summaries in + let make_plot xlabel ylabel fname x y = + let x = List.map x ~f:float_of_int in + let y = List.map y ~f:float_of_int in + match List.zip x y with + | Ok data -> + let title = (xlabel ^ " and " ^ ylabel) in + let labels = Gnuplot.Labels.create ~x:xlabel ~y:ylabel () in + let color = Gnuplot.Color.(`Blue) in + let plot = Gnuplot.Series.points_xy ~title ~color data in + let gp = Gnuplot.create () in + let output = (Gnuplot.Output.create (`Png (title ^ ".png"))) in + Gnuplot.set ~use_grid:true gp; + Gnuplot.plot ~output ~labels gp plot; + Gnuplot.close gp; + () + | _ -> () in + make_plot "Size" "Occlusion" "size_and_occlusion.png" sizes occ; + make_plot "Actual Occlusion" "Possible Occlusion" + "occlusion_and_occspace.png" occ occ_space; + let () = + match List.map2 fe clean ~f:(fun x y -> x - y) with + | List.Or_unequal_lengths.Ok occfuncs -> + make_plot "Total Occlusion" "# Unclean functions" + "occcnt_occfuncs.png" occ occfuncs + | _ -> () in + make_plot "Size" "Time" "size_time.png" sizes time; + () + +let () = + let summaries = + Metadata.with_digests Metadata.cache_corpus_metrics in + make_plots summaries; + let sizes,occ,occ_space,fe,clean,fns,fps,tps,time = + transform_summaries summaries in + let tot_fns = List.fold fns ~init:0 ~f:(+) in + let tot_occ = List.fold occ ~init:0 ~f:(+) in + let tot_occ_space = List.fold fns ~init:0 ~f:(+) in + let avg_occ = + (float_of_int tot_occ) /. (float_of_int tot_occ_space) in + printf "fns: %d, avg occ: %f" tot_fns avg_occ + From fc2b57acb828dcfedb4fc7412d5f301d39359169 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 9 May 2025 16:25:24 -0500 Subject: [PATCH 10/31] Update lib/bap_superset_disasm/dune to correct public_name --- lib/bap_superset_disasm/dune | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bap_superset_disasm/dune b/lib/bap_superset_disasm/dune index 72f12af17..f41d3bec3 100644 --- a/lib/bap_superset_disasm/dune +++ b/lib/bap_superset_disasm/dune @@ -1,6 +1,6 @@ (library (name bap_superset_disasm) - (public_name bap-std.superset-disasm) + (public_name bap-superset-disasm) (wrapped false) (preprocess (pps ppx_bap)) (libraries From 8cfe6f0115703f931c1414bac626c75b33c3eae6 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 9 May 2025 18:15:49 -0500 Subject: [PATCH 11/31] Fix ppx related issues by adding (preprocess (pps ppx_bap)) --- plugins/superset_disasm/dune | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/superset_disasm/dune b/plugins/superset_disasm/dune index b73086b71..6a0acda64 100644 --- a/plugins/superset_disasm/dune +++ b/plugins/superset_disasm/dune @@ -1,6 +1,7 @@ (library (name bap_superset_disasm_plugin) (public_name bap-superset-disasm.plugin) + (preprocess (pps ppx_bap)) (libraries bap bap_superset_disasm ) ) From 12d4b832eff5c0bf680effa55ee6afac5bbd7e8c Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 9 May 2025 18:17:53 -0500 Subject: [PATCH 12/31] Revert Sys to Stdlib.Sys to avoid compile errors --- plugins/superset_disasm/metadata.ml | 2 +- plugins/superset_disasm/superset_disassembler.ml | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/plugins/superset_disasm/metadata.ml b/plugins/superset_disasm/metadata.ml index dc792446b..5719a33c7 100644 --- a/plugins/superset_disasm/metadata.ml +++ b/plugins/superset_disasm/metadata.ml @@ -65,7 +65,7 @@ let load_knowledge digest p = let digest = digest ~namespace:"knowledge" in match p with | None -> import_knowledge_from_cache digest - | Some path when not (Sys.file_exists path) -> + | Some path when not (Stdlib.Sys.file_exists path) -> import_knowledge_from_cache digest | Some path -> Toplevel.set @@ Knowledge.load path; diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml index e5c2e2625..2abfbee5f 100644 --- a/plugins/superset_disasm/superset_disassembler.ml +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -4,7 +4,7 @@ open Regular.Std open Bap_knowledge open Bap_core_theory open Monads.Std -open Bap_superset_disasm.Cmdoptions +open Cmdoptions open Bap_main open Bap_plugins.Std @@ -75,11 +75,11 @@ let superset_disasm options = With_options(struct let options = options end) in - let t = Sys.time() in + let t = Stdlib.Sys.time() in let open KB.Syntax in With_options.with_options () >>= fun superset -> KB.promise Metrics.Cache.time (fun o -> - KB.return (Some (int_of_float (Sys.time() -. t)))); + KB.return (Some (int_of_float (Stdlib.Sys.time() -. t)))); (* Provide the is_valid label as a check on whether a given address is in the superset after trimming *) KB.promise Theory.Label.is_valid @@ (fun label -> @@ -159,7 +159,7 @@ let store_knowledge_in_cache digest = let load_knowledge digest = function | None -> import_knowledge_from_cache digest - | Some path when not (Sys.file_exists path) -> + | Some path when not (Stdlib.Sys.file_exists path) -> import_knowledge_from_cache digest | Some path -> info "importing knowledge from %S" path; @@ -184,9 +184,9 @@ let outputs = "dump" let rw_file = Extension.Type.define - ~name:"" ~print:ident ~parse:ident + ~name:"" ~print:Fn.id ~parse:Fn.id ~digest:(fun path -> - if Sys.file_exists path + if Stdlib.Sys.file_exists path then Caml.Digest.file path else Caml.Digest.string "empty") "" @@ -233,7 +233,7 @@ let target = target stored in the binary, otherwise an error is signaled." let validate_input file = - Result.ok_if_true (Sys.file_exists file) + Result.ok_if_true (Stdlib.Sys.file_exists file) ~error:(Fail (Expects_a_regular_file file)) let validate_knowledge update kb = match kb with @@ -242,7 +242,7 @@ let validate_knowledge update kb = match kb with | Some path -> let error = Fail (No_knowledge "No initial knowledge to update") in - Result.ok_if_true (Sys.file_exists path || update) ~error + Result.ok_if_true (Stdlib.Sys.file_exists path || update) ~error let option_digest f = function | None -> "none" From 64e9e7700bb246ba6538b57a859f11f283f9e4bb Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 9 May 2025 18:21:39 -0500 Subject: [PATCH 13/31] Snapshot at fully buildinggit status --- lib/bap_superset_disasm/heuristics.mli | 1 + plugins/superset_disasm/superset_disassembler.ml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/bap_superset_disasm/heuristics.mli b/lib/bap_superset_disasm/heuristics.mli index 07d41544f..75ada0dbf 100644 --- a/lib/bap_superset_disasm/heuristics.mli +++ b/lib/bap_superset_disasm/heuristics.mli @@ -4,3 +4,4 @@ val get_callsites : ?threshold:(int) -> Superset_impl.t -> Addr.Hash_set.t val tag_callsites : Addr.Hash_set.t -> ?callsites:Addr.Hash_set.t -> Superset_impl.t -> Superset_impl.t val with_featureset : f:(string -> (Superset_impl.t -> Superset_impl.t) -> 'a -> 'a) -> init:'a -> string list -> 'b -> 'a val with_featurepmap : string list -> Superset_impl.t -> f:((int * word * string) list Addr.Map.t -> string list -> Superset_impl.t -> unit) -> unit +val defaults : string list diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml index 2abfbee5f..a2fd2690f 100644 --- a/plugins/superset_disasm/superset_disassembler.ml +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -543,7 +543,7 @@ let _distribution_command : unit = let r = Str.regexp @@ sprintf "%%%d" v in let opts = List.to_string (Map.keys metric_vals) - ~f:ident in + ~f:Fn.id in let default = sprintf "\"%s\" is not a metric, opts: %s" s opts in @@ -603,7 +603,7 @@ let _cache_command : unit = List.iter l ~f:(fun s -> let path = cachedir ^ "/" ^ s ^ "/" ^ path in try - Sys.remove path; + Stdlib.Sys.remove path; with _ -> () ); else () in From 47bc22342dbdad520255ccc4ff769ca31efd1a2b Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Mon, 12 May 2025 02:56:21 -0500 Subject: [PATCH 14/31] Remove old scripts --- plugins/superset_disasm/build.sh | 5 ----- plugins/superset_disasm/install.sh | 1 - plugins/superset_disasm/run.sh | 1 - 3 files changed, 7 deletions(-) delete mode 100755 plugins/superset_disasm/build.sh delete mode 100755 plugins/superset_disasm/install.sh delete mode 100755 plugins/superset_disasm/run.sh diff --git a/plugins/superset_disasm/build.sh b/plugins/superset_disasm/build.sh deleted file mode 100755 index e8d9473c6..000000000 --- a/plugins/superset_disasm/build.sh +++ /dev/null @@ -1,5 +0,0 @@ -#pushd plugin/ -rm -rf _build -#-pkg superset_disassemblers -bapbuild -pkg findlib.dynload -pkg str -pkg zmq -pkg bap-primus -pkg bap-knowledge -pkg superset_disassemblers superset_disassembler.plugin -#popd diff --git a/plugins/superset_disasm/install.sh b/plugins/superset_disasm/install.sh deleted file mode 100755 index 3137aceb3..000000000 --- a/plugins/superset_disasm/install.sh +++ /dev/null @@ -1 +0,0 @@ -bapbundle install superset_disassembler.plugin diff --git a/plugins/superset_disasm/run.sh b/plugins/superset_disasm/run.sh deleted file mode 100755 index 5c808b014..000000000 --- a/plugins/superset_disasm/run.sh +++ /dev/null @@ -1 +0,0 @@ -bap --superset-disassembler-target $(which objdump) $(which objdump) --superset-disassembler-export=test From 08055f391158be9b0e63487abf8d8167d69a944a Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Mon, 12 May 2025 02:59:35 -0500 Subject: [PATCH 15/31] Remove metadata module and implement supersetd-graph-metrics command --- plugins/superset_disasm/metadata.ml | 105 ------------ .../superset_disasm/plot_superset_cache.ml | 45 ++++-- .../superset_disasm/superset_disassembler.ml | 153 +++++------------- 3 files changed, 76 insertions(+), 227 deletions(-) delete mode 100644 plugins/superset_disasm/metadata.ml diff --git a/plugins/superset_disasm/metadata.ml b/plugins/superset_disasm/metadata.ml deleted file mode 100644 index 5719a33c7..000000000 --- a/plugins/superset_disasm/metadata.ml +++ /dev/null @@ -1,105 +0,0 @@ -open Core -open Bap.Std -open Regular.Std -open Bap_knowledge -open Bap_core_theory - -let package = "superset-cache-guide" - -let knowledge_reader = Data.Read.create - ~of_bigstring:Knowledge.of_bigstring () - -let knowledge_writer = Data.Write.create - ~to_bigstring:Knowledge.to_bigstring () - -let knowledge_cache () = - Data.Cache.Service.request - knowledge_reader - knowledge_writer - -module Cache_metadata = struct - type t = string String.Map.t [@@deriving sexp, bin_io] - let equal = String.Map.equal String.equal - let set = String.Map.set - let empty = String.Map.empty - let fold = String.Map.fold - let length = String.Map.length -end - -let cache_metadata_t = - Knowledge.Domain.optional - ~inspect:Cache_metadata.sexp_of_t ~equal:Cache_metadata.equal - "cache_metadata_t" - -let cache_persistent = - Knowledge.Persistent.of_binable - (module struct - type t = Cache_metadata.t option [@@deriving bin_io] end) - -let digests = - let attr ty persistent name desc = - let open Theory.Program in - Knowledge.Class.property ~package cls name ty - ~persistent ~public:true ~desc in - attr cache_metadata_t cache_persistent "cache_metadata_t" - "Information about what other items can be looked up in the cache" - -let load_cache_with_digest cache digest = - match Data.Cache.load cache digest with - | None -> false - | Some state -> - Toplevel.set state; - true - -let store_knowledge_in_cache digest = - let digest = digest ~namespace:"knowledge" in - let cache = knowledge_cache () in - Toplevel.current () |> - Data.Cache.save cache digest - -let import_knowledge_from_cache digest = - let cache = knowledge_cache () in - load_cache_with_digest cache digest - -let load_knowledge digest p = - let digest = digest ~namespace:"knowledge" in - match p with - | None -> import_knowledge_from_cache digest - | Some path when not (Stdlib.Sys.file_exists path) -> - import_knowledge_from_cache digest - | Some path -> - Toplevel.set @@ Knowledge.load path; - true - -let make_digest inputs = - let inputs = String.concat inputs in - fun ~namespace -> - let d = Data.Cache.Digest.create ~namespace in - Data.Cache.Digest.add d "%s" inputs - -let guide = KB.Symbol.intern "cache_map" Theory.Program.cls -let metadata_digest = - (make_digest [ "superset-cache-metadata" ]) - -let save () = - let _ = Toplevel.eval digests guide in - store_knowledge_in_cache metadata_digest - -(* Retrieve the metadata of all digests *) -let with_digests f = - let state = Toplevel.current () in - let _ = load_knowledge metadata_digest None in - let ds = Toplevel.eval digests guide in - let r = f ds in - Toplevel.set state; - r - -let cache_corpus_metrics ds = - match ds with - | Some ds -> - Cache_metadata.fold ds ~init:[] ~f:(fun ~key ~data l -> - let digest = Data.Cache.Digest.of_string data in - if import_knowledge_from_cache digest then - Metrics.get_summary () :: l else l - ) - | None -> [] diff --git a/plugins/superset_disasm/plot_superset_cache.ml b/plugins/superset_disasm/plot_superset_cache.ml index 70f592e59..d1b931916 100644 --- a/plugins/superset_disasm/plot_superset_cache.ml +++ b/plugins/superset_disasm/plot_superset_cache.ml @@ -4,13 +4,6 @@ open Regular.Std open Bap_knowledge open Bap_core_theory open Monads.Std - -let () = match Bap_main.init () with - | Ok () -> () - | Error err -> - let open Bap_main in - Bap_main.Extension.Error.pp Format.std_formatter err; - exit 1 let transform_summaries summaries = let open Metrics in @@ -79,9 +72,41 @@ let make_plots summaries = make_plot "Size" "Time" "size_time.png" sizes time; () -let () = - let summaries = - Metadata.with_digests Metadata.cache_corpus_metrics in +let knowledge_reader = Data.Read.create + ~of_bigstring:Knowledge.of_bigstring () + +let knowledge_writer = Data.Write.create + ~to_bigstring:Knowledge.to_bigstring () + +let knowledge_cache () = + Data.Cache.Service.request + knowledge_reader + knowledge_writer + +let load_cache_with_digest cache digest = + match Data.Cache.load cache digest with + | None -> false + | Some state -> + Toplevel.set state; + true + +let import_knowledge_from_cache digest = + let cache = knowledge_cache () in + load_cache_with_digest cache digest + +let summaries_of_files fs = + List.fold fs ~init:[] ~f:(fun ls lf -> + let ds = Caml.Digest.file lf in + let digest = Data.Cache.Digest.of_string ds in + if import_knowledge_from_cache digest then + Metrics.get_summary () :: ls + else ( + print_endline @@ sprintf "%s not present in cache" lf; + ls + ) + ) + +let plot_summaries summaries = make_plots summaries; let sizes,occ,occ_space,fe,clean,fns,fps,tps,time = transform_summaries summaries in diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml index a2fd2690f..dd72157d2 100644 --- a/plugins/superset_disasm/superset_disassembler.ml +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -18,9 +18,9 @@ let man = {| byte offset within the executable region of a binary is initially treated as being potentially compiler intended output. However, after applying several rounds of heuristics the true positives, or - the actually intended instructions, can be distinguished from the + the compiler intended instructions, can be distinguished from the noise. It is an alternate disassembly method from linear sweep or - recursive descent, the two (probably most) populate mainstream + recursive descent, the two (probably most) popular mainstream disassembly methods. This approach exchanges the possibility of some small portion of the final output including some occlusive unintended sequences being incorrectly kept (a superset) for the @@ -28,14 +28,14 @@ let man = {| intended. Heuristics are broken into three main groups: invariants, analyses, - and features. Invariants are ideally lawful characteristics of + and heuristics. Invariants are ideally lawful characteristics of binary code, where disobedience is illegal for any well formed assembler, and run with a limited scope/visibility of just instructions. Analyses are typically processes that identify less visible violations of well-formed assembler rules or other lawful assembler characteristics that require global visibility. Heuristics are data traits that may be dirty and require some iterative - convergence to recognize the subset within the initial superset that + convergence to recognize a subset within the initial superset that can be guaranteeably cleansed. Once convergence occurs, the bodies of lineages with sufficient evidence are cleansed of occlusion, and any lineage that does not have enough features to support being kept @@ -148,7 +148,7 @@ let import_knowledge_from_cache digest = Data.Cache.Digest.pp digest; let cache = knowledge_cache () in load_cache_with_digest cache digest - + let store_knowledge_in_cache digest = let digest = digest ~namespace:"knowledge" in info "caching knowledge with digest %a" @@ -166,14 +166,6 @@ let load_knowledge digest = function Toplevel.set @@ Knowledge.load path; true -let save_knowledge ~had_knowledge ~update digest = function - | None -> - store_knowledge_in_cache digest - | Some path when update -> - info "storing knowledge base to %S" path; - Knowledge.save (Toplevel.current ()) path - | Some _ -> () - let outputs = Extension.Command.parameters ~doc:"Dumps the program to (defaults to stdout) \ @@ -289,29 +281,10 @@ let superset_digest options = let open Cmdoptions in compute_digest options.target options.disassembler -let save_metadata options = - let digest = superset_digest options ~namespace:"knowledge" in - Metadata.with_digests (fun metadata -> - let c = Option.value metadata - ~default:Metadata.Cache_metadata.empty in - KB.promise Metadata.digests (fun o -> - let d = Data.Cache.Digest.(to_string digest) in - KB.return @@ (Some - (Metadata.Cache_metadata.set c - ~key:options.target ~data:d)) - ); - Metadata.save () - ) - -let create_and_process - input outputs loader update kb options = - (*let () = save_metadata options in*) +let create_and_process kb options = let digest = superset_digest options in - let had_knowledge = load_knowledge digest kb in - let () = Toplevel.exec @@ - if not had_knowledge then - superset_disasm options - else KB.return () in + let _ = load_knowledge digest kb in + let () = Toplevel.exec @@ superset_disasm options in (match options.ground_truth_bin with | Some bin -> KB.promise Metrics.Cache.ground_truth_source @@ -321,7 +294,6 @@ let create_and_process let _ = Toplevel.eval ro Metrics.Cache.sym_label in let _ = Toplevel.eval Metrics.Cache.size Metrics.Cache.sym_label in store_knowledge_in_cache digest - (*save_knowledge ~had_knowledge ~update digest kb*) let rounds = let doc = "Number of analysis cycles" in @@ -372,7 +344,7 @@ let _superset_disassemble_command : unit = $ground_truth_bin $invariants $analyses $tp_threshold $heuristics $save_dot $rounds $converge $protect in - Extension.Command.declare ~doc:man "superset_disasm" + Extension.Command.declare ~doc:man "superset-disasm" ~requires:features_used args @@ fun input outputs loader update kb ground_truth_bin invariants analyses tp_threshold heuristics @@ -387,86 +359,43 @@ let _superset_disassemble_command : unit = validate_knowledge update kb >>= fun () -> validate_input input >>= fun () -> Dump_formats.parse outputs >>= fun outputs -> - Ok (create_and_process input outputs loader - update kb options) - -let destination = - Extension.Command.parameter Extension.Type.string "destination" + Ok (create_and_process kb options) -let cache_digest = - Extension.Command.parameter Extension.Type.string "cache_digest" +let inputs = + Extension.Command.argument + ~doc:"The input files" + Extension.Type.("FILES" %: list string =? ["a.out"] + ) -type cache_msg = { - digest : string; - state : bigstring; - } [@@deriving sexp] +exception Missing_file of string -exception Cache_not_present -let _send_cache : unit = +let _graph_metrics : unit = let args = let open Extension.Command in - args $input $outputs $loader $update $knowledge - $destination $cache_digest + args $inputs in - let man = - "Send a cache state to a designated address. Ex: tcp://host:port" - in - Extension.Command.declare ~doc:man "send_cache" + let cmdname = "supersetd-graph-metrics" in + let doc = sprintf + {| The %s command iterates over the list of input files to + disassemble and accesses the cache for each file to + collect important metrics. These metrics are then + graphed with gnuplot, and saved in the current directory. |} + cmdname in + Extension.Command.declare ~doc cmdname ~requires:features_used args @@ - fun input outputs loader update kb - destination cache_digest ctxt -> - let cache = knowledge_cache () in - let d = Data.Cache.Digest.of_string cache_digest in - let had = load_cache_with_digest cache d in + fun inputs ctxt -> + let is_missing x = + not (Stdlib.Sys.file_exists x) in + let missing = List.find inputs ~f:is_missing in let () = - if had then - let state = Toplevel.current () in - let msg = { - digest = cache_digest; - state = Knowledge.to_bigstring state; - } in - let msg = Sexp.to_string @@ sexp_of_cache_msg msg in - let zmq_ctxt = Zmq.Context.create () in - let socket = Zmq.Socket.create zmq_ctxt Zmq.Socket.push in - let () = Zmq.Socket.connect socket destination in - Zmq.Socket.send socket msg - else - raise Cache_not_present in - Ok() - -let bind_addr = - Extension.Command.parameter Extension.Type.string "bind_addr" - -let perpetuate = - Extension.Command.flag "perpetuate" - -let _recv_cache : unit = - let args = - let open Extension.Command in - args $outputs $loader $update $knowledge - $bind_addr $perpetuate - in - let man = - "Receive a cache state on the given address bound to. Ex:" ^ - "tcp://*:" in - Extension.Command.declare ~doc:man "recv_cache" - ~requires:features_used args @@ - fun outputs loader update kb - bind_addr perpetuate ctxt -> - let zmq_ctxt = Zmq.Context.create () in - let socket = Zmq.Socket.create zmq_ctxt Zmq.Socket.pull in - let () = Zmq.Socket.bind socket bind_addr in - let ran = ref false in - Ok (while perpetuate || (not !ran) do - let s = cache_msg_of_sexp @@ Sexp.of_string - @@ Zmq.Socket.recv socket in - let { digest; state; } = s in - let state = Knowledge.of_bigstring state in - let cache = knowledge_cache () in - let d = Data.Cache.Digest.of_string digest in - Data.Cache.save cache d state; - ran := true; - done) + match missing with + | None -> () + | Some f -> + print_endline @@ sprintf "%s is missing" f; + raise (Missing_file f); + in + let summaries = Plot_superset_cache.summaries_of_files inputs in + Ok (Plot_superset_cache.plot_summaries summaries) let metrics = let doc = @@ -478,7 +407,7 @@ let metrics = Extension.Command.parameter ~doc Extension.Type.(some (list string)) "metrics" -let _distribution_command : unit = +let _print_metrics_command : unit = let args = let open Extension.Command in args $input $outputs $loader $update $knowledge @@ -486,7 +415,7 @@ let _distribution_command : unit = $tp_threshold $heuristics $rounds $converge $metrics in let man = "Perform computational operations on the cache" in - Extension.Command.declare ~doc:man "superset_distribution" + Extension.Command.declare ~doc:man "supersetd-print-metrics" ~requires:features_used args @@ fun input outputs loader update kb ground_truth_bin invariants @@ -575,7 +504,7 @@ let _cache_command : unit = $show_cache_digest $reset_cache $is_present in let man = "Apply operations to the superset cache" in - Extension.Command.declare ~doc:man "superset_cache" + Extension.Command.declare ~doc:man "superset-cache" ~requires:features_used args @@ fun input outputs loader update kb show_cache_digest reset_cache verify_cache From b1129bd6e5c4de392627abf10309973afe85fb49 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Mon, 12 May 2025 03:31:59 -0500 Subject: [PATCH 16/31] Cmdoptions is stuck with dune insisting it requires a stanza for no impl --- lib/bap_superset_disasm/dune | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/bap_superset_disasm/dune b/lib/bap_superset_disasm/dune index f41d3bec3..24bf31989 100644 --- a/lib/bap_superset_disasm/dune +++ b/lib/bap_superset_disasm/dune @@ -14,5 +14,6 @@ zmq gnuplot ) +(modules_without_implementation cmdoptions) ) From 4b8a7325c8671f25e9a2c8bfa943df9d65868b2d Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Mon, 12 May 2025 05:30:35 -0500 Subject: [PATCH 17/31] Ensure that superset disasm and graph metrics commands work harmoniously --- .../superset_disasm/plot_superset_cache.ml | 22 ++++++++++++++----- .../superset_disasm/superset_disassembler.ml | 10 +++++---- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/plugins/superset_disasm/plot_superset_cache.ml b/plugins/superset_disasm/plot_superset_cache.ml index d1b931916..cc82f51ec 100644 --- a/plugins/superset_disasm/plot_superset_cache.ml +++ b/plugins/superset_disasm/plot_superset_cache.ml @@ -94,10 +94,21 @@ let import_knowledge_from_cache digest = let cache = knowledge_cache () in load_cache_with_digest cache digest -let summaries_of_files fs = +let make_digest inputs = + let inputs = String.concat inputs in + fun ~namespace -> + let d = Data.Cache.Digest.create ~namespace in + Data.Cache.Digest.add d "%s" inputs + +let compute_digest target disasm = + make_digest [ + Caml.Digest.file target; + disasm; + ] ~namespace:"knowledge" + +let summaries_of_files tgt fs = List.fold fs ~init:[] ~f:(fun ls lf -> - let ds = Caml.Digest.file lf in - let digest = Data.Cache.Digest.of_string ds in + let digest = compute_digest lf tgt in if import_knowledge_from_cache digest then Metrics.get_summary () :: ls else ( @@ -112,8 +123,9 @@ let plot_summaries summaries = transform_summaries summaries in let tot_fns = List.fold fns ~init:0 ~f:(+) in let tot_occ = List.fold occ ~init:0 ~f:(+) in - let tot_occ_space = List.fold fns ~init:0 ~f:(+) in + let tot_occ_space = List.fold occ_space ~init:0 ~f:(+) in let avg_occ = (float_of_int tot_occ) /. (float_of_int tot_occ_space) in - printf "fns: %d, avg occ: %f" tot_fns avg_occ + printf "fns: %d, total occ space %d, avg occ: %f\n" + tot_fns tot_occ_space avg_occ diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml index dd72157d2..cc4336cbb 100644 --- a/plugins/superset_disasm/superset_disassembler.ml +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -293,7 +293,9 @@ let create_and_process kb options = let ro = Metrics.Cache.reduced_occlusion in let _ = Toplevel.eval ro Metrics.Cache.sym_label in let _ = Toplevel.eval Metrics.Cache.size Metrics.Cache.sym_label in - store_knowledge_in_cache digest + let summary = Metrics.get_summary () in + print_endline @@ Sexp.to_string @@ Metrics.sexp_of_t summary; + store_knowledge_in_cache (superset_digest options) let rounds = let doc = "Number of analysis cycles" in @@ -372,7 +374,7 @@ exception Missing_file of string let _graph_metrics : unit = let args = let open Extension.Command in - args $inputs + args $inputs $loader in let cmdname = "supersetd-graph-metrics" in let doc = sprintf @@ -383,7 +385,7 @@ let _graph_metrics : unit = cmdname in Extension.Command.declare ~doc cmdname ~requires:features_used args @@ - fun inputs ctxt -> + fun inputs tgt ctxt -> let is_missing x = not (Stdlib.Sys.file_exists x) in let missing = List.find inputs ~f:is_missing in @@ -394,7 +396,7 @@ let _graph_metrics : unit = print_endline @@ sprintf "%s is missing" f; raise (Missing_file f); in - let summaries = Plot_superset_cache.summaries_of_files inputs in + let summaries = Plot_superset_cache.summaries_of_files tgt inputs in Ok (Plot_superset_cache.plot_summaries summaries) let metrics = From 6da854950dc99adf7d5c2d6166c83cccb1971464 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Mon, 12 May 2025 07:49:35 -0500 Subject: [PATCH 18/31] Change command names for consistency --- plugins/superset_disasm/superset_disassembler.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml index cc4336cbb..58ecd2bde 100644 --- a/plugins/superset_disasm/superset_disassembler.ml +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -324,14 +324,14 @@ let ground_truth_bin = " from debug symbols of an unstripped binary") in Extension.Command.parameter ~doc Extension.Type.("" %: (some string) =? None) - "ground_truth_bin" + "ground-truth-bin" let analyses = let deflt = ["Strongly Connected Component Data"] in Extension.Command.parameter Extension.Type.(list string =? deflt) "analyses" -let save_dot = Extension.Command.flag "save_dot" +let save_dot = Extension.Command.flag "save-dot" let converge = Extension.Command.flag "converge" From c3c69b2a55454720aeb0e29085333f95dc7fded6 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Mon, 12 May 2025 07:53:18 -0500 Subject: [PATCH 19/31] add new script for testing superset disasm using parallel --- lib_test/bap_superset_disasm/disasm_corpora.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 lib_test/bap_superset_disasm/disasm_corpora.sh diff --git a/lib_test/bap_superset_disasm/disasm_corpora.sh b/lib_test/bap_superset_disasm/disasm_corpora.sh new file mode 100755 index 000000000..dda87d539 --- /dev/null +++ b/lib_test/bap_superset_disasm/disasm_corpora.sh @@ -0,0 +1,5 @@ +find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -print | parallel "bap superset-disasm --ground_truth_bin={} {}" + +#$(find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -printa | tr "\n" ",") + +#bap supersetd-graph-metrics "/Volumes/arm-binaries/coreutils/coreutils_O0_cp", From f87faf53273eca281a030d4cefca73d7bbfd1245 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Mon, 12 May 2025 09:30:54 -0500 Subject: [PATCH 20/31] Add new integration test and script for parallel processing binaries --- lib_test/bap_superset_disasm/disasm_corpora.sh | 7 +++---- plugins/superset_disasm/superset_disassembler.ml | 6 ++++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lib_test/bap_superset_disasm/disasm_corpora.sh b/lib_test/bap_superset_disasm/disasm_corpora.sh index dda87d539..917176322 100755 --- a/lib_test/bap_superset_disasm/disasm_corpora.sh +++ b/lib_test/bap_superset_disasm/disasm_corpora.sh @@ -1,5 +1,4 @@ -find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -print | parallel "bap superset-disasm --ground_truth_bin={} {}" +time find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -print | parallel "bap superset-disasm --ground_truth_bin={} {}" -#$(find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -printa | tr "\n" ",") - -#bap supersetd-graph-metrics "/Volumes/arm-binaries/coreutils/coreutils_O0_cp", +find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -print > files.txt +bap supersetd-graph-metrics ./files.txt diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml index 58ecd2bde..6547d5bc4 100644 --- a/plugins/superset_disasm/superset_disassembler.ml +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -366,7 +366,7 @@ let _superset_disassemble_command : unit = let inputs = Extension.Command.argument ~doc:"The input files" - Extension.Type.("FILES" %: list string =? ["a.out"] + Extension.Type.("FILES" %: string =? "files.txt" ) exception Missing_file of string @@ -385,7 +385,9 @@ let _graph_metrics : unit = cmdname in Extension.Command.declare ~doc cmdname ~requires:features_used args @@ - fun inputs tgt ctxt -> + fun files_list tgt ctxt -> + let inputs = Stdlib.In_channel.with_open_text files_list + In_channel.input_lines in let is_missing x = not (Stdlib.Sys.file_exists x) in let missing = List.find inputs ~f:is_missing in From c9a8640ee3386de91cbad5ee0ff85f37320d6afe Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Tue, 13 May 2025 23:30:17 -0500 Subject: [PATCH 21/31] Add new find fn culprit flag --- lib/bap_superset_disasm/cmdoptions.mli | 0 lib/bap_superset_disasm/dune | 1 - lib/bap_superset_disasm/fixpoint.ml | 1 - lib/bap_superset_disasm/fixpoint.mli | 2 +- lib/bap_superset_disasm/heuristics.ml | 7 +- lib/bap_superset_disasm/heuristics.mli | 2 +- plugins/superset_disasm/cmdoptions.ml | 138 ++++++++++++------ .../superset_disasm/superset_disassembler.ml | 26 ++-- 8 files changed, 115 insertions(+), 62 deletions(-) delete mode 100644 lib/bap_superset_disasm/cmdoptions.mli diff --git a/lib/bap_superset_disasm/cmdoptions.mli b/lib/bap_superset_disasm/cmdoptions.mli deleted file mode 100644 index e69de29bb..000000000 diff --git a/lib/bap_superset_disasm/dune b/lib/bap_superset_disasm/dune index 24bf31989..f41d3bec3 100644 --- a/lib/bap_superset_disasm/dune +++ b/lib/bap_superset_disasm/dune @@ -14,6 +14,5 @@ zmq gnuplot ) -(modules_without_implementation cmdoptions) ) diff --git a/lib/bap_superset_disasm/fixpoint.ml b/lib/bap_superset_disasm/fixpoint.ml index 9d99e8867..2d33a46d7 100644 --- a/lib/bap_superset_disasm/fixpoint.ml +++ b/lib/bap_superset_disasm/fixpoint.ml @@ -6,7 +6,6 @@ let iterate rounds f superset = let rec do_analysis round superset = if round = rounds then superset else let (superset) = f superset in - let superset = Trim.run superset in do_analysis (round+1) superset in do_analysis 1 superset diff --git a/lib/bap_superset_disasm/fixpoint.mli b/lib/bap_superset_disasm/fixpoint.mli index a1abcca03..6113d21c1 100644 --- a/lib/bap_superset_disasm/fixpoint.mli +++ b/lib/bap_superset_disasm/fixpoint.mli @@ -1,6 +1,6 @@ open Bap.Std -val iterate : int -> (Superset_impl.t -> Superset_impl.t) -> Superset_impl.t -> Superset_impl.t +val iterate : int -> ('a -> 'a) -> 'a -> 'a val protect : Superset_impl.t -> (Superset_impl.t -> Superset_impl.t) -> Superset_impl.t val converge : Superset_impl.t -> 'a -> 'b Addr.Map.t -> Superset_impl.t diff --git a/lib/bap_superset_disasm/heuristics.ml b/lib/bap_superset_disasm/heuristics.ml index 079eb2ef7..9f6205c05 100644 --- a/lib/bap_superset_disasm/heuristics.ml +++ b/lib/bap_superset_disasm/heuristics.ml @@ -18,11 +18,12 @@ module HeurismSet(H : Heurism) = struct let package = "superset-heuristics" let addrs_t = Knowledge.Domain.optional - ~inspect:Addr.Set.sexp_of_t ~equal:Addr.Set.equal "addr.set" + ~inspect:Addr.Hash_set.sexp_of_t + ~equal:Addr.Hash_set.equal "addr.set" let addrs_persistent = Knowledge.Persistent.of_binable - (module struct type t = Addr.Set.t option [@@deriving bin_io] end) + (module struct type t = Addr.Hash_set.t option [@@deriving bin_io] end) let attr ty persistent desc = let open Theory.Program in @@ -80,7 +81,7 @@ let get_callsites ?(threshold=6) superset = callers *) let tag_callsites visited ?callsites superset = let callsites = Option.value callsites - ~default:(get_callsites ~threshold:6 superset) in + ~default:(get_callsites superset) in Hash_set.iter callsites ~f:(fun callsite -> Traverse.with_descendents_at ~visited ?post:None ?pre:None superset callsite; diff --git a/lib/bap_superset_disasm/heuristics.mli b/lib/bap_superset_disasm/heuristics.mli index 75ada0dbf..4d6863ddf 100644 --- a/lib/bap_superset_disasm/heuristics.mli +++ b/lib/bap_superset_disasm/heuristics.mli @@ -3,5 +3,5 @@ open Bap.Std val get_callsites : ?threshold:(int) -> Superset_impl.t -> Addr.Hash_set.t val tag_callsites : Addr.Hash_set.t -> ?callsites:Addr.Hash_set.t -> Superset_impl.t -> Superset_impl.t val with_featureset : f:(string -> (Superset_impl.t -> Superset_impl.t) -> 'a -> 'a) -> init:'a -> string list -> 'b -> 'a -val with_featurepmap : string list -> Superset_impl.t -> f:((int * word * string) list Addr.Map.t -> string list -> Superset_impl.t -> unit) -> unit +val with_featurepmap : string list -> Superset_impl.t -> f:((int * word * string) list Addr.Map.t -> string list -> Superset_impl.t -> 'a) -> 'a val defaults : string list diff --git a/plugins/superset_disasm/cmdoptions.ml b/plugins/superset_disasm/cmdoptions.ml index 9d847385d..3897aebfc 100644 --- a/plugins/superset_disasm/cmdoptions.ml +++ b/plugins/superset_disasm/cmdoptions.ml @@ -26,6 +26,7 @@ type t = { tp_threshold : float; rounds : int; heuristics : string list; + find_fn_culprit : bool; } [@@deriving sexp, fields, bin_io] type opts = t [@@deriving sexp, bin_io] @@ -63,34 +64,72 @@ module With_options(Conf : Provider) = struct let open KB.Syntax in KB.promise Cache.disasm_opts (fun o -> KB.return (Some options) - ) + ); + (match options.ground_truth_bin with + | Some bin -> + KB.promise Metrics.Cache.ground_truth_source + (fun _ -> KB.return bin); + | None -> ()) - let with_analyses superset analyses = - Trim.run @@ - List.fold analyses ~init:superset ~f:(fun superset analyze -> - analyze superset + let fn_addrs = Addr.Hash_set.create () + + let check_false_negs superset name sym_label = + let open KB.Syntax in + if options.find_fn_culprit then ( + KB.collect Metrics.Cache.ground_truth sym_label >>= + fun gt -> + match gt with + | None -> KB.return () + | Some gt -> ( + let missing = + Set.fold ~init:0 gt ~f:(fun tot addr -> + if not (Hash_set.mem fn_addrs addr) && + Superset.Core.mem superset addr then + tot + else ( + Hash_set.add fn_addrs addr; + tot+1 + ) + ) in + if missing <> 0 then ( + KB.return @@ + print_endline + @@ sprintf "%s triggered %d false negs" + name missing; + ) else KB.return () ) - + ) else KB.return () + let checkpoint ?addrs bin invariants = let backend = options.disassembler in + let invariants = List.map invariants ~f:snd in let invariants = Invariants.tag_success ::invariants in let f = Invariants.tag ~invariants in Superset.superset_disasm_of_file ?addrs ~backend bin ~f let args_to_funcs args funcs = - let l = List.filter_map args + List.filter_map args ~f:(fun arg -> List.find funcs ~f:(fun (name,f) -> String.equal arg name ) - ) in - List.map l ~f:snd + ) let invariants = args_to_funcs options.invariants Invariants.default_tags let analyses = args_to_funcs options.analyses list_analyses + let with_analyses superset analyses ~g = + let open KB.Syntax in + List.fold analyses ~init:superset + ~f:(fun superset (name,analyze) -> + superset >>= fun superset -> + let superset = analyze superset in + g superset name >>= fun () -> + KB.return superset + ) + let with_options () = let open KB.Syntax in Superset.Cache.sym_label >>= fun sym_label -> @@ -103,55 +142,72 @@ module With_options(Conf : Provider) = struct let () = Metrics.set_ground_truth superset in let trim = Trim.run in let superset = trim superset in - let superset = with_analyses superset analyses in - let superset = trim superset in + check_false_negs superset "Invariants " sym_label + >>= fun () -> + let g superset name = + let superset = trim superset in + check_false_negs superset name sym_label in + let superset = KB.return superset in + with_analyses superset analyses ~g >>= fun superset -> KB.promise Superset.Cache.superset_graph (fun _ -> KB.return @@ Some Superset.ISG.(to_list superset)); - superset + KB.return superset | Some graph -> let graph = Seq.of_list graph in let graph = Seq.concat @@ Seq.map graph ~f:(fun (s,d) -> Seq.of_list [s;d] ) in - checkpoint ~addrs:graph options.target [] in + let superset = checkpoint ~addrs:graph options.target [] in + check_false_negs superset + "stored in persistent graph with fns" + sym_label >>= fun () -> + KB.return @@ superset in + let f superset = Heuristics.with_featureset options.heuristics superset ~init:(superset) ~f:(fun fname feature superset -> - Trim.run @@ feature superset + superset >>= fun superset -> + KB.return @@ Trim.run @@ feature superset ) in - let superset = Fixpoint.iterate options.rounds f superset in + Fixpoint.iterate options.rounds f superset >>= fun superset -> let pnts_of_percent prcnt = Int.of_float (1.0/.(1.0-.prcnt)) in let threshold = (pnts_of_percent options.tp_threshold) in + let f pmap featureset superset = + let total_of_features l = + List.fold ~init:0 ~f:(fun x (y,_,_) -> x + y) l in + let feature_pmap = + Map.map pmap ~f:(total_of_features) in + let feature_pmap = + Map.filter feature_pmap ~f:(fun total -> + (total > threshold)) in + Report.collect_distributions superset threshold pmap; + let superset = + if options.converge then ( + let f superset = + let superset = + if options.protect then ( + Fixpoint.protect superset (fun superset -> + Fixpoint.converge superset options.heuristics + feature_pmap + ) + ) else + Fixpoint.converge superset options.heuristics + feature_pmap in + Trim.run superset in + check_false_negs superset + "feature convergence " + sym_label >>= fun () -> + KB.return @@ Fixpoint.iterate options.rounds f superset + ) else KB.return superset in + superset >>= fun superset -> + Metrics.compute_metrics superset; + KB.return superset + in Heuristics.with_featurepmap options.heuristics superset - ~f:(fun pmap featureset superset -> - let total_of_features l = - List.fold ~init:0 ~f:(fun x (y,_,_) -> x + y) l in - let feature_pmap = - Map.map pmap ~f:(total_of_features) in - let feature_pmap = - Map.filter feature_pmap ~f:(fun total -> - (total > threshold)) in - Report.collect_distributions superset threshold pmap; - let superset = - if options.converge then ( - let f superset = - let superset = - if options.protect then ( - Fixpoint.protect superset (fun superset -> - Fixpoint.converge superset options.heuristics feature_pmap - ) - ) else - Fixpoint.converge superset options.heuristics - feature_pmap in - Trim.run superset in - Fixpoint.iterate options.rounds f superset - ) else superset in - Metrics.compute_metrics superset; - ); - KB.return superset + ~f let main = with_options diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml index 6547d5bc4..e3462d9ea 100644 --- a/plugins/superset_disasm/superset_disassembler.ml +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -66,7 +66,7 @@ let man = {| which case the project will be dumped in several formats. ``` - bap superset_disasm /bin/echo -dasm:out.asm + bap superset-disasm /bin/echo -dasm:out.asm ``` |} @@ -285,11 +285,6 @@ let create_and_process kb options = let digest = superset_digest options in let _ = load_knowledge digest kb in let () = Toplevel.exec @@ superset_disasm options in - (match options.ground_truth_bin with - | Some bin -> - KB.promise Metrics.Cache.ground_truth_source - (fun _ -> KB.return bin); - | None -> ()); let ro = Metrics.Cache.reduced_occlusion in let _ = Toplevel.eval ro Metrics.Cache.sym_label in let _ = Toplevel.eval Metrics.Cache.size Metrics.Cache.sym_label in @@ -339,30 +334,33 @@ let converge = let protect = Extension.Command.flag "protect" +let find_fn_culprit = Extension.Command.flag "find-fn-culprit" + let _superset_disassemble_command : unit = let args = let open Extension.Command in args $input $outputs $loader $update $knowledge $ground_truth_bin $invariants $analyses $tp_threshold $heuristics - $save_dot $rounds $converge $protect + $save_dot $rounds $converge $protect $find_fn_culprit in Extension.Command.declare ~doc:man "superset-disasm" ~requires:features_used args @@ fun input outputs loader update kb ground_truth_bin invariants analyses tp_threshold heuristics - save_dot rounds converge protect ctxt -> + save_dot rounds converge protect find_fn_culprit ctxt -> let converge = not converge in let protect = not protect in let options = Fields.create ~disassembler:loader ~ground_truth_bin ~target:input ~save_dot ~tp_threshold ~rounds ~heuristics ~analyses - ~converge ~protect ~invariants in + ~converge ~protect ~invariants ~find_fn_culprit in validate_knowledge update kb >>= fun () -> validate_input input >>= fun () -> Dump_formats.parse outputs >>= fun outputs -> Ok (create_and_process kb options) + let inputs = Extension.Command.argument ~doc:"The input files" @@ -417,14 +415,14 @@ let _print_metrics_command : unit = args $input $outputs $loader $update $knowledge $ground_truth_bin $invariants $analyses $tp_threshold $heuristics $rounds - $converge $metrics in + $converge $metrics $find_fn_culprit in let man = "Perform computational operations on the cache" in Extension.Command.declare ~doc:man "supersetd-print-metrics" ~requires:features_used args @@ fun input outputs loader update kb ground_truth_bin invariants analyses tp_threshold heuristics rounds - converge metrics + converge metrics find_fn_culprit ctxt -> validate_knowledge update kb >>= fun () -> validate_input input >>= fun () -> @@ -433,7 +431,7 @@ let _print_metrics_command : unit = Fields.create ~disassembler:loader ~ground_truth_bin ~target:input ~save_dot:false ~tp_threshold ~rounds ~heuristics ~analyses ~converge:false ~protect:false - ~invariants in + ~invariants ~find_fn_culprit in let digest = superset_digest options in let _ = load_knowledge digest kb in let map_opt = @@ -491,7 +489,7 @@ let _print_metrics_command : unit = | None -> KB.return () ); Ok () - + let show_cache_digest = Extension.Command.flag "show_cache_digest" @@ -508,7 +506,7 @@ let _cache_command : unit = $show_cache_digest $reset_cache $is_present in let man = "Apply operations to the superset cache" in - Extension.Command.declare ~doc:man "superset-cache" + Extension.Command.declare ~doc:man "supersetd-cache" ~requires:features_used args @@ fun input outputs loader update kb show_cache_digest reset_cache verify_cache From 3a4e46b09959cc9b3cb82e118ab33ed4e7f35ddd Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Thu, 15 May 2025 21:20:20 -0500 Subject: [PATCH 22/31] Add new print-fn-binaries flag to supersetd-graph-metrics command --- plugins/superset_disasm/plot_superset_cache.ml | 8 ++++++-- plugins/superset_disasm/superset_disassembler.ml | 14 ++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/plugins/superset_disasm/plot_superset_cache.ml b/plugins/superset_disasm/plot_superset_cache.ml index cc82f51ec..828f2f04c 100644 --- a/plugins/superset_disasm/plot_superset_cache.ml +++ b/plugins/superset_disasm/plot_superset_cache.ml @@ -106,11 +106,15 @@ let compute_digest target disasm = disasm; ] ~namespace:"knowledge" -let summaries_of_files tgt fs = +let summaries_of_files print_fn_bins tgt fs = List.fold fs ~init:[] ~f:(fun ls lf -> let digest = compute_digest lf tgt in if import_knowledge_from_cache digest then - Metrics.get_summary () :: ls + let sum = Metrics.get_summary () in + let fns = Option.value sum.fns ~default:0 in + if print_fn_bins && fns > 0 then + print_endline lf; + sum :: ls else ( print_endline @@ sprintf "%s not present in cache" lf; ls diff --git a/plugins/superset_disasm/superset_disassembler.ml b/plugins/superset_disasm/superset_disassembler.ml index e3462d9ea..70cf5b447 100644 --- a/plugins/superset_disasm/superset_disassembler.ml +++ b/plugins/superset_disasm/superset_disassembler.ml @@ -289,7 +289,9 @@ let create_and_process kb options = let _ = Toplevel.eval ro Metrics.Cache.sym_label in let _ = Toplevel.eval Metrics.Cache.size Metrics.Cache.sym_label in let summary = Metrics.get_summary () in - print_endline @@ Sexp.to_string @@ Metrics.sexp_of_t summary; + print_endline @@ + sprintf "%s %s" options.target @@ + Sexp.to_string @@ Metrics.sexp_of_t summary; store_knowledge_in_cache (superset_digest options) let rounds = @@ -367,12 +369,15 @@ let inputs = Extension.Type.("FILES" %: string =? "files.txt" ) +let print_fn_bins = + Extension.Command.flag "print-fn-bins" + exception Missing_file of string let _graph_metrics : unit = let args = let open Extension.Command in - args $inputs $loader + args $inputs $print_fn_bins $loader in let cmdname = "supersetd-graph-metrics" in let doc = sprintf @@ -383,7 +388,7 @@ let _graph_metrics : unit = cmdname in Extension.Command.declare ~doc cmdname ~requires:features_used args @@ - fun files_list tgt ctxt -> + fun files_list print_fn_bins tgt ctxt -> let inputs = Stdlib.In_channel.with_open_text files_list In_channel.input_lines in let is_missing x = @@ -396,7 +401,8 @@ let _graph_metrics : unit = print_endline @@ sprintf "%s is missing" f; raise (Missing_file f); in - let summaries = Plot_superset_cache.summaries_of_files tgt inputs in + let summaries = Plot_superset_cache.summaries_of_files + print_fn_bins tgt inputs in Ok (Plot_superset_cache.plot_summaries summaries) let metrics = From a3c6a06065b448f6d394d44dc5c3e3168edbcbf1 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 16 May 2025 02:49:40 -0500 Subject: [PATCH 23/31] Fix TrimLimitedClamped --- lib/bap_superset_disasm/heuristics.ml | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/lib/bap_superset_disasm/heuristics.ml b/lib/bap_superset_disasm/heuristics.ml index 9f6205c05..f33716f6b 100644 --- a/lib/bap_superset_disasm/heuristics.ml +++ b/lib/bap_superset_disasm/heuristics.ml @@ -84,13 +84,13 @@ let tag_callsites visited ?callsites superset = ~default:(get_callsites superset) in Hash_set.iter callsites ~f:(fun callsite -> Traverse.with_descendents_at ~visited + (* TODO ~pre should mark insn bodies as data *) ?post:None ?pre:None superset callsite; ); superset let find_free_insns superset = let mem = Superset.Core.mem superset in - let all_conflicts = Addr.Hash_set.create () in let to_clamp = Superset.Core.fold superset ~init:([]) ~f:(fun ~key ~data to_clamp -> @@ -99,14 +99,15 @@ let find_free_insns superset = let conflicts = Superset.Occlusion.range_seq_of_conflicts ~mem addr len in let no_conflicts = Seq.is_empty conflicts in - Seq.iter conflicts ~f:(fun c -> Hash_set.add all_conflicts c); - if no_conflicts && not Hash_set.(mem all_conflicts addr) then + if no_conflicts then addr :: to_clamp else ( to_clamp ) ) in - to_clamp + let to_clamp = Addr.Set.of_list to_clamp in + Set.diff to_clamp @@ + Superset.Occlusion.find_all_conflicts superset let restricted_clamp superset = let entries = Superset.entries_of_isg superset in @@ -127,7 +128,7 @@ let restricted_clamp superset = let extended_clamp superset = let to_clamp = find_free_insns superset in - List.fold to_clamp ~init:Addr.Set.empty ~f:(fun to_clamp clamp -> + Set.fold to_clamp ~init:Addr.Set.empty ~f:(fun to_clamp clamp -> let _, to_clamp = Superset.ISG.dfs_fold superset ~pre:(fun (struck,to_clamp) addr -> @@ -219,7 +220,7 @@ let extract_trim_clamped superset = let to_clamp = find_free_insns superset in let visited = Addr.Hash_set.create () in let datas = Addr.Hash_set.create () in - List.iter to_clamp ~f:(fun c -> + Set.iter to_clamp ~f:(fun c -> if not Hash_set.(mem visited c) then if Superset.Core.mem superset c then ( Traverse.mark_descendent_bodies_at @@ -227,16 +228,13 @@ let extract_trim_clamped superset = ) ); Superset.Core.clear_each superset visited; - List.iter to_clamp ~f:(Superset.Core.clear_bad superset); + Set.iter to_clamp ~f:(Superset.Core.clear_bad superset); superset let extract_trim_limited_clamped superset = let protection = Addr.Hash_set.create () in - let superset = - if Hash_set.length protection = 0 then ( - let callsites = get_callsites ~threshold:0 superset in - tag_callsites protection ~callsites superset - ) else superset in + let callsites = get_callsites ~threshold:0 superset in + let superset = tag_callsites protection ~callsites superset in Superset.Core.clear_all_bad superset; let superset = extract_trim_clamped superset in Superset.Core.clear_each superset protection; superset @@ -407,7 +405,7 @@ let _exfiltset = [ ((fun x -> transform (get_callsites ~threshold:6 x)), unfiltered)); ("Clamped", - ((fun s -> Addr.Set.of_list @@ find_free_insns s), unfiltered)); + ((fun s -> find_free_insns s), unfiltered)); ("RestrictedClamped", (restricted_clamp, unfiltered)); ("ExtendedClamped", (extended_clamp, unfiltered)); ("UnfilteredSCC", (extract_loops_to_set,unfiltered)); From b834949bead7a79d64b8560ff3c37c1a0eeba463 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 16 May 2025 03:10:46 -0500 Subject: [PATCH 24/31] Disable troublesome invariants --- lib/bap_superset_disasm/invariants.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/bap_superset_disasm/invariants.ml b/lib/bap_superset_disasm/invariants.ml index 4f9a91db1..8b893ff20 100644 --- a/lib/bap_superset_disasm/invariants.ml +++ b/lib/bap_superset_disasm/invariants.ml @@ -72,8 +72,8 @@ let tag_success superset mem insn targets = Superset.ISG.link superset target src | None -> superset) -let default_tags = ["Tag non insn", tag_non_insn; - "Tag target not in mem", tag_target_not_in_mem; +let default_tags = [(*"Tag non insn", tag_non_insn;*) + (*"Tag target not in mem", tag_target_not_in_mem;*) "Tag target is bad", tag_target_is_bad; "Tag target in body", tag_target_in_body; (*tag_success;*)] From 661b56041f23ef70d65950d255c4582b5a3f7552 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 16 May 2025 03:13:45 -0500 Subject: [PATCH 25/31] Enable TrimLimitedClamped in disasm_corpora.sh --- lib_test/bap_superset_disasm/disasm_corpora.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib_test/bap_superset_disasm/disasm_corpora.sh b/lib_test/bap_superset_disasm/disasm_corpora.sh index 917176322..77e66e848 100755 --- a/lib_test/bap_superset_disasm/disasm_corpora.sh +++ b/lib_test/bap_superset_disasm/disasm_corpora.sh @@ -1,4 +1,5 @@ -time find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -print | parallel "bap superset-disasm --ground_truth_bin={} {}" - find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -print > files.txt -bap supersetd-graph-metrics ./files.txt + +time cat files.txt | parallel "bap superset-disasm --find-fn-culprit --heuristics=Callsites3,FixedpointGrammar,ImgEntry,TrimLimitedClamped --ground-truth-bin={} {}" + +bap supersetd-graph-metrics ./files.txt --print-fn-bins From acf71e34d6b0958044f9ba0663ce5485cc9d16e4 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 16 May 2025 03:41:17 -0500 Subject: [PATCH 26/31] Correct bap superset disasm test library name --- lib_test/bap_superset_disasm/dune | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib_test/bap_superset_disasm/dune b/lib_test/bap_superset_disasm/dune index fd7a7faa6..402550de0 100644 --- a/lib_test/bap_superset_disasm/dune +++ b/lib_test/bap_superset_disasm/dune @@ -1,5 +1,5 @@ (library - (name test_disasm) + (name test_superset_disasm) (preprocess (pps ppx_bap)) (wrapped false) (libraries bap core_kernel ounit2 str)) From c3873cdfee77f5175a24126702afb264790302a2 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 16 May 2025 14:15:08 -0500 Subject: [PATCH 27/31] Trim heuristics severely, drop liveness, features, abstract_ssa --- lib/bap_superset_disasm/abstract_ssa.ml | 153 ---- lib/bap_superset_disasm/abstract_ssa.mli | 9 - .../bap_superset_disasm.ml | 3 - lib/bap_superset_disasm/features.ml | 765 ------------------ lib/bap_superset_disasm/features.mli | 0 lib/bap_superset_disasm/heuristics.ml | 268 +----- lib/bap_superset_disasm/liveness.ml | 56 -- lib/bap_superset_disasm/liveness.mli | 3 - 8 files changed, 13 insertions(+), 1244 deletions(-) delete mode 100644 lib/bap_superset_disasm/abstract_ssa.ml delete mode 100644 lib/bap_superset_disasm/abstract_ssa.mli delete mode 100644 lib/bap_superset_disasm/features.ml delete mode 100644 lib/bap_superset_disasm/features.mli delete mode 100644 lib/bap_superset_disasm/liveness.ml delete mode 100644 lib/bap_superset_disasm/liveness.mli diff --git a/lib/bap_superset_disasm/abstract_ssa.ml b/lib/bap_superset_disasm/abstract_ssa.ml deleted file mode 100644 index 17921c981..000000000 --- a/lib/bap_superset_disasm/abstract_ssa.ml +++ /dev/null @@ -1,153 +0,0 @@ -open Bap.Std -open Core - -let stmt_def_vars = - object(self) - inherit [Exp.Set.t] Stmt.visitor - method enter_move def use accu = - if not Var.(is_virtual def) then - Set.add accu Exp.(Bil.Var def) - else accu - end - -let stmt_use_vars = - object(self) - inherit [Exp.Set.t] Stmt.visitor - method enter_move def use accu = - Set.add accu use - end - - -let stmt_def_freevars = - object(self) - inherit [Var.Set.t] Stmt.visitor - method enter_move def use accu = - if not Var.(is_virtual def) then - Set.add accu def - else accu - end - -let stmt_use_freevars = - object(self) - inherit [Var.Set.t] Stmt.visitor - method enter_move def use accu = - let free_vars = - Set.filter ~f:(fun v -> not Var.(is_virtual v)) (Exp.free_vars use) - in Set.union accu free_vars - end - - -let def_ssa bil = - stmt_def_vars#run bil Exp.Set.empty - -let use_ssa bil = - stmt_use_vars#run bil Exp.Set.empty - -let def_freevars bil = - stmt_def_freevars#run bil Var.Set.empty - -let use_freevars bil = - stmt_use_freevars#run bil Var.Set.empty - - -(* Abstract SSA *) -(*type t = - { - (* pointer expression *) - (* bil *) - (* ssa name *) - }*) - -(* currentDef is a mapping from each variable to it's defining *) -(* expression. When recording into this mapping, the right hand *) -(* expression of the IR is stored. *) - -(* filled - when local numbering for a block has finished. *) -(* sealed - when no further predecessors will be added to the *) -(* block. *) - -let absexp_of_bil bil = - object(self) - inherit [exp option] Stmt.finder - method! enter_jmp exp r = r - end -(* -let writeVariable variable block value = - currentDef[variable][block] <- value - -let readVariable variable block = - if Set.mem currentDef[variable] block then - currentDef[variable][block] - else readVariableRecursive varaible block - -let tryRemoveTrivialPhi phi = - let same = - List.fold phi.operands ~init:None ~f:(fun op curr -> - if op = curr or op = phi then - curr (* Unique value or self reference *) - else - Some(op) - ) in - if Option.is_some same then - (* The phi merges at least two values: not trivial *) - phi - else - (* If same is none (unreachable or in the start block, *) - (* then create an undefined value *) - let same = Option.value same ~default:(Operand.create ()) in - (* Remember all users except the phi itself *) - let users = phi.users.remove phi in - (* Reroute all uses of phi to same and remove phi *) - phi.replaceBy same; - (* Try to recursively remove all phi users, which might *) - (* have become trivial *) - Set.iter users ~f:(fun use -> - match use with - | Phi(use) -> - tryRemoveTrivialPhi use - | _ -> ()); - same - -let addPhiOperands variable phi = - (* If there is more than one, collect the definitions from all *) - (* the predecessors and construct a phi function joining them *) - (* into a single value. *) - List.iter phi.block.preds ~f:(fun pred -> - phi.appendOperand (readVariable variable pred)); - tryRemoveTrivialPhi phi - -(* if a block currently contains no definition for a variable, we *) -(* recursively look for a definition in its predecessors. *) -let rec readVariableRecursive variable block = - if not Set.(mem sealedBlocks block) then - let value = Phi.create block in - incompletePhis[block][variable] <- value; - writeVariable variable block value; - value - else if List.(length block.preds) = 1 then - (* If the block has a single predecessor (edge leading into it) *) - (* then recursively query it for a definition. *) - let value = readVariable(variable, List.(hd block.preds)) in - writeVariable variable block value; - value - else - (* Determine operands from predecessors *) - let value = Phi.create block in - writeVaraible variable block value; - let value = addPhiOperands variable value in - writeVariable variable block value; - value - - -let readVariable variable block = - if Map.mem currentDef[variable] block then - currentDef[variable][block] - else - readVariableRecursive variable block - -let sealBlock block = - Set.iter incompletePhis[block] ~f:(fun variable -> - addPhiOperands variable incompletePhis[block][variable] - ); - Set.add sealedBlocks block -*) diff --git a/lib/bap_superset_disasm/abstract_ssa.mli b/lib/bap_superset_disasm/abstract_ssa.mli deleted file mode 100644 index 1948a31f3..000000000 --- a/lib/bap_superset_disasm/abstract_ssa.mli +++ /dev/null @@ -1,9 +0,0 @@ -open Bap.Std - -val use_ssa : bil -> Exp.Set.t - -val def_ssa : bil -> Exp.Set.t - -val use_freevars : bil -> Var.Set.t - -val def_freevars : bil -> Var.Set.t diff --git a/lib/bap_superset_disasm/bap_superset_disasm.ml b/lib/bap_superset_disasm/bap_superset_disasm.ml index ec32ef2ca..9bb84ceb6 100644 --- a/lib/bap_superset_disasm/bap_superset_disasm.ml +++ b/lib/bap_superset_disasm/bap_superset_disasm.ml @@ -1,13 +1,10 @@ module Superset_impl = Superset_impl module Superset = Superset -module Abstract_ssa = Abstract_ssa module Decision_trees = Decision_trees -module Features = Features module Fixpoint = Fixpoint module Grammar = Grammar module Heuristics = Heuristics module Invariants = Invariants -module Liveness = Liveness module Metrics = Metrics module Report = Report module Traverse = Traverse diff --git a/lib/bap_superset_disasm/features.ml b/lib/bap_superset_disasm/features.ml deleted file mode 100644 index 563eed1c4..000000000 --- a/lib/bap_superset_disasm/features.ml +++ /dev/null @@ -1,765 +0,0 @@ -open Core -open Bap.Std - -module Dis = Disasm_expert - -let default_features = [ - "ImgEntry"; - (*"NoExit";*) - (*"LoopsWithBreak";*) - "BranchViolations"; - (*"LayerViolations";*) - "TrimLimitedClamped"; - "Callsites3"; - (*"TrimFixpointGrammar"; - "TrimFixpointTails";*) - (*"Clamped"; - "SCC"; - "LoopGrammar"; - "CallsiteLineage"; - "SSA";*) - (*"FreeVarSSA";*) - (*"Grammar";*) - (*"Constant";*) -] -let default_features = List.rev default_features - -let transform = Hash_set.fold ~init:Addr.Set.empty ~f:Set.add - -let clear_each superset visited = - Hash_set.iter visited ~f:(fun tp -> - Superset.Core.clear_bad superset tp - ) - -let get_non_fall_through_edges superset = - Superset.ISG.fold_edges superset - (fun child parent jmps -> - if Superset.is_fall_through superset parent child then - Map.set jmps ~key:child ~data:parent - else jmps - ) Addr.Map.empty - - -(** A callsite is a location which is shared as the target of a call - by several other locations in the binary. *) -let get_callsites ?(threshold=6) superset = - let callsites = Addr.Hash_set.create () in - Superset.ISG.iter_vertex superset - (fun v -> - let callers = Superset.ISG.ancestors superset v in - let num_callers = - List.fold callers ~init:0 ~f:(fun total caller -> - if not (Superset.is_fall_through superset caller v) then - total + 1 - else total) in - if num_callers > threshold then ( - Hash_set.add callsites v; - ) - ) ; - callsites - -(** Adds to the set visited the set of reachable descendents of a - callsite of a given sufficient threshold number of external callers *) -let tag_callsites visited ?callsites superset = - let callsites = Option.value callsites - ~default:(get_callsites ~threshold:6 superset) in - Hash_set.iter callsites ~f:(fun callsite -> - Traverse.with_descendents_at ~visited - ?post:None ?pre:None superset callsite; - ); - superset - -let find_free_insns superset = - let mem = Superset.Core.mem superset in - let all_conflicts = Addr.Hash_set.create () in - let to_clamp = - Superset.Core.fold superset ~init:(Addr.Set.empty) - ~f:(fun ~key ~data to_clamp -> - let (addr,(memory,_)) = key, data in - let len = Memory.length memory in - let conflicts = Superset.Occlusion.range_seq_of_conflicts - ~mem addr len in - let no_conflicts = Seq.is_empty conflicts in - Seq.iter conflicts ~f:(fun c -> - Hash_set.add all_conflicts c); - if no_conflicts && not Hash_set.(mem all_conflicts addr) then - Set.add to_clamp addr - else ( - to_clamp - ) - ) in - to_clamp -(*Hash_set.fold all_conflicts ~init:to_clamp ~f:Set.remove*) - -let restricted_clamp superset = - let entries = Superset.entries_of_isg superset in - let conflicts = Superset.Occlusion.find_all_conflicts superset in - let to_clamp = ref Addr.Set.empty in - Hash_set.iter entries ~f:(fun entry -> - let b = ref false in - let pre v = - if Addr.(v = entry) then - b := false - else if not (!b) then - if Set.mem conflicts v then - b := true - else to_clamp := Set.add (!to_clamp) v - in Traverse.with_ancestors_at ~post:(fun _ -> ()) ~pre superset entry; - ); - !to_clamp - -let extended_clamp superset = - let to_clamp = find_free_insns superset in - Set.fold to_clamp ~init:Addr.Set.empty ~f:(fun to_clamp clamp -> - let _, to_clamp = - Superset.ISG.dfs_fold superset - ~pre:(fun (struck,to_clamp) addr -> - if struck then (struck,to_clamp) else - let conflicts = - Superset.Occlusion.conflicts_within_insn_at - superset addr in - let no_conflicts = Set.length conflicts = 0 in - (*let conflicts = Superset.Occlusion.parent_conflict_at - insn_risg insn_map addr in - let no_conflicts = Set.length conflicts = 0 - && no_conflicts in*) - if no_conflicts then (struck, Set.(add to_clamp addr)) - else (true, to_clamp) - ) ~post:(fun x _ -> x) (false, to_clamp) clamp - in to_clamp - ) - -let extract_loop_addrs superset = - let loop_addrs = Superset.ISG.raw_loops superset in - List.fold_left ~init:Addr.Map.empty loop_addrs - ~f:(fun addrs loop -> - if List.length loop >= 2 then - Option.value ~default:addrs - Option.(map List.(hd loop) ~f:(fun addr -> - Map.set addrs ~key:addr ~data:loop)) - else addrs - ) - -let extract_filtered_loop_addrs superset = - let loop_addrs = extract_loop_addrs superset in - Map.filteri loop_addrs ~f:(fun ~key ~data -> - List.length data > 20) - -let extract_constants superset = - let imgmem = - Memmap.to_sequence @@ Superset.Inspection.get_memmap superset in - let addrs = - Seq.bind imgmem - ~f:(fun (segment,_) -> - let words_of_mem mem = - let rec yield_next addr = - let width = - Addr.bitwidth addr in - let s = Size.of_int_exn width in - let open Seq.Generator in - match Memory.view ~word_size:s ~from:addr mem with - | Ok next -> - yield next >>= fun () -> yield_next (Addr.succ addr) - | _ -> return () in - Sequence.Generator.run (yield_next Memory.(min_addr mem)) - in words_of_mem segment - ) in - Seq.fold ~init:Addr.Map.empty addrs - ~f:(fun constants m -> - let constant = Memory.(m ^ (min_addr m)) in - match constant with - | Ok constant -> - if Superset.Inspection.contains_addr superset constant - && Superset.Core.(mem superset constant) then - Map.set constants ~key:Memory.(min_addr m) ~data:constant - else constants - | _ -> constants - ) - -let stddev_of hs average pmap = - let deviation,deg_free = - Hash_set.fold ~init:(0.0,0) hs ~f:(fun (deviation,deg_free) addr -> - if Map.mem pmap addr then - let d = (Option.(value_exn Map.(find pmap addr)) -. average) in - let d = d *. d in - (deviation +. d, (deg_free+1)) - else (deviation, (deg_free)) - ) in - sqrt(deviation /. float_of_int (deg_free -1)) - -(* pre is called from descendant to ancestor order, so we want to - check for usage and put that into a map, and then for define on - post visitation, when coming back down from ancestors back to - descendants (as execution would move). *) -let pre_ssa superset lift factors var_use addr = - match Superset.Core.lookup superset addr with - | Some (mem, insn) -> ( - try - let bil = lift (mem, insn) in - Option.value_map ~default:() bil ~f:(fun (bil) -> - let use_vars = Abstract_ssa.use_ssa bil in - Set.iter use_vars ~f:(fun use_var -> - var_use := Map.set !var_use ~key:use_var ~data:addr - ) - ) - with _ -> () - ) - | None -> () - -let pre_freevarssa superset lift factors var_use addr = - match Superset.Core.lookup superset addr with - | Some (mem, insn) -> ( - try - let bil = lift (mem, insn) in - Option.value_map ~default:() bil ~f:(fun (bil) -> - let use_vars = Abstract_ssa.use_freevars bil in - Set.iter use_vars ~f:(fun use_var -> - var_use := Map.set !var_use ~key:use_var ~data:addr - ) - ) - with _ -> () - ) - | None -> () - -let post_ssa_with superset lift var_use addr f = - match Superset.Core.lookup superset addr with - | Some (mem, insn) -> ( - try - let bil = lift (mem, insn) in - Option.value_map ~default:() bil ~f:(fun (bil) -> - let use_vars = Abstract_ssa.use_ssa bil in - Set.iter use_vars ~f:(fun use_var -> - var_use := Map.remove !var_use use_var; - ); - let var_defs = Abstract_ssa.def_ssa bil in - Set.iter var_defs ~f:(fun var_def -> - match Map.find !var_use var_def with - | Some(waddr) -> - if not Addr.(waddr = addr) then ( - f waddr addr - ) - | None -> () - ); - Set.iter var_defs ~f:(fun write_reg -> - var_use := Map.remove !var_use write_reg - ) - ) - with _ -> () - ) - | None -> () - -let post_freevarssa_with superset lift var_use addr f = - match Superset.Core.lookup superset addr with - | Some (mem, insn) -> ( - try - let bil = lift (mem, insn) in - Option.value_map ~default:() bil ~f:(fun (bil) -> - let use_vars = Abstract_ssa.use_freevars bil in - let var_defs = Abstract_ssa.def_freevars bil in - Set.iter var_defs ~f:(fun var_def -> - match Map.find !var_use var_def with - | Some(waddr) -> - if not Set.(mem use_vars var_def) then ( - f waddr addr - ) - | None -> () - ); - Set.iter use_vars ~f:(fun use_var -> - var_use := Map.remove !var_use use_var; - ); - Set.iter var_defs ~f:(fun write_reg -> - var_use := Map.remove !var_use write_reg - ) - ) - with _ -> () - ) - | None -> () - -let extract_ssa_to_map superset = - let var_use = ref Exp.Map.empty in - let defuse_map = ref Addr.Map.empty in - let add_to_map def use = - defuse_map := Map.set !defuse_map ~key:def ~data:use in - let lift (mem, insn) = - Superset.Core.lift_insn superset ( (mem, insn)) in - let pre = pre_ssa superset lift () var_use in - let post addr = post_ssa_with superset lift var_use - addr add_to_map in - let entries = Superset.entries_of_isg superset in - Hash_set.iter entries ~f:(fun addr -> - Traverse.with_ancestors_at superset addr ~post ~pre; - var_use := Exp.Map.empty - ); - !defuse_map - -let extract_freevarssa_to_map superset = - let var_use = ref Var.Map.empty in - let defuse_map = ref Addr.Map.empty in - let add_to_map def use = - defuse_map := Map.set !defuse_map ~key:def ~data:use in - let lift (mem, insn) = - Superset.Core.lift_insn superset ((mem, insn)) in - let pre = pre_freevarssa superset lift () var_use in - let post addr = post_freevarssa_with superset lift var_use - addr add_to_map in - let entries = Superset.entries_of_isg superset in - Hash_set.iter entries ~f:(fun addr -> - Traverse.with_ancestors_at superset addr ~post ~pre; - var_use := Var.Map.empty - ); - !defuse_map - -let extract_cross_section_jmps superset = - let segments = Superset.Inspection.get_memmap superset in - let cross_section_edges = Superset.ISG.fold_edges superset - (fun src dst csedges -> - let collect_minaddrs addr = - let seg = Memmap.lookup segments addr in - Seq.fold seg ~init:Addr.Set.empty ~f:(fun s1 (mem,_) -> - Set.add s1 @@ Memory.min_addr mem - ) in - let s1 = collect_minaddrs src in - let s2 = collect_minaddrs dst in - if not (Set.(length @@ inter s1 s2) >= 1) then - let ft1 = Superset.is_fall_through superset src dst in - let ft2 = Superset.is_fall_through superset dst src in - if (ft1 || ft2) then ( - (*Superset_risg.G.remove_edge insn_risg src dst;*) - Map.set csedges ~key:src ~data:dst - ) else csedges - else csedges - ) Addr.Map.empty in - cross_section_edges - -let extract_trim_clamped superset = - let to_clamp = find_free_insns superset in - let visited = Addr.Hash_set.create () in - let datas = Addr.Hash_set.create () in - Set.iter to_clamp ~f:(fun c -> - if not Hash_set.(mem visited c) then - if Superset.Core.mem superset c then ( - Traverse.mark_descendent_bodies_at - ~visited ~datas superset c - ) - ); - Hash_set.iter datas ~f:(fun d -> - if Hash_set.(mem visited d) || Set.(mem to_clamp d) then - Superset.Core.clear_bad superset d - ); - clear_each superset visited; - superset - -let time ?(name="") f x = - let t = Stdlib.Sys.time() in - let fx = f x in - let s = sprintf "%s execution time: %fs\n" name (Stdlib.Sys.time() -. t) in - print_endline s; - fx - - - -let extract_trim_limited_clamped superset = - let visited = Addr.Hash_set.create () in - let callsites = get_callsites ~threshold:0 superset in - let f s = tag_callsites visited ~callsites s in - let superset = time ~name:"tagging callsites: " f superset in - let () = Superset.Core.clear_all_bad superset in - let superset = time ~name:"extract_trim_clamped " - extract_trim_clamped superset in - clear_each superset visited; - superset - -let fixpoint_descendants superset extractf depth = - let rec fix_descendants cur_features d = - if d >= depth then - cur_features - else - let visited = Addr.Hash_set.create () in - let subset_features = Addr.Hash_set.create () in - Hash_set.iter cur_features ~f:(fun cur -> - if not Hash_set.(mem visited cur) then - Traverse.with_descendents_at superset - ~pre:(fun v -> - if Hash_set.(mem cur_features v) - && not Addr.(cur = v) then - Hash_set.add subset_features v - ) ~visited cur - else Hash_set.add subset_features cur - ); - fix_descendants subset_features (d+1) - in - let cur_features = extractf superset in - fix_descendants cur_features 0 - -let fixpoint_map superset feature_pmap = - let visited = Addr.Hash_set.create () in - let entries = Superset.entries_of_isg superset in - Hash_set.fold ~init:feature_pmap entries ~f:(fun feature_pmap cur -> - if not Hash_set.(mem visited cur) then - let prev = ref [] in - let feature_pmap = ref feature_pmap in - Traverse.with_descendents_at ~pre:(fun v -> - match Map.find !feature_pmap v with - | None -> () - | Some(p) -> - prev := List.append p !prev; - feature_pmap := Map.set !feature_pmap ~key:v ~data:!prev; - ) ~visited superset cur; - !feature_pmap - else feature_pmap - ) - -let fixpoint_grammar superset depth = - let extractf superset = - Superset.get_branches superset in - fixpoint_descendants superset extractf depth - -let fixpoint_ssa superset depth = - let extractf superset = - let ssa_map = extract_ssa_to_map superset in - let ssa = Addr.Hash_set.create () in - List.iter Map.(data ssa_map) ~f:Hash_set.(add ssa); - ssa in - fixpoint_descendants superset extractf depth - -let fixpoint_freevarssa superset depth = - let extractf superset = - let freevars_map = extract_freevarssa_to_map superset in - let freevars = Addr.Hash_set.create () in - List.iter Map.(data freevars_map) ~f:Hash_set.(add freevars); - freevars in - fixpoint_descendants superset extractf depth - -let fixpoint_tails superset = - let extractf superset = - let conflicts = Superset.Occlusion.find_all_conflicts superset in - let tails_map = - Decision_trees.tails_of_conflicts superset conflicts in - let tails = Addr.Hash_set.create () in - List.iter Map.(keys tails_map) ~f:Hash_set.(add tails); - tails - in - fixpoint_descendants superset extractf 4 - -let allfeatures = - "RestrictedClamped" :: - "ExtendedClamped" :: - "ClassicGrammar" :: - "LinearGrammar" :: - "UnfilteredGrammar" :: - "FalseBranchMap" :: - "FilteredFalseBranchMap" :: - "UnfilteredSCC" :: - "FreeVarSSA" :: - "FixpointGrammar" :: - "FixpointSSA" :: - "FixpointFreevarSSA" :: - "FixpointTails" :: - default_features - -let get_branches superset = - let branches = Superset.get_branches superset in - transform branches - -let branch_map_of_branches superset branches = - let name = Superset.Inspection.filename superset in - let name = Option.value_exn name in - let true_positives = Metrics.true_positives superset name in - let branches = - Hash_set.fold true_positives ~init:branches ~f:Set.remove in - Set.fold branches ~init:Addr.Map.empty ~f:(fun fpbranchmap fpbranch -> - let target = - List.find_exn Superset.ISG.(descendants superset fpbranch) - ~f:Superset.(is_fall_through superset fpbranch) in - Map.set fpbranchmap ~key:fpbranch ~data:target - ) -let extract_fp_branches superset = - let branches = get_branches superset in - branch_map_of_branches superset branches -let extract_fp_branches superset = - let branches = get_branches superset in - branch_map_of_branches superset branches -let linear_grammar superset = - let entries = Superset.entries_of_isg superset in - transform Grammar.(linear_branch_sweep superset entries) -let classic_grammar superset = - transform Grammar.(identify_branches superset) -let extract_loops_to_set superset = - let loops = Superset.ISG.raw_loops superset in - let loops = List.filter loops ~f:(fun l -> List.length l >= 2) in - Grammar.addrs_of_loops loops - -let extract_filter_loops superset = - Grammar.addrs_of_filtered_loops superset - -let extract_loops_with_break superset = - let loop_addrs = extract_loop_addrs superset in - Map.fold ~init:Addr.Set.empty loop_addrs ~f:(fun ~key ~data loops -> - let loop = List.fold ~init:Addr.Set.empty data ~f:Set.add in - let has_break = List.exists data - ~f:(fun addr -> - let targets = Superset.ISG.descendants superset addr in - List.exists targets - ~f:(fun x -> not Set.(mem loop x)) - ) in - if has_break then Set.union loops loop else loops - ) - -let extract_constants_to_set superset = - let constants = extract_constants superset in - Map.fold constants ~init:Addr.Set.empty ~f:(fun ~key ~data consts -> - Set.add consts data - ) -let extract_exitless superset = - let returned = Addr.Hash_set.create () in - let entries = Superset.entries_of_isg superset in - Hash_set.iter entries ~f:(fun entry -> - Traverse.with_ancestors_at superset - ?post:None ~pre:(Hash_set.add returned) entry - ); - Superset.Core.fold superset ~f:(fun ~key ~data exitless -> - let v = key in - if not (Hash_set.mem returned v) - then Set.add exitless v else exitless - ) ~init:Addr.Set.empty - -let collect_descendants superset ?insn_isg ?visited ?datas targets = - let visited = Option.value visited ~default:(Addr.Hash_set.create ()) in - let datas = Option.value datas ~default:(Addr.Hash_set.create ()) in - Hash_set.iter targets ~f:(fun v -> - if not Hash_set.(mem visited v) then - Traverse.mark_descendent_bodies_at ~visited ~datas superset v - ) - -let extract_img_entry superset = - let e = Addr.Set.empty in - match Superset.Inspection.get_main_entry superset with - | Some mentry -> - let s = sprintf "entry: %s" - Addr.(to_string mentry) in - print_endline s; - Set.add e mentry - | None -> e - -let extract_trim_callsites superset = - let visited = Addr.Hash_set.create () in - let callsites = get_callsites ~threshold:2 superset in - let protection = get_callsites ~threshold:0 superset in - collect_descendants superset ~visited protection; - Superset.Core.clear_all_bad superset; - let superset = tag_callsites visited ~callsites superset in - clear_each superset visited; - superset -let extract_trim_loops_with_break superset = - (*let loops = extract_loops_with_break superset in*) - superset -let extract_trim_entry superset = - let imgentry = extract_img_entry superset in - Set.iter imgentry ~f:Traverse.(mark_descendent_bodies_at superset); - superset -let extract_trim_noexit superset = - let exitless = extract_exitless superset in - Set.iter exitless ~f:Superset.Core.(mark_bad superset); - superset -let extract_trim_fixpoint_grammar superset = - let gdesc = fixpoint_grammar superset 10 in - let visited = Addr.Hash_set.create () in - let datas = Addr.Hash_set.create () in - let callsites = get_callsites ~threshold:0 superset in - let superset = tag_callsites visited ~callsites superset in - Superset.Core.clear_all_bad superset; - collect_descendants ~visited superset gdesc; - Hash_set.iter datas ~f:(fun d -> - if Hash_set.(mem visited d) || Hash_set.(mem gdesc d) then - Superset.Core.clear_bad superset d - ); - clear_each superset visited; - clear_each superset gdesc; - superset -let extract_trim_fixpoint_ssa superset = - let gdesc = fixpoint_ssa superset 6 in - let visited = Addr.Hash_set.create () in - let datas = Addr.Hash_set.create () in - let callsites = get_callsites ~threshold:0 superset in - (*collect_descendants ~visited ~insn_isg superset callsites;*) - let superset = tag_callsites visited ~callsites superset in - Superset.Core.clear_all_bad superset; - collect_descendants ~visited superset gdesc; - Hash_set.iter datas ~f:(fun d -> - if Hash_set.(mem visited d) || Hash_set.(mem gdesc d) then - Superset.Core.clear_bad superset d - ); - clear_each superset visited; - superset -let extract_trim_fixpoint_freevarssa superset = - let gdesc = fixpoint_freevarssa superset 6 in - let visited = Addr.Hash_set.create () in - let datas = Addr.Hash_set.create () in - let callsites = get_callsites ~threshold:0 superset in - (*collect_descendants ~visited ~insn_isg superset callsites;*) - let superset = tag_callsites visited ~callsites superset in - Superset.Core.clear_all_bad superset; - collect_descendants ~visited superset gdesc; - Hash_set.iter datas ~f:(fun d -> - if Hash_set.(mem visited d) || Hash_set.(mem gdesc d) then - Superset.Core.clear_bad superset d - ); - clear_each superset visited; - superset -let extract_trim_fixpoint_tails superset = - let tdesc = fixpoint_tails superset in - let visited = Addr.Hash_set.create () in - let datas = Addr.Hash_set.create () in - let callsites = get_callsites ~threshold:0 superset in - let superset = tag_callsites visited ~callsites superset in - Superset.Core.clear_all_bad superset; - Hash_set.iter tdesc ~f:(fun v -> - if not Hash_set.(mem visited v) then - Traverse.mark_descendent_bodies_at ~visited ~datas superset v - ); - Hash_set.iter datas ~f:(fun d -> - if Hash_set.(mem visited d) || Hash_set.(mem tdesc d) then - Superset.Core.clear_bad superset d - ); - clear_each superset visited; - superset - -let discard_edges superset = - Superset.ISG.fold_edges superset - (fun child parent superset -> - if not Superset.(is_fall_through superset parent child) then ( - match Superset.Core.lookup superset parent with - | None -> superset - | Some (mem, insn) -> - match insn with - | Some(insn) -> - let insn = Insn.of_basic insn in - if Insn.(is Insn.call insn) then - Superset.ISG.unlink superset child parent - else superset - | None -> superset - ) else superset - ) superset - (*let edges = Superset.get_non_fall_through_edges superset in*) - -type extractor = (Superset.t -> Addr.Set.t) -type ('b) mapextractor = (Superset.t -> 'b Addr.Map.t) -type setfilter = (Superset.t -> Addr.Set.t -> Addr.Set.t) -type ('b) mapfilter = (Superset.t -> 'b Addr.Map.t -> 'b Addr.Map.t) -type setexfilt = extractor * setfilter -type ('a, 'b) mapexfilt = ('b) mapextractor * ('b) mapfilter -let unfiltered _ = Fn.id - -let _exfiltset = [ - ("FixpointGrammar", - ((fun x -> transform (fixpoint_grammar x 0)), unfiltered)); - ("FixpointTails", - ((fun x -> transform (fixpoint_tails x)), unfiltered)); - ("FixpointFreevarSSA", - ((fun x -> transform (fixpoint_freevarssa x 0)), unfiltered)); - ("LinearGrammar", (linear_grammar, unfiltered)); - ("UnfilteredGrammar", (get_branches, unfiltered)); - ("ClassicGrammar", (classic_grammar, unfiltered)); - ("Callsites3", - ((fun x -> transform (get_callsites - ~threshold:6 x)), unfiltered)); - ("Clamped", (find_free_insns, unfiltered)); - ("RestrictedClamped", (restricted_clamp, unfiltered)); - ("ExtendedClamped", (extended_clamp, unfiltered)); - ("UnfilteredSCC", (extract_loops_to_set,unfiltered)); - ("LoopsWithBreak", (extract_loops_with_break,unfiltered)); - ("SCC", (extract_filter_loops,unfiltered)); - ("NoExit", (extract_exitless, unfiltered)); - ("Constant", (extract_constants_to_set,unfiltered)); - ("ImgEntry", (extract_img_entry, unfiltered)); -] -let exfiltset :(setexfilt) String.Map.t - = List.fold ~init:String.Map.empty _exfiltset - ~f:(fun exfiltset (name, f) -> - String.Map.set exfiltset ~key:name ~data:f - ) - -let _exfiltmap = [ - ("SSA", (extract_ssa_to_map, unfiltered)); - ("FalseBranchMap", (extract_fp_branches, unfiltered)); - ("FreeVarSSA", (extract_freevarssa_to_map, unfiltered)); - ("SSA", (extract_ssa_to_map, unfiltered)); -] -let exfiltmap : ((unit, Addr.t) mapexfilt) String.Map.t - = List.fold ~init:String.Map.empty _exfiltmap - ~f:(fun exfiltmap (name, f) -> - String.Map.set exfiltmap ~key:name ~data:f - ) - -let featureflist = - [("Callsites3", extract_trim_callsites); - ("DiscardEdges", discard_edges); - ("LoopsWithBreak", extract_trim_loops_with_break); - ("ImgEntry",extract_trim_entry); - (*("SCC", extract_tag_loops)*) - ("NoExit", extract_trim_noexit); - ("TrimLimitedClamped" ,extract_trim_limited_clamped); - ("TrimFixpointGrammar", extract_trim_fixpoint_grammar); - ("TrimFixpointSSA", extract_trim_fixpoint_ssa); - ("TrimFixpointFreevarSSA", extract_trim_fixpoint_freevarssa); - ("TrimFixpointTails", extract_trim_fixpoint_tails); - ] -let featuremap = List.fold featureflist ~init:String.Map.empty - ~f:(fun featuremap (name, f) -> - Map.set featuremap ~key:name ~data:f - ) - -let apply_featureset featureset superset = - let superset = List.fold ~init:(superset) featureset ~f:(fun (superset) feature -> - match Map.(find featuremap feature) with - | None -> superset - | Some (f) -> - print_endline feature; - let superset = f superset in - let superset = Trim.run superset in - superset - ) in - superset - -let fdists = String.Map.empty -let fdists = String.Map.set fdists ~key:"FixpointGrammar" ~data:5 -let fdists = String.Map.set fdists ~key:"FixpointFreevarSSA" ~data:3 - -let make_featurepmap featureset superset = - List.fold ~f:(fun (feature_pmap) feature -> - let p = Map.find fdists feature in - let p = Option.value p ~default:2 in - match Map.(find exfiltset feature) with - | None -> feature_pmap - | Some (extract,filter) -> - print_endline feature; - let fset = extract superset in - Set.fold fset ~init:feature_pmap - ~f:(fun feature_pmap x -> - Map.update feature_pmap x ~f:(function - | Some l -> (p, x, feature) :: l - | None -> [(p, x, feature)] - ) - ) - ) ~init:Addr.Map.empty featureset - -let total_of_features l = - List.fold ~init:0 ~f:(fun x (y,_,_) -> x + y) l - -let apply_featurepmap featureset ?(threshold=50) superset = - let feature_pmap = make_featurepmap featureset superset in - let feature_pmap = fixpoint_map superset feature_pmap in - let feature_pmap = - Map.map feature_pmap ~f:(total_of_features) in - let feature_pmap = - Map.filter feature_pmap ~f:(fun total -> total > threshold) in - let visited = Addr.Hash_set.create () in - let callsites = get_callsites ~threshold:0 superset in - let superset = tag_callsites visited ~callsites superset in - Superset.Core.clear_all_bad superset; - List.iter Map.(keys feature_pmap) ~f:(fun addr -> - Traverse.mark_descendent_bodies_at superset ~visited addr - ); - clear_each superset visited; - superset -(*Trim.trim superset*) diff --git a/lib/bap_superset_disasm/features.mli b/lib/bap_superset_disasm/features.mli deleted file mode 100644 index e69de29bb..000000000 diff --git a/lib/bap_superset_disasm/heuristics.ml b/lib/bap_superset_disasm/heuristics.ml index f33716f6b..a1bf2324a 100644 --- a/lib/bap_superset_disasm/heuristics.ml +++ b/lib/bap_superset_disasm/heuristics.ml @@ -1,58 +1,11 @@ open Core open Bap.Std -module Dis = Disasm_expert - -module type Heurism = sig - type t - val name : string - val impl : Superset.t -> t -end - -module HeurismSet(H : Heurism) = struct - open Bap_knowledge - open Bap_core_theory - - module Cache = struct - open H - let package = "superset-heuristics" - let addrs_t = - Knowledge.Domain.optional - ~inspect:Addr.Hash_set.sexp_of_t - ~equal:Addr.Hash_set.equal "addr.set" - - let addrs_persistent = - Knowledge.Persistent.of_binable - (module struct type t = Addr.Hash_set.t option [@@deriving bin_io] end) - - let attr ty persistent desc = - let open Theory.Program in - Knowledge.Class.property ~package cls name ty - ~persistent ~public:true ~desc - let locations = - attr addrs_t addrs_persistent - ("addresses of all sites of " ^ name ^ " heuristic") - end -end - let defaults = [ "ImgEntry"; - (*"NoExit";*) - (*"LoopsWithBreak";*) - "BranchViolations"; - (*"LayerViolations";*) - "TrimLimitedClamped"; + "InterpretationDepthOne"; "Callsites3"; - (*"TrimFixpointGrammar"; - "TrimFixpointTails";*) - (*"Clamped"; - "SCC"; - "LoopGrammar"; - "CallsiteLineage"; - "SSA";*) - (*"FreeVarSSA";*) - (*"Grammar";*) - (*"Constant";*) + "FixpointGrammar"; ] let defaults = List.rev defaults @@ -84,14 +37,13 @@ let tag_callsites visited ?callsites superset = ~default:(get_callsites superset) in Hash_set.iter callsites ~f:(fun callsite -> Traverse.with_descendents_at ~visited - (* TODO ~pre should mark insn bodies as data *) ?post:None ?pre:None superset callsite; ); superset let find_free_insns superset = let mem = Superset.Core.mem superset in - let to_clamp = + let conflict_free_addrs = Superset.Core.fold superset ~init:([]) ~f:(fun ~key ~data to_clamp -> let (addr,(memory,_)) = key, data in @@ -105,118 +57,11 @@ let find_free_insns superset = to_clamp ) ) in - let to_clamp = Addr.Set.of_list to_clamp in - Set.diff to_clamp @@ + let conflict_free_addrs = Addr.Set.of_list conflict_free_addrs in + Set.diff conflict_free_addrs @@ Superset.Occlusion.find_all_conflicts superset - -let restricted_clamp superset = - let entries = Superset.entries_of_isg superset in - let conflicts = Superset.Occlusion.find_all_conflicts superset in - let to_clamp = ref Addr.Set.empty in - Hash_set.iter entries ~f:(fun entry -> - let b = ref false in - let pre v = - if Addr.(v = entry) then - b := false - else if not (!b) then - if Set.mem conflicts v then - b := true - else to_clamp := Set.add (!to_clamp) v - in Traverse.with_ancestors_at ~post:(fun _ -> ()) ~pre superset entry; - ); - !to_clamp - -let extended_clamp superset = - let to_clamp = find_free_insns superset in - Set.fold to_clamp ~init:Addr.Set.empty ~f:(fun to_clamp clamp -> - let _, to_clamp = - Superset.ISG.dfs_fold superset - ~pre:(fun (struck,to_clamp) addr -> - if struck then (struck,to_clamp) else - let conflicts = - Superset.Occlusion.conflicts_within_insn_at - superset addr in - let no_conflicts = Set.length conflicts = 0 in - (*let conflicts = Superset.Occlusion.parent_conflict_at - insn_risg insn_map addr in - let no_conflicts = Set.length conflicts = 0 - && no_conflicts in*) - if no_conflicts then (struck, Set.(add to_clamp addr)) - else (true, to_clamp) - ) ~post:(fun x _ -> x) (false, to_clamp) clamp - in to_clamp - ) - -let extract_loop_addrs superset = - let loop_addrs = Superset.ISG.raw_loops superset in - List.fold_left ~init:Addr.Map.empty loop_addrs - ~f:(fun addrs loop -> - if List.length loop >= 2 then - Option.value ~default:addrs - Option.(map List.(hd loop) ~f:(fun addr -> - Map.set addrs ~key:addr ~data:loop)) - else addrs - ) - -let extract_filtered_loop_addrs superset = - let loop_addrs = extract_loop_addrs superset in - Map.filteri loop_addrs ~f:(fun ~key ~data -> - List.length data > 20) - -let extract_constants superset = - let imgmem = - Memmap.to_sequence @@ Superset.Inspection.get_memmap superset in - let addrs = - Seq.bind imgmem - ~f:(fun (segment,_) -> - let words_of_mem mem = - let rec yield_next addr = - let width = Addr.bitwidth addr in - let s = Size.of_int_exn width in - let open Seq.Generator in - match Memory.view ~word_size:s ~from:addr mem with - | Ok next -> - yield next >>= fun () -> yield_next (Addr.succ addr) - | _ -> return () in - Sequence.Generator.run (yield_next Memory.(min_addr mem)) - in words_of_mem segment - ) in - Seq.fold ~init:Addr.Map.empty addrs - ~f:(fun constants m -> - let constant = Memory.(m ^ (min_addr m)) in - match constant with - | Ok constant -> - if Superset.Inspection.contains_addr superset constant - && Superset.Core.(mem superset constant) then - Map.set constants ~key:Memory.(min_addr m) ~data:constant - else constants - | _ -> constants - ) -let extract_cross_section_jmps superset = - let segments = Superset.Inspection.get_memmap superset in - let cross_section_edges = Superset.ISG.fold_edges superset - (fun src dst csedges -> - let collect_minaddrs addr = - let seg = Memmap.lookup segments addr in - Seq.fold seg ~init:Addr.Set.empty ~f:(fun s1 (mem,_) -> - Set.add s1 @@ Memory.min_addr mem - ) in - let s1 = collect_minaddrs src in - let s2 = collect_minaddrs dst in - if not (Set.(length @@ inter s1 s2) >= 1) then - let ft1 = Superset.is_fall_through superset src dst in - let ft2 = Superset.is_fall_through superset dst src in - if (ft1 || ft2) then ( - (*Superset_risg.G.remove_edge insn_risg src dst;*) - Map.set csedges ~key:src ~data:dst - ) else csedges - else csedges - ) Addr.Map.empty in - cross_section_edges - - -let extract_trim_clamped superset = +let extract_trim_interpretation_depth superset = let to_clamp = find_free_insns superset in let visited = Addr.Hash_set.create () in let datas = Addr.Hash_set.create () in @@ -231,13 +76,14 @@ let extract_trim_clamped superset = Set.iter to_clamp ~f:(Superset.Core.clear_bad superset); superset -let extract_trim_limited_clamped superset = +let extract_trim_protected_interpretation_depth superset = let protection = Addr.Hash_set.create () in let callsites = get_callsites ~threshold:0 superset in let superset = tag_callsites protection ~callsites superset in Superset.Core.clear_all_bad superset; - let superset = extract_trim_clamped superset in - Superset.Core.clear_each superset protection; superset + let superset = extract_trim_interpretation_depth superset in + Superset.Core.clear_each superset protection; + superset let fixpoint_descendants superset extractf depth = let rec fix_descendants cur_features d = @@ -284,74 +130,6 @@ let fixpoint_grammar superset depth = Superset.get_branches superset in fixpoint_descendants superset extractf depth -(* TODO all features is not all features *) -let allfeatures = - "RestrictedClamped" :: - "ExtendedClamped" :: - "ClassicGrammar" :: - "LinearGrammar" :: - "UnfilteredGrammar" :: - "FalseBranchMap" :: - "FilteredFalseBranchMap" :: - "UnfilteredSCC" :: - "FreeVarSSA" :: - "FixpointGrammar" :: - "FixpointSSA" :: - "FixpointFreevarSSA" :: - "FixpointTails" :: - defaults - -let get_branches superset = - let branches = Superset.get_branches superset in - transform branches - -let linear_grammar superset = - let entries = Superset.entries_of_isg superset in - transform Grammar.(linear_branch_sweep superset entries) - -let classic_grammar superset = - transform Grammar.(identify_branches superset) - -let extract_loops_to_set superset = - let loops = Superset.ISG.raw_loops superset in - let loops = List.filter loops ~f:(fun l -> List.length l >= 2) in - Grammar.addrs_of_loops loops - -let extract_filter_loops superset = - Grammar.addrs_of_filtered_loops superset - -let extract_loops_with_break superset = - let loop_addrs = extract_loop_addrs superset in - Map.fold ~init:Addr.Set.empty loop_addrs ~f:(fun ~key ~data loops -> - let loop = List.fold ~init:Addr.Set.empty data ~f:Set.add in - let has_break = List.exists data - ~f:(fun addr -> - let targets = Superset.ISG.descendants superset addr in - List.exists targets - ~f:(fun x -> not Set.(mem loop x)) - ) in - if has_break then Set.union loops loop else loops - ) - -let extract_exitless superset = - let returned = Addr.Hash_set.create () in - let entries = Superset.entries_of_isg superset in - Hash_set.iter entries ~f:(fun entry -> - Traverse.with_ancestors_at superset - ?post:None ~pre:(Hash_set.add returned) entry - ); - Superset.Core.fold superset ~f:(fun ~key ~data exitless -> - let v = key in - if not (Hash_set.mem returned v) - then Set.add exitless v else exitless - ) ~init:Addr.Set.empty - -let extract_constants_to_set superset = - let constants = extract_constants superset in - Map.fold constants ~init:Addr.Set.empty ~f:(fun ~key ~data consts -> - Set.add consts data - ) - let collect_descendants superset ?visited ?datas targets = let visited = Option.value visited ~default:(Addr.Hash_set.create ()) in let datas = Option.value datas ~default:(Addr.Hash_set.create ()) in @@ -381,11 +159,6 @@ let extract_trim_entry superset = Set.iter imgentry ~f:Traverse.(mark_descendent_bodies_at superset); superset -let extract_trim_noexit superset = - let exitless = extract_exitless superset in - Set.iter exitless ~f:Superset.Core.(mark_bad superset); - superset - type extractor = (Superset.t -> Addr.Set.t) type ('b) mapextractor = (Superset.t -> 'b Addr.Map.t) type setfilter = (Superset.t -> Addr.Set.t -> Addr.Set.t) @@ -393,26 +166,13 @@ type ('b) mapfilter = (Superset.t -> 'b Addr.Map.t -> 'b Addr.Map.t) type setexfilt = extractor * setfilter type ('a, 'b) mapexfilt = ('b) mapextractor * ('b) mapfilter let unfiltered _ = Fn.id - + let _exfiltset = [ ("FixpointGrammar", ((fun x -> transform (fixpoint_grammar x 0)), unfiltered)); - ("Liveness", (Liveness.compute_liveness,unfiltered)); - ("LinearGrammar", (linear_grammar, unfiltered)); - ("UnfilteredGrammar", (get_branches, unfiltered)); - ("ClassicGrammar", (classic_grammar, unfiltered)); ("Callsites3", ((fun x -> transform (get_callsites ~threshold:6 x)), unfiltered)); - ("Clamped", - ((fun s -> find_free_insns s), unfiltered)); - ("RestrictedClamped", (restricted_clamp, unfiltered)); - ("ExtendedClamped", (extended_clamp, unfiltered)); - ("UnfilteredSCC", (extract_loops_to_set,unfiltered)); - ("LoopsWithBreak", (extract_loops_with_break,unfiltered)); - ("SCC", (extract_filter_loops,unfiltered)); - ("NoExit", (extract_exitless, unfiltered)); - ("Constant", (extract_constants_to_set,unfiltered)); ("ImgEntry", (extract_img_entry, unfiltered)); ] let exfiltset :(setexfilt) String.Map.t @@ -424,9 +184,8 @@ let exfiltset :(setexfilt) String.Map.t let featureflist = [("Callsites3", extract_trim_callsites); ("ImgEntry",extract_trim_entry); - (*("SCC", extract_tag_loops)*) - ("NoExit", extract_trim_noexit); - ("TrimLimitedClamped" ,extract_trim_limited_clamped); + ("InterpretationDepthOne" , + extract_trim_protected_interpretation_depth); ] let featuremap = List.fold featureflist ~init:String.Map.empty ~f:(fun featuremap (name, f) -> @@ -444,7 +203,6 @@ let with_featureset ~f ~init featureset superset = let fdists = String.Map.empty let fdists = String.Map.set fdists ~key:"FixpointGrammar" ~data:1 -let fdists = String.Map.set fdists ~key:"Liveness" ~data:1 let make_featurepmap featureset superset = List.fold ~f:(fun (feature_pmap) feature -> diff --git a/lib/bap_superset_disasm/liveness.ml b/lib/bap_superset_disasm/liveness.ml deleted file mode 100644 index 4c6a1d803..000000000 --- a/lib/bap_superset_disasm/liveness.ml +++ /dev/null @@ -1,56 +0,0 @@ -open Bap.Std -open Core -open Graphlib.Std - -let stmt_def_freevars = - object(self) - inherit [Var.Set.t] Stmt.visitor - method! enter_move def use accu = - Set.add accu def - end - -type rev_ssa = { - defs : Var.Set.t; - uses : Var.Set.t; - } - -let transitions superset = - Superset.ISG.fold_vertex superset (fun addr fs -> - match Superset.Core.lift_at superset addr with - | Some bil -> - Addr.Map.add_exn fs ~key:addr ~data:{ - defs = stmt_def_freevars#run bil Var.Set.empty; - uses = Bil.free_vars bil; - } - | None -> fs - ) Addr.Map.empty - -let (++) = Set.union and (--) = Set.diff - -let compute_liveness superset = - let start = Addr.of_int ~width:1 0 in - let _exit = Addr.of_int ~width:1 1 in - let entries = Superset.entries_of_isg superset in - let superset = Hash_set.fold ~init:superset entries - ~f:(fun s e -> Superset.ISG.link s _exit e) in - let frond = Superset.frond_of_isg superset in - let superset = Hash_set.fold frond ~init:superset - ~f:(fun s e -> Superset.ISG.link s e start) in - let init = Solution.create Addr.Map.empty Var.Set.empty in - let tran = transitions superset in - let soln = Superset.ISG.fixpoint superset ~init ~start ~rev:false - ~merge:Var.Set.union ~equal:Var.Set.equal ?step:None ?steps:None - ~f:(fun n vars -> - if Addr.equal n _exit || Addr.equal n start then vars - else - match Map.find tran n with - | Some {defs; uses} -> - vars -- defs ++ uses - | None -> vars - ) in - let superset = Superset.Core.remove superset _exit in - let _ = Superset.Core.remove superset start in - let liveness_pairs = Solution.enum soln in - Seq.fold liveness_pairs ~init:Addr.Set.empty ~f:(fun s (addr,_) -> - Addr.Set.add s addr) - diff --git a/lib/bap_superset_disasm/liveness.mli b/lib/bap_superset_disasm/liveness.mli deleted file mode 100644 index c648803b7..000000000 --- a/lib/bap_superset_disasm/liveness.mli +++ /dev/null @@ -1,3 +0,0 @@ -open Bap.Std - -val compute_liveness : Superset_impl.t -> Addr.Set.t From c75de31522a1803f56b66441b68ce58e24c57f02 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 16 May 2025 14:34:11 -0500 Subject: [PATCH 28/31] Remove Fixpoint, Decision_trees, Grammar. Add documentation to heuristics' interpretation depth --- .../bap_superset_disasm.ml | 3 - lib/bap_superset_disasm/decision_trees.ml | 292 ------------------ lib/bap_superset_disasm/decision_trees.mli | 3 - lib/bap_superset_disasm/fixpoint.ml | 27 -- lib/bap_superset_disasm/fixpoint.mli | 6 - lib/bap_superset_disasm/grammar.ml | 202 ------------ lib/bap_superset_disasm/grammar.mli | 8 - lib/bap_superset_disasm/heuristics.ml | 20 +- 8 files changed, 14 insertions(+), 547 deletions(-) delete mode 100644 lib/bap_superset_disasm/decision_trees.ml delete mode 100644 lib/bap_superset_disasm/decision_trees.mli delete mode 100644 lib/bap_superset_disasm/fixpoint.ml delete mode 100644 lib/bap_superset_disasm/fixpoint.mli delete mode 100644 lib/bap_superset_disasm/grammar.ml delete mode 100644 lib/bap_superset_disasm/grammar.mli diff --git a/lib/bap_superset_disasm/bap_superset_disasm.ml b/lib/bap_superset_disasm/bap_superset_disasm.ml index 9bb84ceb6..ca399c832 100644 --- a/lib/bap_superset_disasm/bap_superset_disasm.ml +++ b/lib/bap_superset_disasm/bap_superset_disasm.ml @@ -1,8 +1,5 @@ module Superset_impl = Superset_impl module Superset = Superset -module Decision_trees = Decision_trees -module Fixpoint = Fixpoint -module Grammar = Grammar module Heuristics = Heuristics module Invariants = Invariants module Metrics = Metrics diff --git a/lib/bap_superset_disasm/decision_trees.ml b/lib/bap_superset_disasm/decision_trees.ml deleted file mode 100644 index 8b7b4d1ec..000000000 --- a/lib/bap_superset_disasm/decision_trees.ml +++ /dev/null @@ -1,292 +0,0 @@ -open Core -open Bap.Std -open Graphlib.Std - -module G = Graphlib.Make(Addr)(Unit) - - -(** The decision tree represents a set of potentially inter-dependent - decision trees and potential ramifications of selection at each - node. The objective is to present to the user a clean interface - by which to construct mutually compatible decisions, since it it - possible for a blithely written analysis to piece together many - decisions that are not fit with the whole. *) - -type decision_tree = { - tree : G.t; - starts : Addr.Hash_set.t; - } -type decision_forest = decision_tree list -type 'a possibility -type 'a choice -type 'a consequence -type tail - -let count tree = - List.length tree -let with_trees (tree : decision_forest) = - List.fold tree - -(** For any given entry, calculate the conflicts, and filter the set - down to lists of entries that conflict with one another. *) -let conflicts_of_entries superset entries = - let visited_entries = Addr.Hash_set.create () in - Hash_set.fold entries ~init:[] ~f: - (fun conflicted_entries entry -> - if not (Hash_set.mem visited_entries entry) then ( - Hash_set.add visited_entries entry; - let in_entry_conflicts = - Superset.Occlusion.conflicts_within_insn_at superset entry in - let conflicts = Addr.Hash_set.create () in - Hash_set.add conflicts entry; - Set.iter in_entry_conflicts - ~f:(fun conflict -> - (* A conflict that an entry may have may or may not *) - (* itself be an entry. *) - if Hash_set.mem entries conflict then ( - Hash_set.add visited_entries conflict; - Hash_set.add conflicts conflict; - ) - ); - if (Hash_set.length conflicts) > 1 then ( - conflicts :: conflicted_entries - ) else conflicted_entries - ) else conflicted_entries - ) - -(** Calculate the set potential points where occlusive instructions - could rejoin to a common target, such as cease when falling - through to the same instruction. Calculate the tail, or the join - target and the conflicts that led into that. *) -let tails_of_conflicts superset conflicts = - let possible_tails = Superset.mergers superset in - (* This tail is the particular instruction - that is the fall through target of several potential - competitors. We use this instruction against the - leaders map because those will be the ones that fall - through to the tail; the tail can then be associated with - those that lead into it. *) - let tails, _ = Set.fold ~init:(Addr.Map.empty, Addr.Set.empty) - ~f:(fun (tails, added_choices) possible_tail -> - (* For each edge from tail, lookup the respective vertex; if it *) - (* is in the conflicts set, then it gets added to a sheath *) - (* of choices. *) - let f sheath poss_conflict = - let not_added = not (Set.mem added_choices poss_conflict) in - let is_conflict = Set.mem conflicts poss_conflict in - let is_connected = Superset.ISG.check_connected - superset possible_tail poss_conflict in - if not_added && is_conflict && is_connected then - poss_conflict :: sheath - else sheath in - let sheath = List.fold_left - (Superset.ISG.ancestors superset possible_tail) ~init:[] ~f - in - match sheath with - | [] | _ :: []-> tails, added_choices - | _ -> - let added_choices = - Set.inter added_choices (Addr.Set.of_list sheath) in - (Addr.Map.set tails ~key:possible_tail ~data:sheath, added_choices) - ) possible_tails in - tails - -let add_edge dtr v1 v2 = - let g = dtr.tree in - let e = G.Edge.create v1 v2 () in - let g = G.Edge.insert e g in - { dtr with tree = g } - -let mem_edge dtr v1 v2 = - let g = dtr.tree in - let e = G.Edge.create v1 v2 () in - G.Edge.mem e g - -let new_dtree () = - let tree = Graphlib.create (module G) () in - let starts = Addr.Hash_set.create () in - { tree; starts; } - -module DecisionTree = struct - let count dtree = - (G.number_of_nodes dtree.tree) + (Hash_set.length dtree.starts) - let mem addr dtree = G.Node.mem addr dtree.tree - || Hash_set.mem dtree.starts addr -end - -(** Starting from each entry in the superset, identify the tails and - build a decision tree that allows to jump from conflict to - conflict and review the options. *) -let decision_tree_of_entries superset conflicted_entries entries tails = - let visited = Addr.Hash_set.create () in - let add_choices decision_tree current_vert = - let unvisited = - not (Hash_set.mem visited current_vert) in - if unvisited then - let possible_tail = current_vert in - match Addr.Map.find tails possible_tail with - | Some(sheath) -> - List.fold sheath ~init:decision_tree ~f:(fun decision_tree competitor -> - add_edge decision_tree possible_tail competitor - ); - | _ -> decision_tree - else decision_tree; - in - let link_start decision_tree entry = - Hash_set.add decision_tree.starts entry; - { decision_tree with starts = decision_tree.starts } - in - let f decision_tree entry = - let saved_vert = ref entry in - let link_choices decision_tree current_vert = - let decision_tree = add_choices decision_tree entry in - let contained = DecisionTree.mem current_vert decision_tree in - let is_new = Hash_set.mem visited current_vert in - let decision_tree = - if contained && is_new then ( - let decision_tree = - if not @@ mem_edge decision_tree !saved_vert - current_vert then ( - add_edge decision_tree !saved_vert - current_vert - ) else decision_tree in - saved_vert := current_vert; - decision_tree - ) else decision_tree in - decision_tree - in - Superset.ISG.dfs_fold superset ~visited decision_tree - ~post:(fun g v -> g) ~pre:link_choices entry - in - let conflicted_trees = - List.filter_map conflicted_entries ~f:(fun conflicted -> - if Hash_set.length conflicted > 0 then - let decision_tree = new_dtree () in - let f decision_tree entry = - if not (Hash_set.mem visited entry) then ( - let decision_tree = link_start decision_tree entry in - f decision_tree entry) else decision_tree - in - let decision_tree = - Hash_set.fold ~init:decision_tree conflicted ~f in - Some(decision_tree) - else None - ) in - Hash_set.fold entries ~init:conflicted_trees - ~f:(fun all_trees entry -> - if not (Hash_set.mem visited entry) then - let decision_tree = new_dtree () in - let decision_tree = f decision_tree entry in - if DecisionTree.count decision_tree > 0 then - decision_tree :: all_trees - else all_trees - else (all_trees) - ) - -(** Accepts a superset, and calculates the decision trees over groups - of instructions. The returned trees index from tails to the - options available. *) -let decision_trees_of_superset superset = - (* Here, for each vertex, look up the insn from the map and *) - (* identify conflicts. *) - let conflicts = Superset.Occlusion.find_all_conflicts superset in - (* entries variable: - We want to know the superset of all nodes that could be the - terminating point that would otherwise be the return instruction - of a function. *) - let entries = Superset.entries_of_isg superset in - (* - we need to keep track of the subset of potential choices - that fall in line with the normal control flow graph, and - leave the discovery of overlapping redirection to a - second pass, in order that when we do a map over all - instructions to check for conflicts, we know which are tails - in order to properly construct the sheath type. - *) - let tails = tails_of_conflicts superset conflicts in - (* It may be that some entries are accidental indirections that *) - (* happen to preside at the intended entry. These must map to to an *) - (* entirely distinct interpretation. *) - let conflicted_entries = conflicts_of_entries superset entries in - (* For each of the potentially conflicting entries, construct a *) - (* decision tree. *) - let decision_trees = decision_tree_of_entries - superset conflicted_entries entries tails in - decision_trees - -let insn_is_option superset addr = - let open Superset in - let len = Superset.Inspection.len_at superset addr in - let bound = Addr.(addr ++ len) in - let previous = Superset.ISG.descendants superset addr in - List.fold ~init:false previous ~f:(fun current descedant -> - if not current then - let further = Superset.ISG.ancestors superset descedant in - List.fold ~init:current further ~f:(fun current opt -> - if not current then - if Addr.(addr <= opt) && Addr.(opt < bound) then - true - else false - else current - ) - else current - ) - -(** For a given superset that contains groups of instruction lineages - as potential choices, calculate the result of picking a given - choice as a delta. *) -let calculate_deltas ?entries ?is_option superset = - let is_option = - Option.value is_option - ~default:(insn_is_option superset) in - let entries = Option.value entries - ~default:(Superset.entries_of_isg superset) in - let add_data_of_insn dataset at = - Superset.Occlusion.with_data_of_insn - superset at ~f:(Hash_set.add dataset) - in - let deltas = ref Addr.Map.empty in - let delta = ref None in - let make_deltas addr = - let insns, datas = - match !delta with - | Some (insns, datas) -> (insns, datas) - | None -> - let insns = Addr.Hash_set.create () in - let datas = Addr.Hash_set.create () in - delta := Some(insns, datas); - insns, datas in - if is_option addr then ( - deltas := Addr.Map.set !deltas addr (insns, datas); - delta := None - ) else ( - add_data_of_insn datas addr; - Hash_set.add insns addr; - ) - (* else if is in entries then store the delta in the deltas map *) - in - let visited = Addr.Hash_set.create () in - Hash_set.iter entries - ~f:(Traverse.with_ancestors_at - ~visited ~post:make_deltas ?pre:None superset); - !deltas - -module Speculate = struct - let weigh_possibilities _ _ = () - let make_choices x _ _ = x -end - -(** A delta from decision trees is constructed and passed to the - visitor functions during a visit. *) -let visit_with_deltas ?pre ?post ~is_option superset entries = - let pre = Option.value pre ~default:(fun _ _ -> ()) in - let post = Option.value post ~default:(fun _ _ -> ()) in - let deltas = ref (calculate_deltas - superset ~entries ~is_option) in - let pre addr = - pre !deltas addr in - let post addr = - post !deltas addr; - deltas := Map.remove !deltas addr - in - Traverse.visit ~pre ~post superset entries diff --git a/lib/bap_superset_disasm/decision_trees.mli b/lib/bap_superset_disasm/decision_trees.mli deleted file mode 100644 index 1e233dc52..000000000 --- a/lib/bap_superset_disasm/decision_trees.mli +++ /dev/null @@ -1,3 +0,0 @@ -open Bap.Std - -val tails_of_conflicts : Superset_impl.t -> Addr.Set.t -> addr list Addr.Map.t diff --git a/lib/bap_superset_disasm/fixpoint.ml b/lib/bap_superset_disasm/fixpoint.ml deleted file mode 100644 index 2d33a46d7..000000000 --- a/lib/bap_superset_disasm/fixpoint.ml +++ /dev/null @@ -1,27 +0,0 @@ -open Core -open Bap.Std - -let iterate rounds f superset = - let (superset) = f superset in - let rec do_analysis round superset = - if round = rounds then superset else - let (superset) = f superset in - do_analysis (round+1) superset in - do_analysis 1 superset - -let protect superset f = - let visited = Addr.Hash_set.create () in - let callsites = Heuristics.get_callsites ~threshold:0 superset in - let superset = Heuristics.tag_callsites visited ~callsites superset in - let superset = f superset in - Superset.Core.clear_each superset visited; - Trim.run superset - -let converge superset heuristics feature_pmap = - let superset = Trim.run superset in - let cache = Addr.Hash_set.create () in - List.iter Map.(keys feature_pmap) ~f:(fun addr -> - Traverse.mark_descendent_bodies_at superset ~visited:cache addr - ); - superset - diff --git a/lib/bap_superset_disasm/fixpoint.mli b/lib/bap_superset_disasm/fixpoint.mli deleted file mode 100644 index 6113d21c1..000000000 --- a/lib/bap_superset_disasm/fixpoint.mli +++ /dev/null @@ -1,6 +0,0 @@ -open Bap.Std - - -val iterate : int -> ('a -> 'a) -> 'a -> 'a -val protect : Superset_impl.t -> (Superset_impl.t -> Superset_impl.t) -> Superset_impl.t -val converge : Superset_impl.t -> 'a -> 'b Addr.Map.t -> Superset_impl.t diff --git a/lib/bap_superset_disasm/grammar.ml b/lib/bap_superset_disasm/grammar.ml deleted file mode 100644 index 2d53f2c4c..000000000 --- a/lib/bap_superset_disasm/grammar.ml +++ /dev/null @@ -1,202 +0,0 @@ -open Core -open Bap.Std - -(** In superset disassembly, branches can occur from within the - bodies of instructions originally intended by the - compiler. Therefore, identify branches tries to visit starting - from entries, and see what set of branches include some child - that is a descendent from which the traversal originated. *) -let identify_branches superset = - let deferred = ref Addr.Map.empty in - let entries = Superset.entries_of_isg superset in - (* need to create a sequence of non-fall through edges *) - let insns = Addr.Hash_set.create () in - let branches = Addr.Hash_set.create () in - let tag_branches addr = - if Superset.is_branch superset addr then - let inbound = Superset.ISG.descendants superset addr in - List.iter inbound ~f:(fun child -> - (* check for edges between instructions that are not - fall through, but for which *) - if Hash_set.mem insns child then - let ft = Superset.fall_through_of superset addr in - if not Addr.(ft = child) && - not Addr.(addr = child) then - deferred := Map.set !deferred ~key:ft ~data:(child, addr) - ); - in - let confirm_branches addr = - match Map.find !deferred addr with - | Some (child, branch) -> - if Hash_set.mem insns child then - Hash_set.add branches branch - | None -> () - in - let pre addr = - Hash_set.add insns addr; - tag_branches addr - in - let post addr = - Hash_set.remove insns addr in - Traverse.visit - ~pre ~post superset entries; - let pre addr = - Hash_set.add insns addr; - confirm_branches addr - in - Traverse.visit - ~pre ~post superset entries; - branches - -let increment_map_at m ?(x=1) addr = - m := Map.update !m addr - ~f:(fun hits -> Option.value_map hits ~default:1 - ~f:(fun hits -> hits +x)); - Option.value ~default:x Map.(find !m addr) - -(** This searches through the set of blocks starting from entries for - branches that got hit at least twice. The idea is to respect the - diamond structure of control flow, which is that both sides - around a conditional (diamond) must be constructed in the cfg - statically in order for the final target to be sound - w.r.t. assembler rules. Using this technique, can filter false - positive branches. *) -let linear_branch_sweep superset entries = - let jmp_hit_cnt = ref Addr.Map.empty in - let update_hit_count = increment_map_at jmp_hit_cnt in - let pre jmps targets addr = - if Set.mem targets addr then ( - ignore (update_hit_count addr); - ); - match Map.find jmps addr with - | Some(branch) -> - ignore (update_hit_count branch); - | None -> (); - in - let post _ _ _ = () in - let _ = Traverse.visit_by_block superset ~pre ~post entries in - let final_jmps = Addr.Hash_set.create () in - Map.iteri !jmp_hit_cnt ~f:(fun ~key ~data -> - let jmp_addr = key in - let cnt = data in - if cnt = 2 then - Hash_set.add final_jmps jmp_addr; - ); - final_jmps - -(** The objective here is to tag grammar structures while traversing - topologically in such a manner that we can converge the - probability of recognizing an intended sequence by the - compiler. After we've hit some recognition threshold, we begin - traversing forward from some activation point whereby we trim - occlusive instructions. To recognize grammars, we have several - means: one, loops are strongly connected components, and if - sequences must branch at some point only to reify at a common - point, expressing a path by which they can finally rejoin. *) -let tag_by_traversal ?(threshold=8) superset = - let visited = Addr.Hash_set.create () in - (*let callsites = Superset.get_callsites ~threshold:6 superset in - let superset = tag_callsites visited ~callsites superset in - let superset = Invariants.tag_layer_violations superset in - let superset = Invariants.tag_branch_violations superset in*) - let entries = Superset.entries_of_isg superset in - let branches = Superset.get_branches superset in - (*let branches = identify_branches superset in*) - (*let branches = linear_branch_sweep superset entries in*) - let cur_total = ref 0 in - let positives = ref [] in - let entry = ref None in - let tps = Addr.Hash_set.create () in - (* In the case that our current starting point, entry, is none, set *) - (* it to being the address of the lambda parameter, addr. Then, set *) - (* the current total number of recognized grammar items to zero, *) - (* as well as the positives since we're starting over *) - let pre addr = - if Option.is_none !entry then ( - entry := Some(addr); - cur_total := 0; - positives := []; - ); - if Hash_set.mem branches addr then ( - cur_total := !cur_total + 1; - positives := addr :: !positives; - if !cur_total >= threshold then ( - let open Option in - ignore (List.nth !positives threshold >>| - (fun convergent_point -> - Hash_set.add tps convergent_point)); - ) - ) in - let post addr = - entry := Option.value_map !entry ~default:!entry - ~f:(fun e -> if Addr.(e = addr) then None else Some(e)); - if Hash_set.mem branches addr then ( - cur_total := !cur_total - 1; - match !positives with - | _ :: remaining -> positives := remaining - | [] -> (); - ) in - Traverse.visit ~visited - ~pre ~post superset entries; - Hash_set.iter tps ~f:(fun tp -> - if not (Hash_set.mem visited tp) then ( - Traverse.with_descendents_at superset tp ~pre:(fun tp -> - let mark_bad addr = - if Superset.ISG.mem_vertex superset addr then ( - Superset.Core.mark_bad superset addr - ) in - Superset.Occlusion.with_data_of_insn superset tp ~f:mark_bad; - Hash_set.add visited tp; - ) ; - ) - ); - Hash_set.iter visited - ~f:(fun tp -> Superset.Core.clear_bad superset tp); - superset - - -let parents_of_insns superset component = - Set.fold component ~init:Addr.Set.empty ~f:(fun potential_parents addr -> - List.fold (Superset.ISG.ancestors superset addr) - ~init:potential_parents - ~f:(fun potential_parents ancestor -> - if not Set.(mem component ancestor) then - Set.add potential_parents ancestor - else potential_parents - ) - ) - -let addrs_of_loops loops = - List.fold_left loops ~init:Addr.Set.empty - ~f:(fun keep loop -> - Addr.Set.(union keep (of_list loop)) - ) - -let filter_loops ?(min_size=20) loops = - let loops = - List.filter loops ~f:(fun l -> List.length l > min_size) in - addrs_of_loops loops - -let addrs_of_filtered_loops ?(min_size=20) superset = - filter_loops ~min_size @@ Superset.ISG.raw_loops superset - -(** In the body of a loop, instructions fall through eventually to - themselves, which amounts to effectively a trigger of an - invariant. But the level at which invariants operate is too fine - grained to see the consequence propagated from conflicts that are - potentially in loops that are many instructions long. This - function cleanses the bodies of instructions that occur in loops - of a minimum size. *) -let tag_loop_contradictions ?(min_size=20) superset = - let keep = addrs_of_filtered_loops ~min_size superset in - (* Here we have to be careful; we only want to find instructions - that occur within a loop that produce a self-contradiction *) - let parents = parents_of_insns superset keep in - let to_remove = - Superset.Occlusion.conflicts_within_insns superset keep in - let to_remove = Set.inter to_remove parents in - let to_remove = Set.diff to_remove keep in - Set.iter to_remove ~f:(Superset.Core.mark_bad superset); - superset - -let default_tags = [tag_loop_contradictions] diff --git a/lib/bap_superset_disasm/grammar.mli b/lib/bap_superset_disasm/grammar.mli deleted file mode 100644 index 3a066fea2..000000000 --- a/lib/bap_superset_disasm/grammar.mli +++ /dev/null @@ -1,8 +0,0 @@ -open Bap.Std - -val tag_loop_contradictions : ?min_size:int -> Superset_impl.t -> Superset_impl.t -val tag_by_traversal : ?threshold:int -> Superset_impl.t -> Superset_impl.t -val linear_branch_sweep : Superset_impl.t -> Addr.Hash_set.t -> Addr.Hash_set.t -val identify_branches : Superset_impl.t -> Addr.Hash_set.t -val addrs_of_loops : addr list list -> Addr.Set.t -val addrs_of_filtered_loops : ?min_size:int -> Superset_impl.t -> Addr.Set.t diff --git a/lib/bap_superset_disasm/heuristics.ml b/lib/bap_superset_disasm/heuristics.ml index a1bf2324a..b68c1aaa4 100644 --- a/lib/bap_superset_disasm/heuristics.ml +++ b/lib/bap_superset_disasm/heuristics.ml @@ -45,16 +45,16 @@ let find_free_insns superset = let mem = Superset.Core.mem superset in let conflict_free_addrs = Superset.Core.fold superset ~init:([]) - ~f:(fun ~key ~data to_clamp -> + ~f:(fun ~key ~data free_insns -> let (addr,(memory,_)) = key, data in let len = Memory.length memory in let conflicts = Superset.Occlusion.range_seq_of_conflicts ~mem addr len in let no_conflicts = Seq.is_empty conflicts in if no_conflicts then - addr :: to_clamp + addr :: free_insns else ( - to_clamp + free_insns ) ) in let conflict_free_addrs = Addr.Set.of_list conflict_free_addrs in @@ -62,10 +62,10 @@ let find_free_insns superset = Superset.Occlusion.find_all_conflicts superset let extract_trim_interpretation_depth superset = - let to_clamp = find_free_insns superset in + let free_insns = find_free_insns superset in let visited = Addr.Hash_set.create () in let datas = Addr.Hash_set.create () in - Set.iter to_clamp ~f:(fun c -> + Set.iter free_insns ~f:(fun c -> if not Hash_set.(mem visited c) then if Superset.Core.mem superset c then ( Traverse.mark_descendent_bodies_at @@ -73,9 +73,17 @@ let extract_trim_interpretation_depth superset = ) ); Superset.Core.clear_each superset visited; - Set.iter to_clamp ~f:(Superset.Core.clear_bad superset); + Set.iter free_insns ~f:(Superset.Core.clear_bad superset); superset +(** Interpretation depth is a heuristics that recognizes that true *) +(** positive compiler intended instructions are the only kind of *) +(** instruction that can occur where the body of the instruction is *) +(** completely free of occlusion, aside from random false positives *) +(** that happen to be free. These false positives should not affect *) +(** the convergence on the true positive set. It is called *) +(** interpretation depth one because, without noise in the body, *) +(** there is a single instruction that can be interpreted. *) let extract_trim_protected_interpretation_depth superset = let protection = Addr.Hash_set.create () in let callsites = get_callsites ~threshold:0 superset in From 7735f77b735434114e505d22cb53577d37573fcc Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 16 May 2025 15:03:16 -0500 Subject: [PATCH 29/31] Restore grammar and fixpoint, used in plugin --- lib/bap_superset_disasm/fixpoint.ml | 27 ++++++ lib/bap_superset_disasm/fixpoint.mli | 6 ++ lib/bap_superset_disasm/grammar.ml | 118 +++++++++++++++++++++++++++ lib/bap_superset_disasm/grammar.mli | 5 ++ 4 files changed, 156 insertions(+) create mode 100644 lib/bap_superset_disasm/fixpoint.ml create mode 100644 lib/bap_superset_disasm/fixpoint.mli create mode 100644 lib/bap_superset_disasm/grammar.ml create mode 100644 lib/bap_superset_disasm/grammar.mli diff --git a/lib/bap_superset_disasm/fixpoint.ml b/lib/bap_superset_disasm/fixpoint.ml new file mode 100644 index 000000000..2d33a46d7 --- /dev/null +++ b/lib/bap_superset_disasm/fixpoint.ml @@ -0,0 +1,27 @@ +open Core +open Bap.Std + +let iterate rounds f superset = + let (superset) = f superset in + let rec do_analysis round superset = + if round = rounds then superset else + let (superset) = f superset in + do_analysis (round+1) superset in + do_analysis 1 superset + +let protect superset f = + let visited = Addr.Hash_set.create () in + let callsites = Heuristics.get_callsites ~threshold:0 superset in + let superset = Heuristics.tag_callsites visited ~callsites superset in + let superset = f superset in + Superset.Core.clear_each superset visited; + Trim.run superset + +let converge superset heuristics feature_pmap = + let superset = Trim.run superset in + let cache = Addr.Hash_set.create () in + List.iter Map.(keys feature_pmap) ~f:(fun addr -> + Traverse.mark_descendent_bodies_at superset ~visited:cache addr + ); + superset + diff --git a/lib/bap_superset_disasm/fixpoint.mli b/lib/bap_superset_disasm/fixpoint.mli new file mode 100644 index 000000000..6113d21c1 --- /dev/null +++ b/lib/bap_superset_disasm/fixpoint.mli @@ -0,0 +1,6 @@ +open Bap.Std + + +val iterate : int -> ('a -> 'a) -> 'a -> 'a +val protect : Superset_impl.t -> (Superset_impl.t -> Superset_impl.t) -> Superset_impl.t +val converge : Superset_impl.t -> 'a -> 'b Addr.Map.t -> Superset_impl.t diff --git a/lib/bap_superset_disasm/grammar.ml b/lib/bap_superset_disasm/grammar.ml new file mode 100644 index 000000000..4ce8511d6 --- /dev/null +++ b/lib/bap_superset_disasm/grammar.ml @@ -0,0 +1,118 @@ +open Core +open Bap.Std + +(** The objective here is to tag grammar structures while traversing + topologically in such a manner that we can converge the + probability of recognizing an intended sequence by the + compiler. After we've hit some recognition threshold, we begin + traversing forward from some activation point whereby we trim + occlusive instructions. To recognize grammars, we have several + means: one, loops are strongly connected components, and if + sequences must branch at some point only to reify at a common + point, expressing a path by which they can finally rejoin. *) +let tag_by_traversal ?(threshold=8) superset = + let visited = Addr.Hash_set.create () in + (*let callsites = Superset.get_callsites ~threshold:6 superset in + let superset = tag_callsites visited ~callsites superset in + let superset = Invariants.tag_layer_violations superset in + let superset = Invariants.tag_branch_violations superset in*) + let entries = Superset.entries_of_isg superset in + let branches = Superset.get_branches superset in + (*let branches = identify_branches superset in*) + (*let branches = linear_branch_sweep superset entries in*) + let cur_total = ref 0 in + let positives = ref [] in + let entry = ref None in + let tps = Addr.Hash_set.create () in + (* In the case that our current starting point, entry, is none, set *) + (* it to being the address of the lambda parameter, addr. Then, set *) + (* the current total number of recognized grammar items to zero, *) + (* as well as the positives since we're starting over *) + let pre addr = + if Option.is_none !entry then ( + entry := Some(addr); + cur_total := 0; + positives := []; + ); + if Hash_set.mem branches addr then ( + cur_total := !cur_total + 1; + positives := addr :: !positives; + if !cur_total >= threshold then ( + let open Option in + ignore (List.nth !positives threshold >>| + (fun convergent_point -> + Hash_set.add tps convergent_point)); + ) + ) in + let post addr = + entry := Option.value_map !entry ~default:!entry + ~f:(fun e -> if Addr.(e = addr) then None else Some(e)); + if Hash_set.mem branches addr then ( + cur_total := !cur_total - 1; + match !positives with + | _ :: remaining -> positives := remaining + | [] -> (); + ) in + Traverse.visit ~visited + ~pre ~post superset entries; + Hash_set.iter tps ~f:(fun tp -> + if not (Hash_set.mem visited tp) then ( + Traverse.with_descendents_at superset tp ~pre:(fun tp -> + let mark_bad addr = + if Superset.ISG.mem_vertex superset addr then ( + Superset.Core.mark_bad superset addr + ) in + Superset.Occlusion.with_data_of_insn superset tp ~f:mark_bad; + Hash_set.add visited tp; + ) ; + ) + ); + Hash_set.iter visited + ~f:(fun tp -> Superset.Core.clear_bad superset tp); + superset + + +let parents_of_insns superset component = + Set.fold component ~init:Addr.Set.empty ~f:(fun potential_parents addr -> + List.fold (Superset.ISG.ancestors superset addr) + ~init:potential_parents + ~f:(fun potential_parents ancestor -> + if not Set.(mem component ancestor) then + Set.add potential_parents ancestor + else potential_parents + ) + ) + +let addrs_of_loops loops = + List.fold_left loops ~init:Addr.Set.empty + ~f:(fun keep loop -> + Addr.Set.(union keep (of_list loop)) + ) + +let filter_loops ?(min_size=20) loops = + let loops = + List.filter loops ~f:(fun l -> List.length l > min_size) in + addrs_of_loops loops + +let addrs_of_filtered_loops ?(min_size=20) superset = + filter_loops ~min_size @@ Superset.ISG.raw_loops superset + +(** In the body of a loop, instructions fall through eventually to + themselves, which amounts to effectively a trigger of an + invariant. But the level at which invariants operate is too fine + grained to see the consequence propagated from conflicts that are + potentially in loops that are many instructions long. This + function cleanses the bodies of instructions that occur in loops + of a minimum size. *) +let tag_loop_contradictions ?(min_size=20) superset = + let keep = addrs_of_filtered_loops ~min_size superset in + (* Here we have to be careful; we only want to find instructions + that occur within a loop that produce a self-contradiction *) + let parents = parents_of_insns superset keep in + let to_remove = + Superset.Occlusion.conflicts_within_insns superset keep in + let to_remove = Set.inter to_remove parents in + let to_remove = Set.diff to_remove keep in + Set.iter to_remove ~f:(Superset.Core.mark_bad superset); + superset + diff --git a/lib/bap_superset_disasm/grammar.mli b/lib/bap_superset_disasm/grammar.mli new file mode 100644 index 000000000..f6cceeedd --- /dev/null +++ b/lib/bap_superset_disasm/grammar.mli @@ -0,0 +1,5 @@ +open Bap.Std + +val tag_loop_contradictions : ?min_size:int -> Superset_impl.t -> Superset_impl.t +val tag_by_traversal : ?threshold:int -> Superset_impl.t -> Superset_impl.t + From 502a7db7075e9b22fb021ac583dbfd918cd08505 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 16 May 2025 19:09:42 -0500 Subject: [PATCH 30/31] Make sure analyses arguments can be supplied --- plugins/superset_disasm/cmdoptions.ml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/superset_disasm/cmdoptions.ml b/plugins/superset_disasm/cmdoptions.ml index 3897aebfc..963fa9b2e 100644 --- a/plugins/superset_disasm/cmdoptions.ml +++ b/plugins/superset_disasm/cmdoptions.ml @@ -10,8 +10,8 @@ let tag_loop_contradictions = let tag_grammar = Grammar.tag_by_traversal ?threshold:None let list_analyses = [ - "Strongly Connected Component Data", tag_loop_contradictions; - "Grammar convergent", tag_grammar; + "StronglyConnectedComponentData", tag_loop_contradictions; + "Grammar-convergent", tag_grammar; ] type t = { From 7e23620465c345cfeb3402200d573ea7e8413553 Mon Sep 17 00:00:00 2001 From: Kenneth Adam Miller Date: Fri, 16 May 2025 20:32:49 -0500 Subject: [PATCH 31/31] Update analyses and heuristics args in disasm_corpora.sh --- lib_test/bap_superset_disasm/disasm_corpora.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib_test/bap_superset_disasm/disasm_corpora.sh b/lib_test/bap_superset_disasm/disasm_corpora.sh index 77e66e848..96b713cd6 100755 --- a/lib_test/bap_superset_disasm/disasm_corpora.sh +++ b/lib_test/bap_superset_disasm/disasm_corpora.sh @@ -1,5 +1,5 @@ find /Volumes -type f -executable -exec sh -c "file -i '{}' | grep -q 'x-executable; charset=binary'" \; -print > files.txt -time cat files.txt | parallel "bap superset-disasm --find-fn-culprit --heuristics=Callsites3,FixedpointGrammar,ImgEntry,TrimLimitedClamped --ground-truth-bin={} {}" +time cat files.txt | parallel "bap superset-disasm --find-fn-culprit --analyses="StronglyConnectedComponentData","Grammar-convergent" --heuristics=Callsites3,FixpointGrammar,ImgEntry,InterpretationDepthOne,FixpointInterpDepthOne --ground-truth-bin={} {}" bap supersetd-graph-metrics ./files.txt --print-fn-bins