diff --git a/README.md b/README.md index 3a62d202b..23c761493 100644 --- a/README.md +++ b/README.md @@ -112,10 +112,16 @@ flow analysis labels. * macOS ≥ 10.15 or GNU / Linux (we recommend Ubuntu Linux ≥ 18.04). * bazel ≥ 2.0 * Python ≥ 3.6 +* MySQL client (N.B. this is not the full MySQL server, just the connector) + * On macOS: `brew install mysql-client` + * On Ubuntu: `sudo apt-get install libmysqlclient-dev` +* A Fortran compiler: + * On macOS: `brew cask install gfortran` + * (Ubuntu has one by default) * (Optional) NVIDIA GPU with CUDA drivers for TensorFlow and PyTorch -Test that you have everything prepared by building and running the full test -suite: +Once you have the above requirements installed, test that everything is working +by building and running full test suite: ```sh $ bazel test //programl/... diff --git a/programl/Documentation/arXiv.2003.10536/paper.png b/programl/Documentation/arXiv.2003.10536/paper.png index 1f22afa9a..6d0c5189e 100644 Binary files a/programl/Documentation/arXiv.2003.10536/paper.png and b/programl/Documentation/arXiv.2003.10536/paper.png differ diff --git a/programl/Documentation/assets/llvm2graph-1-ir.png b/programl/Documentation/assets/llvm2graph-1-ir.png index 9ed4f0d32..398170404 100644 Binary files a/programl/Documentation/assets/llvm2graph-1-ir.png and b/programl/Documentation/assets/llvm2graph-1-ir.png differ diff --git a/programl/Documentation/assets/llvm2graph-5-types.dot b/programl/Documentation/assets/llvm2graph-5-types.dot new file mode 100644 index 000000000..c5397198f --- /dev/null +++ b/programl/Documentation/assets/llvm2graph-5-types.dot @@ -0,0 +1,126 @@ +digraph main { +margin=0; + +graph [ + fontsize=100, + nodesep=0.2, + ranksep=0.2, +]; +node [ + fontname=Inconsolata, + fontsize=25, + penwidth=2, + margin=0, +]; +edge [ + fontname=Inconsolata, + fontsize=22, + arrowsize=.8, + penwidth=1, +] + +// === Nodes === +external [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", 
fontcolor="#345393" label="[external]", width=2]; + +// Types: +i32 [shape=diamond, margin=0, style=filled, fillcolor="#CCCCCC", width=1, color="#CCCCCC", fontcolor="#222222", label="i32"]; + +// Constants: +const_0 [shape=octagon, margin=0, style=filled, fillcolor="#F4CCCC", width=1, color="#F4CCCC", fontcolor="#990000", label="val"]; // 0 +const_1 [shape=octagon, margin=0, style=filled, fillcolor="#F4CCCC", width=1, color="#F4CCCC", fontcolor="#990000", label="val"]; // 1 +const_minus_1 [shape=octagon, margin=0, style=filled, fillcolor="#F4CCCC", width=1, color="#F4CCCC", fontcolor="#990000", label="val"]; // -1 +const_minus_2 [shape=octagon, margin=0, style=filled, fillcolor="#F4CCCC", width=1, color="#F4CCCC", fontcolor="#990000", label="val"]; // -2 + +// Instructions: +inst_switch [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=2, label="switch"]; +inst_br [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=1, label="br"]; +phi [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=1, label="phi"]; +inst_add_minus_1 [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=1, label="add"]; +call_1 [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=1, label="call"]; +inst_add_minus_2 [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=1, label="add"]; +call_2 [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=1, label="call"]; +add_3 [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=1, label="add"]; +ret_2 [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=1, label="ret"]; +ret_1 [shape=box, style=filled, fillcolor="#C0DAFF", color="#C0DAFF", fontcolor="#345393", width=1, label="ret"]; + +// Variables: +arg_0 
[shape=ellipse, style=filled, fillcolor="#E99C9C", color="#E99C9C", width=1, fontcolor="#990000", label="var"]; // %0 +var_4 [shape=ellipse, style=filled, fillcolor="#E99C9C", color="#E99C9C", width=1, fontcolor="#990000", label="var"]; // %4 +var_5 [shape=ellipse, style=filled, fillcolor="#E99C9C", color="#E99C9C", width=1, fontcolor="#990000", label="var"]; // %5 +var_6 [shape=ellipse, style=filled, fillcolor="#E99C9C", color="#E99C9C", width=1, fontcolor="#990000", label="var"]; // %6 +var_7 [shape=ellipse, style=filled, fillcolor="#E99C9C", color="#E99C9C", width=1, fontcolor="#990000", label="var"]; // %7 +var_8 [shape=ellipse, style=filled, fillcolor="#E99C9C", color="#E99C9C", width=1, fontcolor="#990000", label="var"]; // %8 +var_10 [shape=ellipse, style=filled, fillcolor="#E99C9C", color="#E99C9C", width=1, fontcolor="#990000", label="var"]; // %10 + + +// === Edges === + +// Control edges: +inst_switch -> inst_add_minus_1 [color="#345393", weight=10, labelfontcolor="#345393", minlen=2]; +inst_switch -> phi [color="#345393", weight=10, labelfontcolor="#345393", minlen=2]; +inst_switch -> inst_br [color="#345393", weight=10, labelfontcolor="#345393", minlen=2]; +inst_br -> phi [color="#345393", weight=10]; +inst_add_minus_1 -> call_1 [color="#345393", weight=10]; +call_1 -> inst_add_minus_2 [color="#345393", weight=10]; +inst_add_minus_2 -> call_2 [color="#345393", weight=10]; +call_2 -> add_3 [color="#345393", weight=10]; +add_3 -> ret_2 [color="#345393", weight=10]; +phi -> ret_1 [color="#345393", weight=10]; + +// Data edges: +inst_add_minus_1 -> var_4 [color="#EA9999", labelfontcolor="#990000", weight=0]; +call_1 -> var_5 [color="#EA9999", labelfontcolor="#990000", weight=0]; +var_4 -> call_1 [color="#EA9999", labelfontcolor="#990000", weight=0]; +inst_add_minus_2 -> var_6 [color="#EA9999", labelfontcolor="#990000", weight=0]; +call_2 -> var_7 [color="#EA9999", labelfontcolor="#990000", weight=0]; +var_6 -> call_2 [color="#EA9999", 
labelfontcolor="#990000", weight=0]; +add_3 -> var_8 [color="#EA9999", labelfontcolor="#990000", weight=0]; +var_7 -> add_3 [color="#EA9999", labelfontcolor="#990000", weight=0]; +var_5 -> add_3 [color="#EA9999", labelfontcolor="#990000", weight=0]; +var_8 -> ret_2 [color="#EA9999", labelfontcolor="#990000", weight=0]; +phi -> var_10 [color="#EA9999", labelfontcolor="#990000", weight=0]; +var_10 -> ret_1 [color="#EA9999", labelfontcolor="#990000", weight=0]; +arg_0 -> inst_switch [color="#EA9999", labelfontcolor="#990000", weight=0]; +arg_0 -> inst_add_minus_1 [color="#EA9999", labelfontcolor="#990000", weight=0]; +arg_0 -> inst_add_minus_2 [color="#EA9999", labelfontcolor="#990000", weight=0]; +arg_0 -> phi [color="#EA9999", labelfontcolor="#990000", weight=0]; + +// Data edges (constants): + +const_0 -> inst_switch [color="#EA9999", labelfontcolor="#990000"]; +const_1 -> inst_switch [color="#EA9999", labelfontcolor="#990000"]; +const_1 -> phi [color="#EA9999", labelfontcolor="#990000"]; +const_minus_1 -> inst_add_minus_1 [color="#EA9999", labelfontcolor="#990000"]; +const_minus_2 -> inst_add_minus_2 [color="#EA9999", labelfontcolor="#990000"]; + +// Call edges +external -> inst_switch [color="#5dba83", weight=0]; +ret_2 -> external [color="#5dba83", weight=0]; +ret_1 -> external [color="#5dba83", weight=0]; +ret_1 -> call_2 [color="#5dba83", weight=0]; +ret_1 -> call_1 [color="#5dba83", weight=0]; +call_1 -> inst_switch [color="#5dba83", weight=0]; +call_2 -> inst_switch [color="#5dba83", weight=0]; +ret_2 -> call_2 [color="#5dba83", weight=0]; +ret_2 -> call_1 [color="#5dba83", weight=0]; + +// Type edges +i32 -> const_0 [color="#AAAAAA", penwidth=3, weight=1]; +i32 -> const_1 [color="#AAAAAA", penwidth=3, weight=1]; +i32 -> const_minus_1 [color="#AAAAAA", penwidth=3, weight=1]; +i32 -> const_minus_2 [color="#AAAAAA", penwidth=3, weight=1]; +i32 -> arg_0 [color="#AAAAAA", penwidth=3, weight=1]; +i32 -> var_4 [color="#AAAAAA", penwidth=3, weight=1]; +i32 -> var_5 
[color="#AAAAAA", penwidth=3, weight=1]; +i32 -> var_6 [color="#AAAAAA", penwidth=3, weight=1]; +i32 -> var_7 [color="#AAAAAA", penwidth=3, weight=1]; +i32 -> var_8 [color="#AAAAAA", penwidth=3, weight=1]; +i32 -> var_10 [color="#AAAAAA", penwidth=3, weight=1]; + +rankdir = TB; +{rank = same; inst_add_minus_1; phi; inst_br;} +{rank = same; ret_1; call_1;} +{rank = same; call_2, var_5, var_6;} +{rank = same; add_3, var_7;} +{rank = same; ret_2, var_8;} +} diff --git a/programl/Documentation/assets/llvm2graph-5-types.png b/programl/Documentation/assets/llvm2graph-5-types.png new file mode 100644 index 000000000..8ca279557 Binary files /dev/null and b/programl/Documentation/assets/llvm2graph-5-types.png differ diff --git a/programl/Documentation/assets/program_explorer.png b/programl/Documentation/assets/program_explorer.png index f371bc84d..25b851043 100644 Binary files a/programl/Documentation/assets/program_explorer.png and b/programl/Documentation/assets/program_explorer.png differ diff --git a/programl/graph/format/graphviz_converter.cc b/programl/graph/format/graphviz_converter.cc index 194aeeb98..ef6511674 100644 --- a/programl/graph/format/graphviz_converter.cc +++ b/programl/graph/format/graphviz_converter.cc @@ -134,7 +134,7 @@ labm8::Status SerializeGraphVizToString(const ProgramGraph& graph, // Determine the subgraph to add this node to. boost::subgraph* dst = &external; - if (i && node.type() != Node::CONSTANT) { + if (i && (node.type() == Node::INSTRUCTION || node.type() == Node::VARIABLE)) { dst = &functionGraphs[node.function()].get(); } @@ -192,29 +192,33 @@ labm8::Status SerializeGraphVizToString(const ProgramGraph& graph, } labm8::TruncateWithEllipsis(text, kMaximumLabelLen); attributes["label"] = text; + attributes["style"] = "filled"; // Set the node shape. 
switch (node.type()) { case Node::INSTRUCTION: attributes["shape"] = "box"; - attributes["style"] = "filled"; attributes["fillcolor"] = "#3c78d8"; attributes["fontcolor"] = "#ffffff"; break; case Node::VARIABLE: attributes["shape"] = "ellipse"; - attributes["style"] = "filled"; attributes["fillcolor"] = "#f4cccc"; attributes["color"] = "#990000"; attributes["fontcolor"] = "#990000"; break; case Node::CONSTANT: - attributes["shape"] = "diamond"; - attributes["style"] = "filled"; + attributes["shape"] = "octagon"; attributes["fillcolor"] = "#e99c9c"; attributes["color"] = "#990000"; attributes["fontcolor"] = "#990000"; break; + case Node::TYPE: + attributes["shape"] = "diamond"; + attributes["fillcolor"] = "#cccccc"; + attributes["color"] = "#cccccc"; + attributes["fontcolor"] = "#222222"; + break; } } @@ -242,15 +246,21 @@ labm8::Status SerializeGraphVizToString(const ProgramGraph& graph, attributes["color"] = "#65ae4d"; attributes["weight"] = "1"; break; + case Edge::TYPE: + attributes["color"] = "#aaaaaa"; + attributes["weight"] = "1"; + attributes["penwidth"] = "1.5"; + break; } // Set the edge label. if (edge.position()) { // Position labels for control edge are drawn close to the originating - // instruction. For data edges, they are drawn closer to the consuming - // instruction. + // instruction. For control edges, they are drawn close to the branching + // instruction. For data and type edges, they are drawn close to the + // consuming node. const string label = - edge.flow() == Edge::DATA ? "headlabel" : "taillabel"; + edge.flow() == Edge::CONTROL ? 
"taillabel" : "headlabel"; attributes[label] = std::to_string(edge.position()); attributes["labelfontcolor"] = attributes["color"]; } diff --git a/programl/graph/program_graph_builder.cc b/programl/graph/program_graph_builder.cc index 53843bbd9..2420c805b 100644 --- a/programl/graph/program_graph_builder.cc +++ b/programl/graph/program_graph_builder.cc @@ -26,7 +26,7 @@ namespace graph { ProgramGraphBuilder::ProgramGraphBuilder() { // Create the graph root node. - AddNode(Node::INSTRUCTION, ""); + AddNode(Node::INSTRUCTION, "[external]"); } Module* ProgramGraphBuilder::AddModule(const string& name) { @@ -67,6 +67,10 @@ Node* ProgramGraphBuilder::AddConstant(const string& text) { return AddNode(Node::CONSTANT, text); } +Node* ProgramGraphBuilder::AddType(const string& text) { + return AddNode(Node::TYPE, text); +} + labm8::StatusOr ProgramGraphBuilder::AddControlEdge(int32_t position, const Node* source, const Node* target) { @@ -143,6 +147,26 @@ labm8::StatusOr ProgramGraphBuilder::AddCallEdge(const Node* source, return AddEdge(Edge::CALL, /*position=*/0, source, target); } +labm8::StatusOr ProgramGraphBuilder::AddTypeEdge(int32_t position, + const Node* source, + const Node* target) { + DCHECK(source) << "nullptr argument"; + DCHECK(target) << "nullptr argument"; + + if (source->type() != Node::TYPE) { + return Status(labm8::error::Code::INVALID_ARGUMENT, + "Invalid source type ({}) for type edge. Expected type", + Node::Type_Name(source->type())); + } + if (target->type() == Node::INSTRUCTION) { + return Status(labm8::error::Code::INVALID_ARGUMENT, + "Invalid destination type (instruction) for type edge. " + "Expected {variable,constant,type}"); + } + + return AddEdge(Edge::TYPE, position, source, target); +} + labm8::StatusOr ProgramGraphBuilder::Build() { // Check that all nodes except the root are connected. The root is allowed to // have no connections in the case where it is an empty graph. 
diff --git a/programl/graph/program_graph_builder.h b/programl/graph/program_graph_builder.h index 4f524b3e1..a21c64c10 100644 --- a/programl/graph/program_graph_builder.h +++ b/programl/graph/program_graph_builder.h @@ -61,6 +61,8 @@ class ProgramGraphBuilder { Node* AddConstant(const string& text); + Node* AddType(const string& text); + // Edge factories. [[nodiscard]] labm8::StatusOr AddControlEdge(int32_t position, const Node* source, @@ -73,6 +75,10 @@ class ProgramGraphBuilder { [[nodiscard]] labm8::StatusOr AddCallEdge(const Node* source, const Node* target); + [[nodiscard]] labm8::StatusOr AddTypeEdge(int32_t position, + const Node* source, + const Node* target); + const Node* GetRootNode() const { return &graph_.node(0); } // Return the graph protocol buffer. @@ -99,6 +105,9 @@ class ProgramGraphBuilder { inline Edge* AddEdge(const Edge::Flow& flow, int32_t position, const Node* source, const Node* target); + // Return a mutable pointer to the root node in the graph. + Node* GetMutableRootNode() { return graph_.mutable_node(0); } + // Return a mutable pointer to the graph protocol buffer. ProgramGraph* GetMutableProgramGraph() { return &graph_; } @@ -110,7 +119,7 @@ class ProgramGraphBuilder { int32_t GetIndex(const Function* function); int32_t GetIndex(const Node* node); - // Maps which covert store the index of objects in repeated field lists. + // Maps that store the index of objects in repeated field lists. 
absl::flat_hash_map moduleIndices_; absl::flat_hash_map functionIndices_; absl::flat_hash_map nodeIndices_; diff --git a/programl/graph/py/program_graph_builder_test.py b/programl/graph/py/program_graph_builder_test.py index a1adb16ce..cb8652d8b 100644 --- a/programl/graph/py/program_graph_builder_test.py +++ b/programl/graph/py/program_graph_builder_test.py @@ -27,7 +27,7 @@ def test_empty_proto(): builder = program_graph_builder.ProgramGraphBuilder() with test.Raises(ValueError) as e_ctx: builder.Build() - assert "INSTRUCTION has no connections: ``" == str(e_ctx.value) + assert "INSTRUCTION has no connections: `[external]`" == str(e_ctx.value) def test_add_empty_module(): @@ -75,7 +75,7 @@ def test_linear_statement_control_flow(): assert len(builder.Build().node) == 3 - assert builder.Build().node[builder.root].text == "" + assert builder.Build().node[builder.root].text == "[external]" assert builder.Build().node[builder.root].type == node_pb2.Node.INSTRUCTION assert builder.Build().node[a].text == "a" diff --git a/programl/ir/llvm/inst2vec_encoder.py b/programl/ir/llvm/inst2vec_encoder.py index 783ded2d2..8e77f694f 100644 --- a/programl/ir/llvm/inst2vec_encoder.py +++ b/programl/ir/llvm/inst2vec_encoder.py @@ -50,13 +50,13 @@ ) -def NodeFullText(node: node_pb2.Node) -> str: +def NodeFullText( + graph: program_graph_pb2.ProgramGraph, + node: node_pb2.Node +) -> str: """Get the full text of a node, or an empty string if not set.""" - if len(node.features.feature["full_text"].bytes_list.value): - return ( - node.features.feature["full_text"].bytes_list.value[0].decode("utf-8") - ) - return "" + idx = node.features.feature["llvm_string"].int64_list.value[0] + return graph.features.feature["strings"].bytes_list.value[idx].decode("utf-8") class Inst2vecEncoder(object): @@ -94,7 +94,7 @@ def Encode( """ # Gather the instruction texts to pre-process. 
lines = [ - [NodeFullText(node)] + [NodeFullText(proto, node)] for node in proto.node if node.type == node_pb2.Node.INSTRUCTION ] @@ -122,6 +122,7 @@ def Encode( # Add the node features. var_embedding = self.dictionary["!IDENTIFIER"] const_embedding = self.dictionary["!IMMEDIATE"] + type_embedding = self.dictionary["!IMMEDIATE"] # Types are immediates text_index = 0 for node in proto.node: @@ -143,6 +144,12 @@ def Encode( node.features.feature["inst2vec_embedding"].int64_list.value.append( const_embedding ) + elif node.type == node_pb2.Node.TYPE: + node.features.feature["inst2vec_embedding"].int64_list.value.append( + type_embedding + ) + else: + raise TypeError(f"Unknown node type {node}") proto.features.feature["inst2vec_annotated"].int64_list.value.append(1) return proto diff --git a/programl/ir/llvm/inst2vec_encoder_test.py b/programl/ir/llvm/inst2vec_encoder_test.py index 5401099db..e72ea1634 100644 --- a/programl/ir/llvm/inst2vec_encoder_test.py +++ b/programl/ir/llvm/inst2vec_encoder_test.py @@ -56,10 +56,21 @@ def AddVariable(self, full_text: str): def Build(self): proto = super(Inst2vecGraphBuilder, self).Build() + + # Add the root node string feature. + proto.node[0].features.feature["llvm_string"].int64_list.value[:] = [0] + + # Build the strings list. + strings_list = list(set(self.full_texts.values())) + proto.features.feature["strings"].bytes_list.value[:] = [ + string.encode("utf-8") for string in strings_list + ] + + # Add the string indices. 
for node, full_text in self.full_texts.items(): - proto.node[node].features.feature["full_text"].bytes_list.value.append( - full_text.encode("utf-8") - ) + idx = strings_list.index(full_text) + node_feature = proto.node[node].features.feature["llvm_string"] + node_feature.int64_list.value.append(idx) return proto diff --git a/programl/ir/llvm/internal/BUILD b/programl/ir/llvm/internal/BUILD index 0d200ebb7..5947b63d4 100644 --- a/programl/ir/llvm/internal/BUILD +++ b/programl/ir/llvm/internal/BUILD @@ -43,6 +43,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_set", "@labm8//labm8/cpp:status_macros", "@labm8//labm8/cpp:statusor", + "@labm8//labm8/cpp:logging", "@labm8//labm8/cpp:string", "@llvm//10.0.0", ], diff --git a/programl/ir/llvm/internal/program_graph_builder.cc b/programl/ir/llvm/internal/program_graph_builder.cc index eece4c3c8..f14abda0a 100644 --- a/programl/ir/llvm/internal/program_graph_builder.cc +++ b/programl/ir/llvm/internal/program_graph_builder.cc @@ -20,6 +20,7 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "labm8/cpp/logging.h" #include "labm8/cpp/status_macros.h" #include "labm8/cpp/string.h" #include "llvm/IR/BasicBlock.h" @@ -39,6 +40,18 @@ namespace ir { namespace llvm { namespace internal { +ProgramGraphBuilder::ProgramGraphBuilder(const ProgramGraphOptions& options) + : programl::graph::ProgramGraphBuilder(), + options_(options), + blockCount_(0), + stringsList_((*GetMutableProgramGraph() + ->mutable_features() + ->mutable_feature())["strings"] + .mutable_bytes_list()) { + // Add an empty + graph::AddScalarFeature(GetMutableRootNode(), "llvm_string", AddString("")); +} + labm8::StatusOr ProgramGraphBuilder::VisitBasicBlock( const ::llvm::BasicBlock& block, const Function* functionMessage, InstructionMap* instructions, ArgumentConsumerMap* argumentConsumers, @@ -194,7 +207,7 @@ labm8::StatusOr ProgramGraphBuilder::VisitFunction( if (function.isDeclaration()) { Node* node = 
AddInstruction("; undefined function", functionMessage); - graph::AddScalarFeature(node, "full_text", ""); + graph::AddScalarFeature(node, "llvm_string", AddString("")); functionEntryExits.first = node; functionEntryExits.second.push_back(node); return functionEntryExits; @@ -325,7 +338,7 @@ Node* ProgramGraphBuilder::AddLlvmInstruction( const LlvmTextComponents text = textEncoder_.Encode(instruction); Node* node = AddInstruction(text.opcode_name, function); node->set_block(blockCount_); - graph::AddScalarFeature(node, "full_text", text.text); + graph::AddScalarFeature(node, "llvm_string", AddString(text.text)); // Add profiling information features, if available. uint64_t profTotalWeight; @@ -345,9 +358,13 @@ Node* ProgramGraphBuilder::AddLlvmInstruction( Node* ProgramGraphBuilder::AddLlvmVariable(const ::llvm::Instruction* operand, const programl::Function* function) { const LlvmTextComponents text = textEncoder_.Encode(operand); - Node* node = AddVariable(text.lhs_type, function); + Node* node = AddVariable("var", function); node->set_block(blockCount_); - graph::AddScalarFeature(node, "full_text", text.lhs); + graph::AddScalarFeature(node, "llvm_string", AddString(text.lhs)); + + compositeTypeParts_.clear(); // Reset after previous call. + Node* type = GetOrCreateType(operand->getType()); + CHECK(AddTypeEdge(/*position=*/0, type, node).ok()); return node; } @@ -355,22 +372,114 @@ Node* ProgramGraphBuilder::AddLlvmVariable(const ::llvm::Instruction* operand, Node* ProgramGraphBuilder::AddLlvmVariable(const ::llvm::Argument* argument, const programl::Function* function) { const LlvmTextComponents text = textEncoder_.Encode(argument); - Node* node = AddVariable(text.lhs_type, function); + Node* node = AddVariable("var", function); node->set_block(blockCount_); - graph::AddScalarFeature(node, "full_text", text.lhs); + graph::AddScalarFeature(node, "llvm_string", AddString(text.lhs)); + + compositeTypeParts_.clear(); // Reset after previous call. 
+ Node* type = GetOrCreateType(argument->getType()); + CHECK(AddTypeEdge(/*position=*/0, type, node).ok()); return node; } Node* ProgramGraphBuilder::AddLlvmConstant(const ::llvm::Constant* constant) { const LlvmTextComponents text = textEncoder_.Encode(constant); - Node* node = AddConstant(text.lhs_type); + Node* node = AddConstant("val"); node->set_block(blockCount_); - graph::AddScalarFeature(node, "full_text", text.text); + graph::AddScalarFeature(node, "llvm_string", AddString(text.text)); + + compositeTypeParts_.clear(); // Reset after previous call. + Node* type = GetOrCreateType(constant->getType()); + CHECK(AddTypeEdge(/*position=*/0, type, node).ok()); return node; } +Node* ProgramGraphBuilder::AddLlvmType(const ::llvm::Type* type) { + // Dispatch to the type-specific handlers. + if (::llvm::dyn_cast<::llvm::StructType>(type)) { + return AddLlvmType(::llvm::dyn_cast<::llvm::StructType>(type)); + } else if (::llvm::dyn_cast<::llvm::PointerType>(type)) { + return AddLlvmType(::llvm::dyn_cast<::llvm::PointerType>(type)); + } else if (::llvm::dyn_cast<::llvm::FunctionType>(type)) { + return AddLlvmType(::llvm::dyn_cast<::llvm::FunctionType>(type)); + } else if (::llvm::dyn_cast<::llvm::ArrayType>(type)) { + return AddLlvmType(::llvm::dyn_cast<::llvm::ArrayType>(type)); + } else if (::llvm::dyn_cast<::llvm::VectorType>(type)) { + return AddLlvmType(::llvm::dyn_cast<::llvm::VectorType>(type)); + } else { + const LlvmTextComponents text = textEncoder_.Encode(type); + Node *node = AddType(text.text); + graph::AddScalarFeature(node, "llvm_string", AddString(text.text)); + return node; + } +} + +Node* ProgramGraphBuilder::AddLlvmType(const ::llvm::StructType* type) { + Node* node = AddType("struct"); + compositeTypeParts_[type] = node; + graph::AddScalarFeature(node, "llvm_string", + AddString(textEncoder_.Encode(type).text)); + + // Add types for the struct elements, and add type edges. 
+ for (int i = 0; i < type->getNumElements(); ++i) { + const auto& member = type->elements()[i]; + // Re-use the type if it already exists to prevent duplication of member + // types. + auto memberNode = GetOrCreateType(member); + CHECK(AddTypeEdge(/*position=*/i, memberNode, node).ok()); + } + + return node; +} + +Node* ProgramGraphBuilder::AddLlvmType(const ::llvm::PointerType* type) { + Node* node = AddType("*"); + graph::AddScalarFeature(node, "llvm_string", + AddString(textEncoder_.Encode(type).text)); + + auto elementType = type->getElementType(); + auto parent = compositeTypeParts_.find(elementType); + if (parent == compositeTypeParts_.end()) { + // Re-use the type if it already exists to prevent duplication. + auto elementNode = GetOrCreateType(type->getElementType()); + CHECK(AddTypeEdge(/*position=*/0, elementNode, node).ok()); + } else { + // Bottom-out for self-referencing types. + CHECK(AddTypeEdge(/*position=*/0, node, parent->second).ok()); + } + + return node; +} + +Node* ProgramGraphBuilder::AddLlvmType(const ::llvm::FunctionType* type) { + Node* node = AddType("fn"); + graph::AddScalarFeature(node, "llvm_string", + AddString(textEncoder_.Encode(type).text)); + return node; +} + +Node* ProgramGraphBuilder::AddLlvmType(const ::llvm::ArrayType* type) { + Node* node = AddType("[]"); + graph::AddScalarFeature(node, "llvm_string", + AddString(textEncoder_.Encode(type).text)); + // Re-use the type if it already exists to prevent duplication. + auto elementType = GetOrCreateType(type->getElementType()); + CHECK(AddTypeEdge(/*position=*/0, elementType, node).ok()); + return node; +} + +Node* ProgramGraphBuilder::AddLlvmType(const ::llvm::VectorType* type) { + Node* node = AddType("vector"); + graph::AddScalarFeature(node, "llvm_string", + AddString(textEncoder_.Encode(type).text)); + // Re-use the type if it already exists to prevent duplication. 
+ auto elementType = GetOrCreateType(type->getElementType()); + CHECK(AddTypeEdge(/*position=*/0, elementType, node).ok()); + return node; +} + labm8::StatusOr ProgramGraphBuilder::Build( const ::llvm::Module& module) { // A map from functions to their entry and exit nodes. @@ -465,6 +574,27 @@ void ProgramGraphBuilder::Clear() { programl::graph::ProgramGraphBuilder::Clear(); } +Node* ProgramGraphBuilder::GetOrCreateType(const ::llvm::Type* type) { + auto it = types_.find(type); + if (it == types_.end()) { + Node* node = AddLlvmType(type); + types_[type] = node; + return node; + } + return it->second; +} + +int32_t ProgramGraphBuilder::AddString(const string& text) { + auto it = stringsListPositions_.find(text); + if (it == stringsListPositions_.end()) { + int32_t index = stringsListPositions_.size(); + stringsListPositions_[text] = index; + stringsList_->add_value(text); + return index; + } + return it->second; +} + } // namespace internal } // namespace llvm } // namespace ir diff --git a/programl/ir/llvm/internal/program_graph_builder.h b/programl/ir/llvm/internal/program_graph_builder.h index a2e03dd37..beb533eb1 100644 --- a/programl/ir/llvm/internal/program_graph_builder.h +++ b/programl/ir/llvm/internal/program_graph_builder.h @@ -64,16 +64,19 @@ using ArgumentConsumerMap = // A specialized program graph builder for LLVM-IR. class ProgramGraphBuilder : public programl::graph::ProgramGraphBuilder { public: - explicit ProgramGraphBuilder(const ProgramGraphOptions& options) - : programl::graph::ProgramGraphBuilder(), - options_(options), - blockCount_(0){} + explicit ProgramGraphBuilder(const ProgramGraphOptions& options); - [[nodiscard]] labm8::StatusOr Build( - const ::llvm::Module& module); + [[nodiscard]] labm8::StatusOr Build( + const ::llvm::Module& module); void Clear(); + // Return the node representing a type. If no node already exists + // for this type, a new node is created and added to the graph. 
In + // the case of composite types, multiple new nodes may be added by + // this call, and the root type returned. + Node* GetOrCreateType(const ::llvm::Type* type); + protected: [[nodiscard]] labm8::StatusOr VisitFunction( const ::llvm::Function& function, const Function* functionMessage); @@ -93,6 +96,19 @@ class ProgramGraphBuilder : public programl::graph::ProgramGraphBuilder { Node* AddLlvmVariable(const ::llvm::Argument* argument, const Function* function); Node* AddLlvmConstant(const ::llvm::Constant* constant); + Node* AddLlvmType(const ::llvm::Type* type); + Node* AddLlvmType(const ::llvm::StructType* type); + Node* AddLlvmType(const ::llvm::PointerType* type); + Node* AddLlvmType(const ::llvm::FunctionType* type); + Node* AddLlvmType(const ::llvm::ArrayType* type); + Node* AddLlvmType(const ::llvm::VectorType* type); + + // Add a string to the strings list and return its position. + // + // We use a graph-level "strings" feature to store a list of the original + // LLVM-IR string corresponding to each graph nodes. This allows to us to + // refer to the same string from multiple nodes without duplication. + int32_t AddString(const string& text); private: const ProgramGraphOptions options_; @@ -110,6 +126,32 @@ class ProgramGraphBuilder : public programl::graph::ProgramGraphBuilder { // visited. absl::flat_hash_map> constants_; + + // A mapping from string table value to its position in the "strings_table" + // graph-level feature. + absl::flat_hash_map stringsListPositions_; + // The underlying storage for the strings table. + BytesList* stringsList_; + + // A map from an LLVM type to the node message that represents it. + absl::flat_hash_map types_; + + // When adding a new type to the graph we need to know whether the type that + // we are adding is part of a composite type that references itself. 
For + // example: + // + // struct BinaryTree { + // int data; + // struct BinaryTree* left; + // struct BinaryTree* right; + // } + // + // When the recursive GetOrCreateType() resolves the "left" member, it needs + // to know that the parent BinaryTree type has already been processed. This + // map stores the Nodes corresponding to any parent structs that have been + // already added in a call to GetOrCreateType(). It must be cleared between + // calls. + absl::flat_hash_map compositeTypeParts_; }; } // namespace internal diff --git a/programl/ir/llvm/py/llvm_test.py b/programl/ir/llvm/py/llvm_test.py index c8640cdfa..0882b6b2a 100644 --- a/programl/ir/llvm/py/llvm_test.py +++ b/programl/ir/llvm/py/llvm_test.py @@ -38,8 +38,13 @@ """ -def GetStringScalar(proto, name): - return proto.features.feature[name].bytes_list.value[0].decode("utf-8") +def NodeFullText( + graph: program_graph_pb2.ProgramGraph, + node: node_pb2.Node +) -> str: + """Get the full text of a node, or an empty string if not set.""" + idx = node.features.feature["llvm_string"].int64_list.value[0] + return graph.features.feature["strings"].bytes_list.value[idx].decode("utf-8") def test_simple_ir(): @@ -51,33 +56,35 @@ def test_simple_ir(): assert len(proto.module) == 1 assert proto.module[0].name == "foo.c" - assert len(proto.node) == 6 - assert proto.node[0].text == "" + assert len(proto.node) == 7 + assert proto.node[0].text == "[external]" assert proto.node[0].type == node_pb2.Node.INSTRUCTION assert proto.node[1].text == "add" assert proto.node[1].type == node_pb2.Node.INSTRUCTION - assert ( - GetStringScalar(proto.node[1], "full_text") == "%3 = add nsw i32 %1, %0" - ) + assert NodeFullText(proto, proto.node[1]) == "%3 = add nsw i32 %1, %0" assert proto.node[2].text == "ret" assert proto.node[2].type == node_pb2.Node.INSTRUCTION - assert GetStringScalar(proto.node[2], "full_text") == "ret i32 %3" + assert NodeFullText(proto, proto.node[2]) == "ret i32 %3" - assert proto.node[3].text == "i32" + 
assert proto.node[3].text == "var" assert proto.node[3].type == node_pb2.Node.VARIABLE - assert GetStringScalar(proto.node[3], "full_text") == "i32 %3" + assert NodeFullText(proto, proto.node[3]) == "i32 %3" - # Use startswith() to compare names for these last two variables as thier - # order may differ. assert proto.node[4].text == "i32" - assert proto.node[4].type == node_pb2.Node.VARIABLE - assert GetStringScalar(proto.node[4], "full_text").startswith("i32 %") + assert proto.node[4].type == node_pb2.Node.TYPE + assert NodeFullText(proto, proto.node[4]) == "i32" - assert proto.node[5].text == "i32" + # Use startswith() to compare names for these last two variables as thier + # order may differ. + assert proto.node[5].text == "var" assert proto.node[5].type == node_pb2.Node.VARIABLE - assert GetStringScalar(proto.node[5], "full_text").startswith("i32 %") + assert NodeFullText(proto, proto.node[5]).startswith("i32 %") + + assert proto.node[6].text == "var" + assert proto.node[6].type == node_pb2.Node.VARIABLE + assert NodeFullText(proto, proto.node[6]).startswith("i32 %") def test_opt_level(): diff --git a/programl/models/lstm/BUILD b/programl/models/lstm/BUILD index 7c9c47677..b1b643a16 100644 --- a/programl/models/lstm/BUILD +++ b/programl/models/lstm/BUILD @@ -27,7 +27,7 @@ py_library( "//programl/proto:epoch_py", "//third_party/py/labm8", "//third_party/py/numpy", - "//third_party/py/tensorflow", + "//third_party/py/torch", ], ) diff --git a/programl/models/lstm/lstm.py b/programl/models/lstm/lstm.py index df853a020..90c5566e7 100644 --- a/programl/models/lstm/lstm.py +++ b/programl/models/lstm/lstm.py @@ -14,25 +14,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""An LSTM for instruction classification.""" -import pathlib -import tempfile from typing import Any from typing import Dict from typing import List import numpy as np -import tensorflow as tf +import torch +from torch import nn +from torch import optim from labm8.py import app from labm8.py.progress import NullContext from labm8.py.progress import ProgressContext +from programl.models.ggnn.node_embeddings import NodeEmbeddings +from programl.models.ggnn.loss import Loss from programl.models.batch_data import BatchData from programl.models.batch_results import BatchResults from programl.models.lstm.lstm_batch import LstmBatchData from programl.models.model import Model from programl.proto import epoch_pb2 - FLAGS = app.FLAGS app.DEFINE_integer( @@ -61,19 +62,24 @@ "The value used for the positive class in the 1-hot selector embedding " "vectors. Has no effect when selector embeddings are not used.", ) -app.DEFINE_boolean( - "cudnn_lstm", - True, - "If set, use CuDNNLSTM implementation when a GPU is available. Else use " - "default Keras implementation. Note that the two implementations are " - "incompatible - a model saved using one LSTM type cannot be restored using " - "the other LSTM type.", -) app.DEFINE_float("learning_rate", 0.001, "The mode learning rate.") app.DEFINE_boolean( "trainable_embeddings", True, "Whether the embeddings are trainable." ) +# Embeddings options. +app.DEFINE_string( + "text_embedding_type", + "random", + "The type of node embeddings to use. 
One of " + "{constant_zero, constant_random, random}.", +) +app.DEFINE_integer( + "text_embedding_dimensionality", + 32, + "The dimensionality of node text embeddings.", +) + class Lstm(Model): """An LSTM model for node-level classification.""" @@ -82,103 +88,56 @@ def __init__( self, vocabulary: Dict[str, int], node_y_dimensionality: int, + graph_y_dimensionality: int, + graph_x_dimensionality: int, + use_selector_embeddings: bool, test_only: bool = False, name: str = "lstm", ): """Constructor.""" - super(Lstm, self).__init__( - test_only=test_only, vocabulary=vocabulary, name=name - ) + super().__init__(test_only=test_only, vocabulary=vocabulary, name=name) self.vocabulary = vocabulary self.node_y_dimensionality = node_y_dimensionality + self.graph_y_dimensionality = graph_y_dimensionality + self.graph_x_dimensionality = graph_x_dimensionality + self.node_selector_dimensionality = 2 if use_selector_embeddings else 0 # Flag values. self.batch_size = FLAGS.batch_size self.padded_sequence_length = FLAGS.padded_sequence_length - # Reset any previous Tensorflow session. This is required when running - # consecutive LSTM models in the same process. - tf.compat.v1.keras.backend.clear_session() - - @staticmethod - def MakeLstmLayer(*args, **kwargs): - """Construct an LSTM layer. - - If a GPU is available and --cudnn_lstm, this will use NVIDIA's fast - CuDNNLSTM implementation. Else it will use Keras' builtin LSTM, which is - much slower but works on CPU. 
- """ - if FLAGS.cudnn_lstm and tf.compat.v1.test.is_gpu_available(): - return tf.compat.v1.keras.layers.CuDNNLSTM(*args, **kwargs) - else: - return tf.compat.v1.keras.layers.LSTM(*args, **kwargs, implementation=1) - - def CreateKerasModel(self) -> tf.compat.v1.keras.Model: - """Construct the tensorflow computation graph.""" - vocab_ids = tf.compat.v1.keras.layers.Input( - batch_shape=(self.batch_size, self.padded_sequence_length,), - dtype="int32", - name="sequence_in", - ) - embeddings = tf.compat.v1.keras.layers.Embedding( - input_dim=len(self.vocabulary) + 2, - input_length=self.padded_sequence_length, - output_dim=FLAGS.hidden_size, - name="embedding", - trainable=FLAGS.trainable_embeddings, - )(vocab_ids) - - selector_vectors = tf.compat.v1.keras.layers.Input( - batch_shape=(self.batch_size, self.padded_sequence_length, 2), - dtype="float32", - name="selector_vectors", + self.model = LstmModel( + node_embeddings=NodeEmbeddings( + node_embeddings_type=FLAGS.text_embedding_type, + use_selector_embeddings=self.node_selector_dimensionality, + selector_embedding_value=FLAGS.selector_embedding_value, + embedding_shape=( + # Add one to the vocabulary size to account for the out-of-vocab token. + len(vocabulary) + 1, + FLAGS.text_embedding_dimensionality, + ), + ), + loss=Loss( + num_classes=self.node_y_dimensionality, + has_aux_input=self.has_aux_input, + intermediate_loss_weight=None, # NOTE(cec): Intentionally broken. + class_prevalence_weighting=False, + ), + padded_sequence_length=self.padded_sequence_length, + learning_rate=FLAGS.learning_rate, + test_only=test_only, + hidden_size=FLAGS.hidden_size, + hidden_dense_layer_count=FLAGS.hidden_dense_layer_count, ) - lang_model_input = tf.compat.v1.keras.layers.Concatenate( - axis=2, name="embeddings_and_selector_vectorss" - )([embeddings, selector_vectors],) - - # Recurrent layers. 
- lang_model = self.MakeLstmLayer( - FLAGS.hidden_size, return_sequences=True, name="lstm_1" - )(lang_model_input) - lang_model = self.MakeLstmLayer( - FLAGS.hidden_size, - return_sequences=True, - return_state=False, - name="lstm_2", - )(lang_model) - - # Dense layers. - for i in range(1, FLAGS.hidden_dense_layer_count + 1): - lang_model = tf.compat.v1.keras.layers.Dense( - FLAGS.hidden_size, activation="relu", name=f"dense_{i}", - )(lang_model) - node_out = tf.compat.v1.keras.layers.Dense( - self.node_y_dimensionality, activation="sigmoid", name="node_out", - )(lang_model) - - model = tf.compat.v1.keras.Model( - inputs=[vocab_ids, selector_vectors], outputs=[node_out], - ) - model.compile( - optimizer=tf.compat.v1.keras.optimizers.Adam( - learning_rate=FLAGS.learning_rate - ), - metrics=["accuracy"], - loss=["categorical_crossentropy"], - loss_weights=[1.0], - ) + @property + def num_classes(self) -> int: + return self.node_y_dimensionality or self.graph_y_dimensionality - return model - - def CreateModelData(self, test_only: bool) -> None: - """Initialize an LSTM model. This is called during Initialize().""" - # Create the Tensorflow session and graph for the model. 
- tf.get_logger().setLevel("ERROR") - SetAllowedGrowthOnKerasSession() - self.model = self.CreateKerasModel() + @property + def has_aux_input(self) -> bool: + return self.graph_x_dimensionality > 0 def RunBatch( self, @@ -203,24 +162,32 @@ def RunBatch( self.batch_size, self.padded_sequence_length, ), model_data.encoded_sequences.shape - assert model_data.selector_vectors.shape == ( + assert model_data.selector_ids.shape == ( self.batch_size, self.padded_sequence_length, - 2, - ), model_data.selector_vectors.shape - - x = [model_data.encoded_sequences, model_data.selector_vectors] - y = [model_data.node_labels] + ), model_data.selector_ids.shape if epoch_type == epoch_pb2.TRAIN: - loss, *_ = self.model.train_on_batch(x, y) + if not self.model.training: + self.model.train() + targets, logits = self.model(model_data.encoded_sequences, model_data.selector_ids, model_data.node_labels) else: - loss = None + if self.model.training: + self.model.eval() + self.model.opt.zero_grad() + # Inference only, don't trace the computation graph. + with torch.no_grad(): + targets, logits = self.model(model_data.encoded_sequences, model_data.selector_ids, model_data.node_labels) - padded_predictions = self.model.predict_on_batch(x) + loss = self.model.loss((logits, None), targets) + + if epoch_type == epoch_pb2.TRAIN: + loss.backward() + self.model.opt.step() + self.model.opt.zero_grad() # Reshape the outputs. - predictions = self.ReshapePaddedModelOutput(batch_data, padded_predictions) + predictions = self.ReshapePaddedModelOutput(batch_data, outputs) # Flatten the targets and predictions lists so that we can compare them. # Shape (batch_node_count, node_y_dimensionality). 
@@ -228,7 +195,10 @@ def RunBatch( predictions = np.concatenate(predictions) return BatchResults.Create( - targets=targets, predictions=predictions, loss=loss, + targets=model_data.node_labels, + predictions=logits.detach().cpu().numpy(), + learning_rate=self.model.learning_rate, + loss=loss.item(), ) def ReshapePaddedModelOutput( @@ -274,36 +244,71 @@ def ReshapePaddedModelOutput( def GetModelData(self) -> Any: """Get the model state.""" - # According to https://keras.io/getting-started/faq/, it is not recommended - # to pickle a Keras model. So as a workaround, I use Keras's saving - # mechanism to store the weights, and pickle that. - with tempfile.TemporaryDirectory(prefix="lstm_pickle_") as d: - path = pathlib.Path(d) / "weights.h5" - self.model.save(path) - with open(path, "rb") as f: - model_data = f.read() - return model_data + return { + "model_state_dict": self.model.state_dict(), + "optimizer_state_dict": self.model.opt.state_dict(), + "scheduler_state_dict": self.model.scheduler.state_dict(), + } def LoadModelData(self, data_to_load: Any) -> None: """Restore the model state.""" - # Load the weights from a file generated by ModelDataToSave(). - with tempfile.TemporaryDirectory(prefix="lstm_pickle_") as d: - path = pathlib.Path(d) / "weights.h5" - with open(path, "wb") as f: - f.write(data_to_load) - - # The default TF graph is finalized in Initialize(), so we must - # first reset the session and create a new graph. - tf.compat.v1.reset_default_graph() - SetAllowedGrowthOnKerasSession() - - self.model = tf.compat.v1.keras.models.load_model(path) - - -def SetAllowedGrowthOnKerasSession(): - """Allow growth on GPU for Keras.""" - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True - session = tf.compat.v1.Session(config=config) - tf.compat.v1.keras.backend.set_session(session) - return session + self.model.load_state_dict(data_to_load["model_state_dict"]) + # only restore opt if needed. opt should be None o/w. 
class LstmModel(nn.Module):
    """LSTM for per-node classification over padded token sequences.

    Wraps node embeddings, a single LSTM layer, and a linear projection to
    per-node class log-probabilities. The optimizer and a no-op learning
    rate scheduler are only constructed when training is enabled; the
    scheduler exists so that Lstm.GetModelData()/LoadModelData() can
    round-trip model state.
    """

    def __init__(
        self,
        node_embeddings: NodeEmbeddings,
        loss: Loss,
        padded_sequence_length: int,
        test_only: bool,
        learning_rate: float,
        hidden_size: int,
        hidden_dense_layer_count: int,  # TODO(cec): Implement.
    ):
        super().__init__()
        self.node_embeddings = node_embeddings
        self.loss = loss
        self.padded_sequence_length = padded_sequence_length
        self.learning_rate = learning_rate
        self.hidden_size = hidden_size

        # Input features are the text embedding concatenated with the
        # 2-wide selector embedding.
        self.lstm = nn.LSTM(
            self.node_embeddings.embedding_dimensionality + 2,
            self.hidden_size,
        )
        # NOTE(review): output dimensionality is hard-coded to 2 classes;
        # presumably binary node classification — confirm against the Loss
        # num_classes configured by the caller.
        self.hidden2label = nn.Linear(self.hidden_size, 2)

        if test_only:
            self.opt = None
            self.scheduler = None
            self.eval()
        else:
            self.opt = optim.AdamW(self.parameters(), lr=self.learning_rate)
            # BUG FIX: GetModelData()/LoadModelData() serialize
            # `self.scheduler`, but no scheduler was ever created, which
            # raised AttributeError. A constant (gamma=1.0) schedule keeps
            # training behavior unchanged while making the state dict
            # round-trippable.
            self.scheduler = optim.lr_scheduler.StepLR(
                self.opt, step_size=1, gamma=1.0
            )

    def forward(self, encoded_sequences, selector_ids, node_labels):
        """Run the LSTM over one batch of padded sequences.

        Args:
          encoded_sequences: Int array of vocab ids, shape
            (batch_size, padded_sequence_length).
          selector_ids: Int array, shape (batch_size, padded_sequence_length).
          node_labels: Int array of per-node labels; passed through
            unchanged as the targets.

        Returns:
          A (targets, logits) tuple. BUG FIX: this previously returned
          (logits, targets), but the caller in Lstm.RunBatch() unpacks
          `targets, logits = self.model(...)`, silently swapping the two.
        """
        # Accept numpy arrays from the batch builder without copying.
        encoded_sequences = torch.as_tensor(encoded_sequences, dtype=torch.long)
        selector_ids = torch.as_tensor(selector_ids, dtype=torch.long)
        node_labels = torch.as_tensor(node_labels, dtype=torch.long)

        # Embed and concatenate sequences and selector vectors.
        embeddings = self.node_embeddings(encoded_sequences, selector_ids)

        # nn.LSTM expects (seq_len, batch, features) by default.
        lstm_in = embeddings.view(
            self.padded_sequence_length, encoded_sequences.size(0), -1
        )
        lstm_out, _ = self.lstm(lstm_in)

        # BUG FIX: apply the projection to the 3-D LSTM output directly
        # (nn.Linear operates on the last dim). The previous
        # `view(padded_sequence_length, -1)` flattened the batch and hidden
        # dims together, which only type-checked for batch size 1 and broke
        # the subsequent dim=2 softmax on a 2-D tensor.
        label_space = self.hidden2label(lstm_out)
        # BUG FIX: use nn.functional; the previous code referenced an
        # undefined name `F` (torch.nn.functional was never imported).
        # Debug print() statements removed.
        logits = nn.functional.log_softmax(label_space, dim=2)

        return node_labels, logits
This is the raw representation of a node, such as the diff --git a/programl/task/dataflow/BUILD b/programl/task/dataflow/BUILD index 8ad5bbb00..b730ea703 100644 --- a/programl/task/dataflow/BUILD +++ b/programl/task/dataflow/BUILD @@ -119,6 +119,7 @@ py_library( "//programl/models/lstm:lstm_batch", "//third_party/py/labm8", "//third_party/py/numpy", + "//third_party/py/keras_preprocessing", ], ) @@ -178,6 +179,20 @@ py_binary( ], ) +py_test( + name = "train_lstm_test", + srcs = ["train_lstm_test.py"], + data = [ + "//programl/test/data:llvm_ir_graphs", + "//programl/test/data:llvm_ir", + "//programl/test/data:llvm_ir_reachability_features" + ], + deps = [ + ":train_lstm", + "//third_party/py/labm8", + ] +) + py_library( name = "vocabulary", srcs = ["vocabulary.py"], diff --git a/programl/task/dataflow/dataset/BUILD b/programl/task/dataflow/dataset/BUILD index cce93bb05..3eb86db64 100644 --- a/programl/task/dataflow/dataset/BUILD +++ b/programl/task/dataflow/dataset/BUILD @@ -118,7 +118,7 @@ cc_library( py_library( name = "pathflag", srcs = ["pathflag.py"], - visibility = ["//programl/task/dataflow:__subpackages__"], + visibility = ["//programl/task:__subpackages__"], deps = [ "//third_party/py/labm8", ], diff --git a/programl/task/dataflow/lstm_batch_builder.py b/programl/task/dataflow/lstm_batch_builder.py index b767da740..706fe6d32 100644 --- a/programl/task/dataflow/lstm_batch_builder.py +++ b/programl/task/dataflow/lstm_batch_builder.py @@ -18,8 +18,8 @@ from typing import Optional import numpy as np -import tensorflow as tf from labm8.py import app +from keras_preprocessing.sequence import pad_sequences from programl.graph.format.py import graph_serializer from programl.models.base_batch_builder import BaseBatchBuilder @@ -51,12 +51,12 @@ def __init__( # Mutable state. self.graph_node_sizes = [] self.vocab_ids = [] - self.selector_vectors = [] + self.selector_ids = [] self.targets = [] # Padding values. 
self._vocab_id_pad = len(self.vocabulary) + 1 - self._selector_vector_pad = np.zeros((0, 2), dtype=np.int32) + self._selector_id_pad = 0 self._node_label_pad = np.zeros( (0, self.node_y_dimensionality), dtype=np.int32 ) @@ -77,14 +77,14 @@ def _Build(self) -> BatchData: self.vocab_ids += [ np.array([self._vocab_id_pad], dtype=np.int32) ] * pad_count - self.selector_vectors += [self._selector_vector_pad] * pad_count + self.selector_ids += [np.array([self._selector_id_pad], dtype=np.int32)] * pad_count self.targets += [self._node_label_pad] * pad_count batch = BatchData( graph_count=len(self.graph_node_sizes), model_data=LstmBatchData( graph_node_sizes=np.array(self.graph_node_sizes, dtype=np.int32), - encoded_sequences=tf.compat.v1.keras.preprocessing.sequence.pad_sequences( + encoded_sequences=pad_sequences( self.vocab_ids, maxlen=self.padded_sequence_length, dtype="int32", @@ -92,15 +92,15 @@ def _Build(self) -> BatchData: truncating="post", value=self._vocab_id_pad, ), - selector_vectors=tf.compat.v1.keras.preprocessing.sequence.pad_sequences( - self.selector_vectors, + selector_ids=pad_sequences( + self.selector_ids, maxlen=self.padded_sequence_length, - dtype="float32", + dtype="int32", padding="pre", truncating="post", - value=np.zeros(2, dtype=np.float32), + value=self._selector_id_pad, ), - node_labels=tf.compat.v1.keras.preprocessing.sequence.pad_sequences( + node_labels=pad_sequences( self.targets, maxlen=self.padded_sequence_length, dtype="float32", @@ -116,7 +116,7 @@ def _Build(self) -> BatchData: # Reset mutable state. 
self.graph_node_sizes = [] self.vocab_ids = [] - self.selector_vectors = [] + self.selector_ids = [] self.targets = [] return batch @@ -142,7 +142,7 @@ def OnItem(self, item) -> Optional[BatchData]: ) for n in node_list ] - selector_values = np.array( + selector_ids = np.array( [ features.node_features.feature_list["data_flow_root_node"] .feature[n] @@ -151,10 +151,7 @@ def OnItem(self, item) -> Optional[BatchData]: ], dtype=np.int32, ) - selector_vectors = np.zeros((selector_values.size, 2), dtype=np.float32) - selector_vectors[ - np.arange(selector_values.size), selector_values - ] = FLAGS.selector_embedding_value + # TODO: FLAGS.selector_embedding_value targets = np.array( [ features.node_features.feature_list["data_flow_value"] @@ -174,7 +171,7 @@ def OnItem(self, item) -> Optional[BatchData]: self.graph_node_sizes.append(len(node_list)) self.vocab_ids.append(vocab_ids) - self.selector_vectors.append(selector_vectors) + self.selector_ids.append(selector_ids) self.targets.append(targets_1hot) if len(self.graph_node_sizes) >= self.batch_size: diff --git a/programl/task/dataflow/train_lstm.py b/programl/task/dataflow/train_lstm.py index 7c400e6fc..16d2d1400 100644 --- a/programl/task/dataflow/train_lstm.py +++ b/programl/task/dataflow/train_lstm.py @@ -19,11 +19,13 @@ classification targets for data flow problems. """ import pathlib +import sys import time from typing import Dict import numpy as np from labm8.py import app +from labm8.py import bazelutil from labm8.py import gpu_scheduler from labm8.py import humanize from labm8.py import pbutil @@ -35,6 +37,10 @@ from programl.task.dataflow import dataflow from programl.task.dataflow.graph_loader import DataflowGraphLoader from programl.task.dataflow.lstm_batch_builder import DataflowLstmBatchBuilder + +# NOTE(cec): Workaround to prevent third_party package name shadowing from +# labm8. 
+sys.path.insert(0, str(bazelutil.DataPath("programl"))) from third_party.py.ncc import vocabulary @@ -116,7 +122,9 @@ def TrainDataflowLSTM( # # For these data flow experiments, our graphs contain per-node binary # classification targets (e.g. reachable / not-reachable). - model = Lstm(vocabulary=vocab, test_only=False, node_y_dimensionality=2,) + model = Lstm(vocabulary=vocab, test_only=False, node_y_dimensionality=2, + graph_y_dimensionality=0, + graph_x_dimensionality=0, use_selector_embeddings=True) if restore_from: # Pick up training where we left off. @@ -134,8 +142,6 @@ def TrainDataflowLSTM( model.Initialize() start_epoch_step, start_graph_cumsum = 1, 0 - model.model.summary() - # Create training batches and split into epochs. epochs = EpochBatchIterator( MakeBatchBuilder( diff --git a/programl/task/dataflow/train_lstm_test.py b/programl/task/dataflow/train_lstm_test.py new file mode 100644 index 000000000..7e5a0773b --- /dev/null +++ b/programl/task/dataflow/train_lstm_test.py @@ -0,0 +1,92 @@ +# Copyright 2019-2020 the ProGraML authors. +# +# Contact Chris Cummins . +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Smoke test for //programl/task/dataflow:train_lstm."""
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

from labm8.py import app
from labm8.py import bazelutil

# Data dependencies declared in the BUILD file.
TRAIN_LSTM = bazelutil.DataPath("programl/programl/task/dataflow/train_lstm")

LLVM_IR = bazelutil.DataPath("programl/programl/test/data/llvm_ir")

LLVM_IR_GRAPHS = bazelutil.DataPath("programl/programl/test/data/llvm_ir_graphs")

# NOTE(review): the BUILD data dependency is named
# "llvm_ir_reachability_features"; assumes its files land under
# "llvm_ir_reachability" — confirm against the filegroup definition.
LLVM_IR_GRAPH_REACHABILITY_FEATURES = bazelutil.DataPath(
    "programl/programl/test/data/llvm_ir_reachability"
)


def make_test_reachability_dataflow_dataset(root: Path) -> Path:
    """Make a miniature dataset for reachability dataflow.

    Lays out graphs, IR and reachability labels under `root` and splits the
    graphs 60/20/20 into train/val/test via relative symlinks.
    """
    for subdir in ("train", "val", "test", "labels"):
        (root / subdir).mkdir()

    shutil.copytree(LLVM_IR_GRAPHS, root / "graphs")
    shutil.copytree(LLVM_IR, root / "ir")
    shutil.copytree(
        LLVM_IR_GRAPH_REACHABILITY_FEATURES, root / "labels" / "reachability"
    )

    ngraphs = len(list(LLVM_IR_GRAPHS.iterdir()))
    # Cumulative split boundaries: first 60% train, next 20% val, rest test.
    train_cutoff = int(ngraphs * 0.6)
    val_cutoff = int(ngraphs * 0.8)

    for i, graph in enumerate(LLVM_IR_GRAPHS.iterdir()):
        if i < train_cutoff:
            split = "train"
        elif i < val_cutoff:
            split = "val"
        else:
            split = "test"
        stem = graph.name[: -len(".ProgramGraph.pb")]
        os.symlink(
            f"../graphs/{stem}.ProgramGraph.pb",
            root / split / f"{stem}.ProgramGraph.pb",
        )

    return root


def main():
    """Run a tiny end-to-end LSTM training job; exit non-zero on failure."""
    with tempfile.TemporaryDirectory() as tmpdir:
        dataset_root = make_test_reachability_dataflow_dataset(Path(tmpdir))
        process = subprocess.Popen(
            [
                TRAIN_LSTM,
                "--path", str(dataset_root),
                "--analysis", "reachability",
                "--max_data_flow_steps", str(10),
                "--val_graph_count", str(10),
                "--val_seed", str(0xCC),
                "--train_graph_counts", "10,20",
                "--padded_sequence_length", str(10),
                "--batch_size", str(8),
            ]
        )
        process.communicate()
        if process.returncode:
            sys.exit(1)


if __name__ == "__main__":
    app.Run(main)
/dev/null +++ b/programl/task/devmap/dataset/BUILD @@ -0,0 +1,38 @@ +# Copyright 2019-2020 the ProGraML authors. +# +# Contact Chris Cummins . +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +py_binary( + name = "create", + srcs = ["create.py"], + deps = [ + "//programl/ir/llvm/py:llvm", + "//programl/proto:features_py", + "//programl/task/dataflow/dataset:pathflag", + "//third_party/py/labm8", + "//third_party/py/numpy", + "//third_party/py/pandas", + ], +) + +py_test( + name = "create_test", + timeout = "long", + srcs = ["create_test.py"], + deps = [ + ":create", + "//third_party/py/labm8", + ], +) diff --git a/programl/task/devmap/dataset/create.py b/programl/task/devmap/dataset/create.py new file mode 100644 index 000000000..506dd0bdc --- /dev/null +++ b/programl/task/devmap/dataset/create.py @@ -0,0 +1,195 @@ +# Copyright 2019-2020 the ProGraML authors. +# +# Contact Chris Cummins . +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Prepare the CPU/GPU OpenCL device-mapping dataset.""" +import io +import os +import shutil +import tempfile +from pathlib import Path +from zipfile import ZipFile + +import numpy as np +import pandas as pd +import requests +from labm8.py import app +from labm8.py import crypto +from labm8.py import pbutil + +from programl.ir.llvm.py import llvm +from programl.task.dataflow.dataset import pathflag + +FLAGS = app.FLAGS + + +def cachedir() -> Path: + """Return the path of the cache directory.""" + if os.environ.get("TEST_TMPDIR"): + return Path(os.environ["TEST_TMPDIR"]) + else: + return Path("~/.cache/programl").expanduser() + + +def download(url: str, checksum: str) -> bytes: + """Download from a URL and return its contents.""" + cachepath = cachedir() / f"{checksum}.data" + if cachepath.is_file(): + with open(cachepath, "rb") as f: + content = f.read() + else: + print("downloading", url, "...") + content = requests.get(url).content + cachepath.parent.mkdir(parents=True, exist_ok=True) + with open(cachepath, "wb") as f: + f.write(content) + + actual_checksum = crypto.sha256(content) + if actual_checksum != checksum: + raise ValueError( + f"Checksum mismatch of downloaded file {url}. " + f"Expected: {checksum}. 
Actual: {actual_checksum}" + ) + return content + + +def download_csv(url: str, checksum: str) -> pd.DataFrame: + """Download and return a CSV file as a pandas data frame.""" + return pd.read_csv(io.StringIO(download(url, checksum).decode("utf-8"))) + + +def reshape_df(df: pd.DataFrame) -> pd.DataFrame: + """Extract and reshape the useful bits of the dataframe.""" + names = [ + f"{benchmark}-{dataset}" + for benchmark, dataset in df[["benchmark", "dataset"]].values + ] + return pd.DataFrame( + { + "name": names, + "transfer_bytes": df["transfer"], + "transfer_bytes_log1p": np.log1p(df["transfer"]), + "wgsize": df["wgsize"], + "wgsize_log1p": np.log1p(df["transfer"]), + "label": df["runtime_gpu"] < df["runtime_cpu"], + } + ) + + +def name2ncc_path(name: str, src_dir: Path, extension: str): + """Resolve a NCC data archive path from a kernel name.""" + path = src_dir / f"{name}{extension}" + if path.is_file(): + return path + + # Some of the benchmark sources are dataset dependent. This is reflected by + # the dataset name being concatenated to the path. + name_components = name.split("-") + + new_name = "-".join(name_components[:-1]) + path = src_dir / f"{new_name}{extension}" + if path.is_file(): + return path + + new_name = "-".join(name_components[:-1]) + "_" + name_components[-1] + path = src_dir / f"{new_name}{extension}" + if path.is_file(): + return path + + raise FileNotFoundError(f"No OpenCL source found for {name}") + + +def dump_src(path: Path, df: pd.DataFrame, ncc_dir: Path): + """Dump the OpenCL source files.""" + for name in df["name"].values: + try: + src = name2ncc_path(name, ncc_dir / "kernels_cl", ".cl") + dst = path / "src" / f"{name}.cl" + shutil.copyfile(src, dst) + except FileNotFoundError: + # Not all kernels correspond to OpenCL files. This is fine. 
+ pass + + +def dump_ir(path: Path, df: pd.DataFrame, ncc_dir: Path): + """Dump the LLVM-IR files.""" + for name in df["name"].values: + src = name2ncc_path(name, ncc_dir / "kernels_ir", ".ll") + dst = path / "ir" / f"{name}.ll" + shutil.copyfile(src, dst) + + +def build_graphs(df: pd.DataFrame, ir_dir: Path, graph_dir: Path): + """Build ProgramGraphs from LLVM-IR and features.""" + for _, row in df.iterrows(): + with open(ir_dir / f"{row['name']}.ll") as f: + ir = f.read() + graph = llvm.BuildProgramGraph(ir) + graph.features.feature["devmap_label"].int64_list.value[:] = [row["label"]] + graph.features.feature["wgsize"].int64_list.value[:] = [row["wgsize"]] + graph.features.feature["transfer_bytes"].int64_list.value[:] = [ + row["transfer_bytes"] + ] + graph.features.feature["wgsize_log1p"].float_list.value[:] = [ + row["wgsize_log1p"] + ] + graph.features.feature["transfer_bytes_log1p"].float_list.value[:] = [ + row["transfer_bytes_log1p"] + ] + pbutil.ToFile( + graph, graph_dir / f"{row['name']}.ProgramGraph.pb", exist_ok=False + ) + + +def create_devmap_dataset(path: Path): + """Create the devmap dataset.""" + # First create the output directories. Fail if they already exist. 
+ (path / "graphs_amd").mkdir(parents=True) + (path / "graphs_nvidia").mkdir() + (path / "ir").mkdir() + (path / "src").mkdir() + + amd = download_csv( + url="http://raw.githubusercontent.com/ChrisCummins/phd/65643fa5ad6769ce4678535cd2f9f37b6a467c45/datasets/opencl/device_mapping/amd.csv", + checksum="0076271192aa9a0a7c21aa9a637e34cd4460f8e21e756215dd23ffb2ae62dc62", + ) + nvidia = download_csv( + url="http://raw.githubusercontent.com/ChrisCummins/phd/65643fa5ad6769ce4678535cd2f9f37b6a467c45/datasets/opencl/device_mapping/nvidia.csv", + checksum="095c1ccef333e0a65e0e70b3ebde0aef851b61528ec46496a5d1687905abd099", + ) + opencl_ir_zip = download( + url="http://polybox.ethz.ch/index.php/s/U08Z3xLhvbLk8io/download", + checksum="3c840f84936a83e329c7a94d011c45ddfcfce8bdbb1a9b1904123e83851913d5", + ) + + amd = reshape_df(amd) + nvidia = reshape_df(nvidia) + + with tempfile.TemporaryDirectory() as tmpdir: + with ZipFile(io.BytesIO(opencl_ir_zip), "r") as f: + f.extractall(tmpdir) + dump_src(path, amd, Path(tmpdir)) + dump_ir(path, amd, Path(tmpdir)) + + build_graphs(amd, path / "ir", path / "graphs_amd") + build_graphs(nvidia, path / "ir", path / "graphs_nvidia") + + +def main(): + """Main entry point.""" + create_devmap_dataset(Path(pathflag.path())) + + +if __name__ == "__main__": + app.Run(main) diff --git a/programl/task/devmap/dataset/create_test.py b/programl/task/devmap/dataset/create_test.py new file mode 100644 index 000000000..d0b81a453 --- /dev/null +++ b/programl/task/devmap/dataset/create_test.py @@ -0,0 +1,37 @@ +# Copyright 2019-2020 the ProGraML authors. +# +# Contact Chris Cummins . +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Smoke test for //programl/task/devmap/dataset:create.""" +from pathlib import Path + +from labm8.py import test + +from programl.task.devmap.dataset.create import create_devmap_dataset + + +def test_create_devmap_dataset(tempdir: Path): + """Test dataset creation.""" + create_devmap_dataset(tempdir) + assert (tempdir / "ir").is_dir() + assert (tempdir / "src").is_dir() + assert (tempdir / "graphs_amd").is_dir() + assert (tempdir / "graphs_nvidia").is_dir() + + assert len(list((tempdir / "graphs_amd").iterdir())) == 680 + assert len(list((tempdir / "graphs_nvidia").iterdir())) == 680 + + +if __name__ == "__main__": + test.Main() diff --git a/programl/test/data/BUILD b/programl/test/data/BUILD index fe02b11d4..06106e7ea 100644 --- a/programl/test/data/BUILD +++ b/programl/test/data/BUILD @@ -18,6 +18,12 @@ package(default_visibility = ["//programl:__subpackages__"]) +filegroup( + name = "dataflow_task_mini_dataset_tar", + testonly = 1, + srcs = ["dataflow_task_mini_dataset.tar.bz2"], +) + filegroup( name = "hlo_protos", testonly = 1, diff --git a/requirements.txt b/requirements.txt index 86482c188..2690e91d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,24 +1,23 @@ -cycler==0.10.0 # Needed by matplotlib. -decorator==4.3.0 -gast==0.2.2. # Dependency of tensorflow. -GPUtil==1.4.0 -Keras==2.3.1 -kiwisolver==1.0.1 # Needed by matplotlib. 
-matplotlib==2.2.0rc1 -networkx==2.2 -numpy==1.16.4 -pandas==0.24.1 -pathlib==1.0.1 -protobuf==3.6.1 -psutil==5.4.5 -pyparsing==2.2.0 -python-dateutil==2.6.1 -pytz==2018.3 -PyYAML==4.2b4 -scikit-learn==0.20.3 -scipy==1.2.1 -seaborn==0.9.0 -tensorflow==1.14.0 # NOTE: Must be installed manually with `pip install`. -torch==1.3.0 -tqdm==4.38.0 -labm8==2020.06.07 +absl-py>=0.9.0 +cycler>=0.10.0 # Needed by matplotlib. +decorator>=4.3.0 +GPUtil>=1.4.0 +keras_preprocessing >= 1.1.1, < 1.2 +kiwisolver>=1.0.1 # Needed by matplotlib. +labm8>=2020.06.07 +matplotlib>=2.2.0rc1 +networkx>=2.2 +numpy >= 1.16.0, < 1.19.0 +pandas>=0.24.1 +pathlib>=1.0.1 +protobuf>=3.13.0 +psutil>=5.4.5 +pyparsing>=2.2.0 +python-dateutil>=2.6.1 +pytz>=2018.3 +PyYAML>=4.2b4 +scikit-learn>=0.23.2 +scipy>=1.2.1 +seaborn>=0.9.0 +torch>=1.3.0 +tqdm>=4.38.0 diff --git a/third_party/py/keras/BUILD b/third_party/py/absl/BUILD similarity index 53% rename from third_party/py/keras/BUILD rename to third_party/py/absl/BUILD index 9148ce991..ec133c8d1 100644 --- a/third_party/py/keras/BUILD +++ b/third_party/py/absl/BUILD @@ -4,15 +4,12 @@ load("@programl_requirements//:requirements.bzl", "requirement") package(default_visibility = ["//visibility:public"]) -licenses(["notice"]) # MIT +licenses(["notice"]) # Apache 2.0 py_library( - name = "keras", + name = "absl", srcs = ["//third_party/py:empty.py"], deps = [ - requirement("keras"), - requirement("PyYAML"), # Implicit dependency. - "//third_party/py/scipy", # Implicit dependency. 
- "//third_party/py/tensorflow", + requirement("absl-py"), ], ) diff --git a/third_party/py/scikit_learn/BUILD b/third_party/py/scikit_learn/BUILD index ca8a9eacb..6e061272c 100644 --- a/third_party/py/scikit_learn/BUILD +++ b/third_party/py/scikit_learn/BUILD @@ -11,6 +11,7 @@ py_library( srcs = ["//third_party/py:empty.py"], deps = [ requirement("scikit-learn"), + requirement("joblib"), "//third_party/py/scipy", ], ) diff --git a/third_party/py/tensorflow/BUILD b/third_party/py/tensorflow/BUILD index 0ca627bf8..981f8c2a3 100644 --- a/third_party/py/tensorflow/BUILD +++ b/third_party/py/tensorflow/BUILD @@ -1,35 +1,32 @@ -# A wrapper around tensorflow pip package to support optional gpu. -# -# If a python target requires TensorFlow, it should depend on this package -# (i.e. //third_party/py/tensorflow), instead of requirement("tensorflow"). -# This is because the pip package for TensorFlow with CUDA support has a -# different name. -# -# Use: -# -# from third_party.py.tensorflow import tf -# -# to import Tensorflow rather than "import tensorflow as tf" due to a bug in -# packing Tensorflow as a pip dependency for bazel. -# See github.com/bazelbuild/rules_python/issues/71 - -load("@programl_requirements//:requirements.bzl", "requirement") +load("@programl_tensorflow_requirements//:requirements.bzl", "requirement") package(default_visibility = ["//visibility:public"]) licenses(["notice"]) # Apache 2.0. -exports_files(["LICENSE"]) - py_library( name = "tensorflow", - srcs = [":tf.py"], + srcs = ["//third_party/py:empty.py"], deps = [ - # rules_pip fails for TensorFlow, causing an empty package to be - # downloaded. 
Because of this, we require the user to manually install - # the version of TensorFlow specified in requirements.txt: - # requirement("tensorflow"), - "//third_party/py/numpy", - "//third_party/py/protobuf", + requirement("tensorflow"), + # Copied from: + # https://github.com/tensorflow/tensorflow/blob/f3a015274fadab00ec8cad92af2a968e0ecd434f/tensorflow/tools/pip_package/setup.py#L54-L73 + requirement("absl-py"), + requirement("astunparse"), + requirement("flatbuffers"), + requirement("gast"), + requirement("google_pasta"), + requirement("h5py"), + requirement("keras_preprocessing"), + requirement("numpy"), + requirement("opt_einsum"), + requirement("protobuf"), + requirement("six"), + requirement("tensorboard"), + requirement("tensorflow_estimator"), + requirement("termcolor"), + requirement("wheel"), + requirement("wrapt"), + "//third_party/py/keras_preprocessing", ], ) diff --git a/third_party/py/tensorflow/LICENSE b/third_party/py/tensorflow/LICENSE deleted file mode 100644 index 4862420c0..000000000 --- a/third_party/py/tensorflow/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ -Copyright 2018 The TensorFlow Authors. All rights reserved. - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity.
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright 2017, The TensorFlow Authors. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/third_party/py/tensorflow/requirements.txt b/third_party/py/tensorflow/requirements.txt new file mode 100644 index 000000000..9118dd403 --- /dev/null +++ b/third_party/py/tensorflow/requirements.txt @@ -0,0 +1,19 @@ +tensorflow == 2.3.0 +# Tensorflow requirements +# https://github.com/tensorflow/tensorflow/blob/f3a015274fadab00ec8cad92af2a968e0ecd434f/tensorflow/tools/pip_package/setup.py#L54-L73 +absl-py >= 0.7.0 +astunparse == 1.6.3 +flatbuffers >= 1.12 +gast == 0.3.3 +google_pasta >= 0.1.8 +h5py >= 2.10.0, < 2.11.0 +keras_preprocessing >= 1.1.1, < 1.2 +numpy >= 1.16.0, < 1.19.0 +opt_einsum >= 2.3.2 +protobuf >= 3.13.0 +tensorboard >= 2.3.0, < 3 +tensorflow_estimator >= 2.3.0, < 2.4.0 +termcolor >= 1.1.0 +wrapt >= 1.11.1 +wheel >= 0.26 +six >= 1.12.0 \ No newline at end of file diff --git a/third_party/py/tensorflow/tf.py b/third_party/py/tensorflow/tf.py deleted file mode 100644 index 751c76348..000000000 --- a/third_party/py/tensorflow/tf.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Import Tensorflow. - -This module is a drop-in replacement for regular tensorflow. Replace: - - import tensorflow as tf - -with: - - from third_party.py.tensorflow import tf - -This wrapper is required to workaround a known bug with packaging Tensorflow -as a pip dependency with bazel. 
See: -github.com/bazelbuild/rules_python/issues/71 -""" -import pathlib -import sys - -try: - # Try importing Tensorflow the vanilla way. This will succeed once - # github.com/bazelbuild/rules_python/issues/71 is fixed. - import tensorflow -except (ImportError, ModuleNotFoundError): - # That failed, so see if there is a version of the package elsewhere on the - # system that we can force python into loading. - extra_site_packages = [ - "/usr/local/lib/python3.7/site-packages", - ] - for path in extra_site_packages: - tensorflow_site_package = pathlib.Path(path) / "tensorflow" - if tensorflow_site_package.is_dir(): - # Add the additional packages location to the python path. - sys.path.insert(0, path) - try: - import tensorflow - - break - except (ImportError, ModuleNotFoundError): - pass - finally: - # Restore python path. - del sys.path[0] - -# Disable deprecation warnings on the glob import below. -import tensorflow as tf - -tf.get_logger().setLevel("ERROR") - -# Import Tensorflow into this module's namespace. If the above import attempts -# failed, this will raise an error. -from tensorflow import * - -# Reset logging level now that we have imported everything. -tf.get_logger().setLevel("INFO") - -# Spoof that we've imported the package generically. -__file__ = tensorflow.__file__