[clangd] Split DecisionForest Evaluate() into one func per tree.

This allows us MSAN to instrument this function. Previous version is not
instrumentable due to it shear volume.

Differential Revision: https://reviews.llvm.org/D88536
This commit is contained in:
Utkarsh Saxena 2020-09-30 09:36:37 +02:00
parent f4b9dfd9bc
commit 45698ac005
1 changed files with 60 additions and 39 deletions

View File

@ -1,7 +1,7 @@
"""Code generator for Code Completion Model Inference.
Tool runs on the Decision Forest model defined in {model} directory.
It generates two files: {output_dir}/{filename}.h and {output_dir}/{filename}.cpp
It generates two files: {output_dir}/{filename}.h and {output_dir}/{filename}.cpp
The generated files defines the Example class named {cpp_class} having all the features as class members.
The generated runtime provides an `Evaluate` function which can be used to score a code completion candidate.
"""
@ -39,34 +39,32 @@ def header_guard(filename):
def boost_node(n, label, next_label):
"""Returns code snippet for a leaf/boost node.
Adds value of leaf to the score and jumps to the root of the next tree."""
return "%s: Score += %s; goto %s;" % (
label, n['score'], next_label)
"""Returns code snippet for a leaf/boost node."""
return "%s: return %s;" % (label, n['score'])
def if_greater_node(n, label, next_label):
"""Returns code snippet for a if_greater node.
Jumps to true_label if the Example feature (NUMBER) is greater than the threshold.
Comparing integers is much faster than comparing floats. Assuming floating points
Jumps to true_label if the Example feature (NUMBER) is greater than the threshold.
Comparing integers is much faster than comparing floats. Assuming floating points
are represented as IEEE 754, it order-encodes the floats to integers before comparing them.
Control falls through if condition is evaluated to false."""
threshold = n["threshold"]
return "%s: if (E.%s >= %s /*%s*/) goto %s;" % (
label, n['feature'], order_encode(threshold), threshold, next_label)
return "%s: if (E.get%s() >= %s /*%s*/) goto %s;" % (
label, n['feature'], order_encode(threshold), threshold, next_label)
def if_member_node(n, label, next_label):
"""Returns code snippet for a if_member node.
Jumps to true_label if the Example feature (ENUM) is present in the set of enum values
Jumps to true_label if the Example feature (ENUM) is present in the set of enum values
described in the node.
Control falls through if condition is evaluated to false."""
members = '|'.join([
"BIT(%s_type::%s)" % (n['feature'], member)
for member in n["set"]
])
return "%s: if (E.%s & (%s)) goto %s;" % (
label, n['feature'], members, next_label)
return "%s: if (E.get%s() & (%s)) goto %s;" % (
label, n['feature'], members, next_label)
def node(n, label, next_label):
@ -94,8 +92,6 @@ def tree(t, tree_num, node_num):
"""
label = "t%d_n%d" % (tree_num, node_num)
code = []
if node_num == 0:
code.append("t%d:" % tree_num)
if t["operation"] == "boost":
code.append(node(t, label=label, next_label="t%d" % (tree_num + 1)))
@ -119,13 +115,15 @@ def gen_header_code(features_json, cpp_class, filename):
"""Returns code for header declaring the inference runtime.
Declares the Example class named {cpp_class} inside relevant namespaces.
The Example class contains all the features as class members. This
The Example class contains all the features as class members. This
class can be used to represent a code completion candidate.
Provides `float Evaluate()` function which can be used to score the Example.
"""
setters = []
getters = []
for f in features_json:
feature = f["name"]
if f["kind"] == "NUMBER":
# Floats are order-encoded to integers for faster comparison.
setters.append(
@ -138,8 +136,15 @@ def gen_header_code(features_json, cpp_class, filename):
raise ValueError("Unhandled feature type.", f["kind"])
# Class members represent all the features of the Example.
class_members = ["uint32_t %s = 0;" % f['name'] for f in features_json]
class_members = [
"uint32_t %s = 0;" % f['name']
for f in features_json
]
getters = [
"LLVM_ATTRIBUTE_ALWAYS_INLINE uint32_t get%s() const { return %s; }"
% (f['name'], f['name'])
for f in features_json
]
nline = "\n "
guard = header_guard(filename)
return """#ifndef %s
@ -150,6 +155,10 @@ def gen_header_code(features_json, cpp_class, filename):
%s
class %s {
public:
// Setters.
%s
// Getters.
%s
private:
@ -158,18 +167,16 @@ private:
// Produces an integer that sorts in the same order as F.
// That is: a < b <==> orderEncode(a) < orderEncode(b).
static uint32_t OrderEncode(float F);
friend float Evaluate(const %s&);
};
// The function may have large number of lines of code. MSAN
// build times out in such case.
LLVM_NO_SANITIZE("memory")
float Evaluate(const %s&);
%s
#endif // %s
""" % (guard, guard, cpp_class.ns_begin(), cpp_class.name, nline.join(setters),
nline.join(class_members), cpp_class.name, cpp_class.name,
cpp_class.ns_end(), guard)
""" % (guard, guard, cpp_class.ns_begin(), cpp_class.name,
nline.join(setters),
nline.join(getters),
nline.join(class_members),
cpp_class.name, cpp_class.ns_end(), guard)
def order_encode(v):
@ -182,21 +189,33 @@ def order_encode(v):
def evaluate_func(forest_json, cpp_class):
"""Generates code for `float Evaluate(const {Example}&)` function.
The generated function can be used to score an Example."""
code = "float Evaluate(const %s& E) {\n" % cpp_class.name
lines = []
lines.append("float Score = 0;")
"""Generates evaluation functions for each tree and combines them in
`float Evaluate(const {Example}&)` function. This function can be
used to score an Example."""
code = ""
# Generate evaluation function of each tree.
code += "namespace {\n"
tree_num = 0
for tree_json in forest_json:
lines.extend(tree(tree_json, tree_num=tree_num, node_num=0)[0])
lines.append("")
code += "LLVM_ATTRIBUTE_NOINLINE float EvaluateTree%d(const %s& E) {\n" % (tree_num, cpp_class.name)
code += " " + \
"\n ".join(
tree(tree_json, tree_num=tree_num, node_num=0)[0]) + "\n"
code += "}\n\n"
tree_num += 1
code += "} // namespace\n\n"
# Combine the scores of all trees in the final function.
# MSAN will timeout if these functions are inlined.
code += "float Evaluate(const %s& E) {\n" % cpp_class.name
code += " float Score = 0;\n"
for tree_num in range(len(forest_json)):
code += " Score += EvaluateTree%d(E);\n" % tree_num
code += " return Score;\n"
code += "}\n"
lines.append("t%s: // No such tree." % len(forest_json))
lines.append("return Score;")
code += " " + "\n ".join(lines)
code += "\n}"
return code
@ -218,9 +237,9 @@ def gen_cpp_code(forest_json, features_json, filename, cpp_class):
# using-decl for ENUM features.
using_decls = "\n".join("using %s_type = %s;" % (
feature['name'], feature['type'])
for feature in features_json
if feature["kind"] == "ENUM")
feature['name'], feature['type'])
for feature in features_json
if feature["kind"] == "ENUM")
nl = "\n"
return """%s
@ -287,7 +306,9 @@ def main():
with open(header_file, 'w+t') as output_h:
output_h.write(gen_header_code(
features_json=features_json, cpp_class=cpp_class, filename=filename))
features_json=features_json,
cpp_class=cpp_class,
filename=filename))
if __name__ == '__main__':