Revert "Temporarily Revert "[clangd] Add Random Forest runtime for code completion.""

We intend to replace the heuristics-based code completion ranking with a Decision Forest model.

This patch introduces a format for representing the model and an inference runtime that is code-generated at build time.
- forest.json contains the decision forest as an array of trees.
- features.json describes the features available to the model.
- The codegen script takes the above two files and generates the CompletionModel, containing a class that holds all the features and a corresponding Evaluate function.
   The Evaluate function maps a completion candidate, described by its feature values, to a real number indicating its relevance.
- The codegen is part of the build system; these files are generated at build time.
- Proposes a way to test the generated runtime using a test model.
  - Replicates the model structure in unittests.
  - The unittests exercise both the test model (for correct tree traversal) and the real model (as a sanity check).

This reverts commit 549e55b3d5.
Author: Utkarsh Saxena
Date:   2020-09-19 10:07:34 +02:00
Parent: f64903fd81
Commit: 985deba931
12 changed files with 706 additions and 0 deletions

clang-tools-extra/clangd/CMakeLists.txt

@@ -28,6 +28,9 @@ set(LLVM_LINK_COMPONENTS
FrontendOpenMP
Option
)
include(${CMAKE_CURRENT_SOURCE_DIR}/quality/CompletionModel.cmake)
gen_decision_forest(${CMAKE_CURRENT_SOURCE_DIR}/quality/model CompletionModel clang::clangd::Example)
if(MSVC AND NOT CLANG_CL)
set_source_files_properties(CompileCommands.cpp PROPERTIES COMPILE_FLAGS -wd4130) # disables C4130: logical operation on address of string constant
@@ -77,6 +80,7 @@ add_clang_library(clangDaemon
TUScheduler.cpp
URI.cpp
XRefs.cpp
${CMAKE_CURRENT_BINARY_DIR}/CompletionModel.cpp
index/Background.cpp
index/BackgroundIndexLoader.cpp
@@ -117,6 +121,11 @@ add_clang_library(clangDaemon
omp_gen
)
# Include generated CompletionModel headers.
target_include_directories(clangDaemon PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
)
clang_target_link_libraries(clangDaemon
PRIVATE
clangAST

clang-tools-extra/clangd/quality/CompletionModel.cmake

@@ -0,0 +1,37 @@
# Run the Completion Model code generator on the model present in the
# ${model} directory.
# Produces a pair of files called ${filename}.h and ${filename}.cpp in the
# ${CMAKE_CURRENT_BINARY_DIR}. The generated header
# will define a C++ class called ${cpp_class} - which may be a
# namespace-qualified class name.
function(gen_decision_forest model filename cpp_class)
set(model_compiler ${CMAKE_SOURCE_DIR}/../clang-tools-extra/clangd/quality/CompletionModelCodegen.py)
set(output_dir ${CMAKE_CURRENT_BINARY_DIR})
set(header_file ${output_dir}/${filename}.h)
set(cpp_file ${output_dir}/${filename}.cpp)
add_custom_command(OUTPUT ${header_file} ${cpp_file}
COMMAND "${Python3_EXECUTABLE}" ${model_compiler}
--model ${model}
--output_dir ${output_dir}
--filename ${filename}
--cpp_class ${cpp_class}
COMMENT "Generating code completion model runtime..."
DEPENDS ${model_compiler} ${model}/forest.json ${model}/features.json
VERBATIM )
set_source_files_properties(${header_file} PROPERTIES
GENERATED 1)
set_source_files_properties(${cpp_file} PROPERTIES
GENERATED 1)
# Disable unused label warning for generated files.
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
set_source_files_properties(${cpp_file} PROPERTIES
COMPILE_FLAGS /wd4102)
else()
set_source_files_properties(${cpp_file} PROPERTIES
COMPILE_FLAGS -Wno-unused)
endif()
endfunction()

clang-tools-extra/clangd/quality/CompletionModelCodegen.py

@@ -0,0 +1,290 @@
"""Code generator for Code Completion Model Inference.
The tool runs on the Decision Forest model defined in the {model} directory.
It generates two files: {output_dir}/{filename}.h and {output_dir}/{filename}.cpp.
The generated files define the Example class named {cpp_class}, which has all the features as class members.
The generated runtime provides an `Evaluate` function which can be used to score a code completion candidate.
"""
import argparse
import json
import struct
class CppClass:
"""Holds class name and names of the enclosing namespaces."""
def __init__(self, cpp_class):
ns_and_class = cpp_class.split("::")
self.ns = [ns for ns in ns_and_class[0:-1] if len(ns) > 0]
self.name = ns_and_class[-1]
if len(self.name) == 0:
raise ValueError("Empty class name.")
def ns_begin(self):
"""Returns snippet for opening namespace declarations."""
open_ns = ["namespace %s {" % ns for ns in self.ns]
return "\n".join(open_ns)
def ns_end(self):
"""Returns snippet for closing namespace declarations."""
close_ns = [
"} // namespace %s" % ns for ns in reversed(self.ns)]
return "\n".join(close_ns)
def header_guard(filename):
'''Returns the header guard for the generated header.'''
return "GENERATED_DECISION_FOREST_MODEL_%s_H" % filename.upper()
def boost_node(n, label, next_label):
"""Returns code snippet for a leaf/boost node.
Adds value of leaf to the score and jumps to the root of the next tree."""
return "%s: Score += %s; goto %s;" % (
label, n['score'], next_label)
def if_greater_node(n, label, next_label):
"""Returns code snippet for a if_greater node.
Jumps to true_label if the Example feature (NUMBER) is greater than the threshold.
Comparing integers is much faster than comparing floats. Assuming floating points
are represented as IEEE 754, it order-encodes the floats to integers before comparing them.
Control falls through if condition is evaluated to false."""
threshold = n["threshold"]
return "%s: if (E.%s >= %s /*%s*/) goto %s;" % (
label, n['feature'], order_encode(threshold), threshold, next_label)
def if_member_node(n, label, next_label):
"""Returns code snippet for a if_member node.
Jumps to true_label if the Example feature (ENUM) is present in the set of enum values
described in the node.
Control falls through if condition is evaluated to false."""
members = '|'.join([
"BIT(%s_type::%s)" % (n['feature'], member)
for member in n["set"]
])
return "%s: if (E.%s & (%s)) goto %s;" % (
label, n['feature'], members, next_label)
def node(n, label, next_label):
"""Returns code snippet for the node."""
return {
'boost': boost_node,
'if_greater': if_greater_node,
'if_member': if_member_node,
}[n['operation']](n, label, next_label)
def tree(t, tree_num, node_num):
"""Returns code for inferencing a Decision Tree.
Also returns the size of the decision tree.
A tree starts with its label `t{tree#}`.
A node of the tree starts with label `t{tree#}_n{node#}`.
The tree contains two types of node: Conditional node and Leaf node.
- A conditional node evaluates a condition; if true, it jumps to its true child.
Code is generated using a pre-order traversal of the tree that visits the
false child first, so the false child is always the immediately
following label.
- A leaf node adds its value to the score and jumps to the next tree.
"""
label = "t%d_n%d" % (tree_num, node_num)
code = []
if node_num == 0:
code.append("t%d:" % tree_num)
if t["operation"] == "boost":
code.append(node(t, label=label, next_label="t%d" % (tree_num + 1)))
return code, 1
false_code, false_size = tree(
t['else'], tree_num=tree_num, node_num=node_num+1)
true_node_num = node_num+false_size+1
true_label = "t%d_n%d" % (tree_num, true_node_num)
true_code, true_size = tree(
t['then'], tree_num=tree_num, node_num=true_node_num)
code.append(node(t, label=label, next_label=true_label))
return code+false_code+true_code, 1+false_size+true_size
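# For example, given the single-node tree {"operation": "boost", "score": 1.0}
# as tree number 0, tree() returns (["t0:", "t0_n0: Score += 1.0; goto t1;"], 1):
# the leaf adds its score and jumps to the label of the next tree.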
def gen_header_code(features_json, cpp_class, filename):
"""Returns code for header declaring the inference runtime.
Declares the Example class named {cpp_class} inside relevant namespaces.
The Example class contains all the features as class members. This
class can be used to represent a code completion candidate.
Also declares the `float Evaluate(const {cpp_class}&)` function which can be used to score the Example.
"""
setters = []
for f in features_json:
feature = f["name"]
if f["kind"] == "NUMBER":
# Floats are order-encoded to integers for faster comparison.
setters.append(
"void set%s(float V) { %s = OrderEncode(V); }" % (
feature, feature))
elif f["kind"] == "ENUM":
setters.append(
"void set%s(unsigned V) { %s = 1 << V; }" % (feature, feature))
else:
raise ValueError("Unhandled feature type.", f["kind"])
# Class members represent all the features of the Example.
class_members = ["uint32_t %s = 0;" % f['name'] for f in features_json]
nline = "\n "
guard = header_guard(filename)
return """#ifndef %s
#define %s
#include <cstdint>
%s
class %s {
public:
%s
private:
%s
// Produces an integer that sorts in the same order as F.
// That is: a < b <==> OrderEncode(a) < OrderEncode(b).
static uint32_t OrderEncode(float F);
friend float Evaluate(const %s&);
};
float Evaluate(const %s&);
%s
#endif // %s
""" % (guard, guard, cpp_class.ns_begin(), cpp_class.name, nline.join(setters),
nline.join(class_members), cpp_class.name, cpp_class.name,
cpp_class.ns_end(), guard)
def order_encode(v):
i = struct.unpack('<I', struct.pack('<f', v))[0]
TopBit = 1 << 31
# IEEE 754 floats compare like sign-magnitude integers.
if (i & TopBit): # Negative float
return (1 << 32) - i # low half of integers, order reversed.
return TopBit + i # top half of integers
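# Example (IEEE 754 single precision):
#   order_encode(-2.5) < order_encode(-1.0) < order_encode(0.0) < order_encode(200.0)
# mirrors -2.5 < -1.0 < 0.0 < 200.0, so the generated code can compare the
# uint32_t encodings with a single integer `>=` instead of a float comparison.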
def evaluate_func(forest_json, cpp_class):
"""Generates code for `float Evaluate(const {Example}&)` function.
The generated function can be used to score an Example."""
code = "float Evaluate(const %s& E) {\n" % cpp_class.name
lines = []
lines.append("float Score = 0;")
tree_num = 0
for tree_json in forest_json:
lines.extend(tree(tree_json, tree_num=tree_num, node_num=0)[0])
lines.append("")
tree_num += 1
lines.append("t%s: // No such tree." % len(forest_json))
lines.append("return Score;")
code += " " + "\n ".join(lines)
code += "\n}"
return code
def gen_cpp_code(forest_json, features_json, filename, cpp_class):
"""Generates code for the .cpp file."""
# Headers
# Required by OrderEncode(float F).
angled_include = [
'#include <%s>' % h
for h in ["cstring", "limits"]
]
# Include generated header.
quoted_headers = {filename + '.h', 'llvm/ADT/bit.h'}
# Headers required by ENUM features used by the model.
quoted_headers |= {f["header"]
for f in features_json if f["kind"] == "ENUM"}
quoted_include = ['#include "%s"' % h for h in sorted(quoted_headers)]
# using-decl for ENUM features.
using_decls = "\n".join("using %s_type = %s;" % (
feature['name'], feature['type'])
for feature in features_json
if feature["kind"] == "ENUM")
nl = "\n"
return """%s
%s
#define BIT(X) (1 << X)
%s
%s
uint32_t %s::OrderEncode(float F) {
static_assert(std::numeric_limits<float>::is_iec559, "");
constexpr uint32_t TopBit = ~(~uint32_t{0} >> 1);
// Get the bits of the float. Endianness is the same as for integers.
uint32_t U = llvm::bit_cast<uint32_t>(F);
// IEEE 754 floats compare like sign-magnitude integers.
if (U & TopBit) // Negative float.
return 0 - U; // Map onto the low half of integers, order reversed.
return U + TopBit; // Positive floats map onto the high half of integers.
}
%s
%s
""" % (nl.join(angled_include), nl.join(quoted_include), cpp_class.ns_begin(),
using_decls, cpp_class.name, evaluate_func(forest_json, cpp_class),
cpp_class.ns_end())
def main():
parser = argparse.ArgumentParser('DecisionForestCodegen')
parser.add_argument('--filename', help='output file name.')
parser.add_argument('--output_dir', help='output directory.')
parser.add_argument('--model', help='path to model directory.')
parser.add_argument(
'--cpp_class',
help='The name of the class (which may be namespace-qualified) created in the generated header.'
)
ns = parser.parse_args()
output_dir = ns.output_dir
filename = ns.filename
header_file = "%s/%s.h" % (output_dir, filename)
cpp_file = "%s/%s.cpp" % (output_dir, filename)
cpp_class = CppClass(cpp_class=ns.cpp_class)
model_file = "%s/forest.json" % ns.model
features_file = "%s/features.json" % ns.model
with open(features_file) as f:
features_json = json.load(f)
with open(model_file) as m:
forest_json = json.load(m)
with open(cpp_file, 'w+t') as output_cc:
output_cc.write(
gen_cpp_code(forest_json=forest_json,
features_json=features_json,
filename=filename,
cpp_class=cpp_class))
with open(header_file, 'w+t') as output_h:
output_h.write(gen_header_code(
features_json=features_json, cpp_class=cpp_class, filename=filename))
if __name__ == '__main__':
main()

clang-tools-extra/clangd/quality/README.md

@@ -0,0 +1,220 @@
# Decision Forest Code Completion Model
## Decision Forest
A **decision forest** is a collection of many decision trees. A **decision tree** is a full binary tree that provides a quality prediction for an input (code completion item). Internal nodes represent a **binary decision** based on the input data, and leaf nodes represent a prediction.
In order to predict the relevance of a code completion item, we traverse each of the decision trees beginning with their roots until we reach a leaf.
An input (code completion candidate) is characterized as a set of **features**, such as the *type of symbol* or the *number of existing references*.
At every non-leaf node, we evaluate the condition to decide whether to go left or right. The condition compares one **feature** of the input against a constant. The condition can be of two types:
- **if_greater**: Checks whether a numerical feature is **>=** a **threshold**.
- **if_member**: Checks whether the **enum** feature is contained in the **set** defined in the node.
A leaf node contains the value **score**.
To compute an overall **quality** score, we traverse each tree in this way and add up the scores.
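The sketch below is a minimal, self-contained C++ reference interpreter for these semantics, shown only to make the traversal concrete. It is not how the clangd runtime works: the `NumReferences` feature in the demo is made up, and the real runtime does not walk a tree data structure at run time but is compiled into flat branch code by the code generator described below.
```
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

// A candidate is a bag of feature values: plain numbers for NUMBER features
// and a bitmask of enum values for ENUM features (bit i set <=> value i).
struct Candidate {
  std::map<std::string, float> Numbers;
  std::map<std::string, uint32_t> Enums;
};

struct Node {
  enum { IfGreater, IfMember, Leaf } Op = Leaf;
  std::string Feature;  // Used by IfGreater and IfMember.
  float Threshold = 0;  // IfGreater: take the "then" branch if feature >= Threshold.
  uint32_t Set = 0;     // IfMember: take the "then" branch if the feature's bit is in Set.
  float Score = 0;      // Leaf ("boost"): value added to the total.
  std::unique_ptr<Node> Then, Else;
};

// Walk one tree from its root to a leaf and return the leaf's score.
float EvaluateTree(const Node &N, const Candidate &C) {
  switch (N.Op) {
  case Node::Leaf:
    return N.Score;
  case Node::IfGreater:
    return EvaluateTree(C.Numbers.at(N.Feature) >= N.Threshold ? *N.Then : *N.Else, C);
  case Node::IfMember:
    return EvaluateTree((C.Enums.at(N.Feature) & N.Set) ? *N.Then : *N.Else, C);
  }
  return 0;
}

// The overall quality score is the sum of the per-tree scores.
float EvaluateForest(const std::vector<std::unique_ptr<Node>> &Forest,
                     const Candidate &C) {
  float Score = 0;
  for (const auto &Tree : Forest)
    Score += EvaluateTree(*Tree, C);
  return Score;
}

int main() {
  // A one-tree forest: if NumReferences >= 5 add 1.0, otherwise add -0.5.
  // ("NumReferences" is a made-up feature name for this sketch.)
  auto MakeLeaf = [](float S) {
    auto N = std::make_unique<Node>();
    N->Score = S; // Op already defaults to Leaf.
    return N;
  };
  auto Root = std::make_unique<Node>();
  Root->Op = Node::IfGreater;
  Root->Feature = "NumReferences";
  Root->Threshold = 5;
  Root->Then = MakeLeaf(1.0f);
  Root->Else = MakeLeaf(-0.5f);

  std::vector<std::unique_ptr<Node>> Forest;
  Forest.push_back(std::move(Root));

  Candidate C;
  C.Numbers["NumReferences"] = 12;
  std::cout << EvaluateForest(Forest, C) << "\n"; // Prints 1.
}
```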
## Model Input Format
The input model is represented in json format.
### Features
The file **features.json** defines the features available to the model.
It is a json list of features. The features can be of the following two kinds.
#### Number
```
{
"name": "a_numerical_feature",
"kind": "NUMBER"
}
```
#### Enum
```
{
"name": "an_enum_feature",
"kind": "ENUM",
"enum": "fully::qualified::enum",
"header": "path/to/HeaderDeclaringEnum.h"
}
```
The field `type` specifies the fully qualified name of the enum.
The enum can have at most **32** distinct values, since each value is mapped to one bit of a 32-bit mask.
The field `header` specifies the header containing the declaration of the enum.
This header is included by the inference runtime.
### Decision Forest
The file `forest.json` defines the decision forest. It is a json list of **DecisionTree**.
**DecisionTree** is one of **IfGreaterNode**, **IfMemberNode**, **LeafNode**.
#### IfGreaterNode
```
{
"operation": "if_greater",
"feature": "a_numerical_feature",
"threshold": A real number,
"then": {A DecisionTree},
"else": {A DecisionTree}
}
```
#### IfMemberNode
```
{
"operation": "if_member",
"feature": "an_enum_feature",
"set": ["enum_value1", "enum_value2", ...],
"then": {A DecisionTree},
"else": {A DecisionTree}
}
```
#### LeafNode
```
{
"operation": "boost",
"score": A real number
}
```
## Code Generator for Inference
The implementation of the inference runtime is split across:
### Code generator
The code generator `CompletionModelCodegen.py` takes the `${model}` directory as input and generates the inference library:
- `${output_dir}/{filename}.h`
- `${output_dir}/{filename}.cpp`
Invocation
```
python3 CompletionModelCodegen.py \
--model path/to/model/dir \
--output_dir path/to/output/dir \
--filename OutputFileName \
--cpp_class clang::clangd::YourExampleClass
```
### Build System
`CompletionModel.cmake` provides the `gen_decision_forest` function.
Clients intending to use the CompletionModel for inference can use it to trigger the code generator and produce the inference library.
They can then use the generated API by including and depending on this library.
### Generated API for inference
The code generator defines the Example `class` inside the namespaces specified by the `${cpp_class}` option.
The members of this generated class are all the features mentioned in `features.json`,
so an instance of this class can represent a code completion candidate that needs to be scored.
The API also provides `float Evaluate(const MyClass&)`, which scores the candidate (a sketch of the generated code is shown at the end of the Example section below).
## Example
### model/features.json
```
[
{
"name": "ANumber",
"kind": "NUMBER"
},
{
"name": "AFloat",
"kind": "NUMBER"
},
{
"name": "ACategorical",
"kind": "ENUM",
"type": "ns1::ns2::TestEnum",
"header": "model/CategoricalFeature.h"
}
]
```
### model/forest.json
```
[
{
"operation": "if_greater",
"feature": "ANumber",
"threshold": 200.0,
"then": {
"operation": "if_greater",
"feature": "AFloat",
"threshold": -1,
"then": {
"operation": "boost",
"score": 10.0
},
"else": {
"operation": "boost",
"score": -20.0
}
},
"else": {
"operation": "if_member",
"feature": "ACategorical",
"set": [
"A",
"C"
],
"then": {
"operation": "boost",
"score": 3.0
},
"else": {
"operation": "boost",
"score": -4.0
}
}
},
{
"operation": "if_member",
"feature": "ACategorical",
"set": [
"A",
"B"
],
"then": {
"operation": "boost",
"score": 5.0
},
"else": {
"operation": "boost",
"score": -6.0
}
}
]
```
### DecisionForestRuntime.h
```
...
namespace ns1 {
namespace ns2 {
namespace test {
class Example {
public:
void setANumber(float V) { ... }
void setAFloat(float V) { ... }
void setACategorical(unsigned V) { ... }
private:
...
};
float Evaluate(const Example&);
} // namespace test
} // namespace ns2
} // namespace ns1
```
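For reference, below is a hand-written, self-contained sketch of roughly the `Evaluate` function the code generator emits for this forest. It is an approximation, not actual generator output: the real generator inlines the order-encoded thresholds as integer literals (with the original float in a comment), splits the class and the function across the generated header and `.cpp`, uses `llvm::bit_cast` rather than `std::memcpy`, and relies on `CompletionModel.cmake` to silence unused-label warnings. The `main` at the end is only a usage demo.
```
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>

#define BIT(X) (1 << X)

namespace ns1 {
namespace ns2 {
enum TestEnum { A, B, C, D };
} // namespace ns2
} // namespace ns1

using ACategorical_type = ns1::ns2::TestEnum;

class Example {
public:
  void setANumber(float V) { ANumber = OrderEncode(V); }
  void setAFloat(float V) { AFloat = OrderEncode(V); }
  void setACategorical(unsigned V) { ACategorical = 1 << V; }

private:
  uint32_t ANumber = 0;
  uint32_t AFloat = 0;
  uint32_t ACategorical = 0;

  // Produces an integer that sorts in the same order as F.
  static uint32_t OrderEncode(float F) {
    static_assert(std::numeric_limits<float>::is_iec559, "");
    constexpr uint32_t TopBit = ~(~uint32_t{0} >> 1);
    uint32_t U;
    std::memcpy(&U, &F, sizeof(U)); // The generator uses llvm::bit_cast here.
    if (U & TopBit)    // Negative float.
      return 0 - U;    // Low half of integers, order reversed.
    return U + TopBit; // Positive floats map onto the high half.
  }
  friend float Evaluate(const Example &);
};

float Evaluate(const Example &E) {
  float Score = 0;
  // Tree 0. Labels follow the t{tree}_n{node} scheme: the false branch is
  // always the next label, the true branch is reached via goto.
t0:
t0_n0: if (E.ANumber >= Example::OrderEncode(200.0f) /*200.0*/) goto t0_n4;
t0_n1: if (E.ACategorical & (BIT(ACategorical_type::A) | BIT(ACategorical_type::C))) goto t0_n3;
t0_n2: Score += -4.0; goto t1;
t0_n3: Score += 3.0; goto t1;
t0_n4: if (E.AFloat >= Example::OrderEncode(-1.0f) /*-1*/) goto t0_n6;
t0_n5: Score += -20.0; goto t1;
t0_n6: Score += 10.0; goto t1;
  // Tree 1.
t1:
t1_n0: if (E.ACategorical & (BIT(ACategorical_type::A) | BIT(ACategorical_type::B))) goto t1_n2;
t1_n1: Score += -6.0; goto t2;
t1_n2: Score += 5.0; goto t2;
t2: // No such tree.
  return Score;
}

int main() {
  Example E;
  E.setANumber(200);                // ANumber >= 200.0: take the true branch.
  E.setAFloat(0);                   // AFloat >= -1: +10.0
  E.setACategorical(ns1::ns2::A);   // In {A, B} of tree 1: +5.0
  std::cout << Evaluate(E) << "\n"; // Prints 15.
}
```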
### CMake Invocation
In order to use the inference runtime, one can use the `gen_decision_forest` function
described in `CompletionModel.cmake`, which invokes `CompletionModelCodegen.py` with the appropriate arguments.
For example, the following invocation reads the model present in `path/to/model` and creates
`${CMAKE_CURRENT_BINARY_DIR}/myfilename.h` and `${CMAKE_CURRENT_BINARY_DIR}/myfilename.cpp`
describing a `class` named `MyClass` in namespace `fully::qualified`.
```
gen_decision_forest(path/to/model
myfilename
::fully::qualified::MyClass)
```

clang-tools-extra/clangd/quality/model/features.json

@@ -0,0 +1,8 @@
[
{
"name": "ContextKind",
"kind": "ENUM",
"type": "clang::CodeCompletionContext::Kind",
"header": "clang/Sema/CodeCompleteConsumer.h"
}
]

clang-tools-extra/clangd/quality/model/forest.json

@@ -0,0 +1,18 @@
[
{
"operation": "if_member",
"feature": "ContextKind",
"set": [
"CCC_DotMemberAccess",
"CCC_ArrowMemberAccess"
],
"then": {
"operation": "boost",
"score": 3.0
},
"else": {
"operation": "boost",
"score": 1.0
}
}
]

clang-tools-extra/clangd/unittests/CMakeLists.txt

@@ -28,6 +28,9 @@ if (CLANGD_ENABLE_REMOTE)
set(REMOTE_TEST_SOURCES remote/MarshallingTests.cpp)
endif()
include(${CMAKE_CURRENT_SOURCE_DIR}/../quality/CompletionModel.cmake)
gen_decision_forest(${CMAKE_CURRENT_SOURCE_DIR}/decision_forest_model DecisionForestRuntimeTest ::ns1::ns2::test::Example)
add_custom_target(ClangdUnitTests)
add_unittest(ClangdUnitTests ClangdTests
Annotations.cpp
@@ -44,6 +47,7 @@ add_unittest(ClangdUnitTests ClangdTests
ConfigCompileTests.cpp
ConfigProviderTests.cpp
ConfigYAMLTests.cpp
DecisionForestTests.cpp
DexTests.cpp
DiagnosticsTests.cpp
DraftStoreTests.cpp
@@ -89,6 +93,7 @@ add_unittest(ClangdUnitTests ClangdTests
TweakTesting.cpp
URITests.cpp
XRefsTests.cpp
${CMAKE_CURRENT_BINARY_DIR}/DecisionForestRuntimeTest.cpp
support/CancellationTests.cpp
support/ContextTests.cpp
@@ -103,6 +108,11 @@ add_unittest(ClangdUnitTests ClangdTests
$<TARGET_OBJECTS:obj.clangDaemonTweaks>
)
# Include generated CompletionModel headers.
target_include_directories(ClangdTests PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
)
clang_target_link_libraries(ClangdTests
PRIVATE
clangAST

clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp

@@ -10,6 +10,7 @@
#include "ClangdServer.h"
#include "CodeComplete.h"
#include "Compiler.h"
#include "CompletionModel.h"
#include "Matchers.h"
#include "Protocol.h"
#include "Quality.h"
@@ -47,6 +48,7 @@ using ::testing::HasSubstr;
using ::testing::IsEmpty;
using ::testing::Not;
using ::testing::UnorderedElementsAre;
using ContextKind = CodeCompletionContext::Kind;
// GMock helpers for matching completion items.
MATCHER_P(Named, Name, "") { return arg.Name == Name; }
@@ -161,6 +163,16 @@ Symbol withReferences(int N, Symbol S) {
return S;
}
TEST(DecisionForestRuntime, SanityTest) {
using Example = clangd::Example;
using clangd::Evaluate;
Example E1;
E1.setContextKind(ContextKind::CCC_ArrowMemberAccess);
Example E2;
E2.setContextKind(ContextKind::CCC_SymbolOrNewName);
EXPECT_GT(Evaluate(E1), Evaluate(E2));
}
TEST(CompletionTest, Limit) {
clangd::CodeCompleteOptions Opts;
Opts.Limit = 2;

clang-tools-extra/clangd/unittests/DecisionForestTests.cpp

@@ -0,0 +1,29 @@
#include "DecisionForestRuntimeTest.h"
#include "decision_forest_model/CategoricalFeature.h"
#include "gtest/gtest.h"
namespace clang {
namespace clangd {
TEST(DecisionForestRuntime, Evaluate) {
using Example = ::ns1::ns2::test::Example;
using Cat = ::ns1::ns2::TestEnum;
using ::ns1::ns2::test::Evaluate;
Example E;
E.setANumber(200); // True
E.setAFloat(0); // True: +10.0
E.setACategorical(Cat::A); // True: +5.0
EXPECT_EQ(Evaluate(E), 15.0);
E.setANumber(200); // True
E.setAFloat(-2.5); // False: -20.0
E.setACategorical(Cat::B); // True: +5.0
EXPECT_EQ(Evaluate(E), -15.0);
E.setANumber(100); // False
E.setACategorical(Cat::C); // True: +3.0, False: -6.0
EXPECT_EQ(Evaluate(E), -3.0);
}
} // namespace clangd
} // namespace clang

clang-tools-extra/clangd/unittests/decision_forest_model/CategoricalFeature.h

@@ -0,0 +1,5 @@
namespace ns1 {
namespace ns2 {
enum TestEnum { A, B, C, D };
} // namespace ns2
} // namespace ns1

clang-tools-extra/clangd/unittests/decision_forest_model/features.json

@@ -0,0 +1,16 @@
[
{
"name": "ANumber",
"kind": "NUMBER"
},
{
"name": "AFloat",
"kind": "NUMBER"
},
{
"name": "ACategorical",
"kind": "ENUM",
"type": "ns1::ns2::TestEnum",
"header": "decision_forest_model/CategoricalFeature.h"
}
]

clang-tools-extra/clangd/unittests/decision_forest_model/forest.json

@@ -0,0 +1,52 @@
[
{
"operation": "if_greater",
"feature": "ANumber",
"threshold": 200.0,
"then": {
"operation": "if_greater",
"feature": "AFloat",
"threshold": -1,
"then": {
"operation": "boost",
"score": 10.0
},
"else": {
"operation": "boost",
"score": -20.0
}
},
"else": {
"operation": "if_member",
"feature": "ACategorical",
"set": [
"A",
"C"
],
"then": {
"operation": "boost",
"score": 3.0
},
"else": {
"operation": "boost",
"score": -4.0
}
}
},
{
"operation": "if_member",
"feature": "ACategorical",
"set": [
"A",
"B"
],
"then": {
"operation": "boost",
"score": 5.0
},
"else": {
"operation": "boost",
"score": -6.0
}
}
]