# bad_clone_prediction/MethodFunctionRelationExtra...
#
# 472 lines
# 19 KiB
# Python

import json
import pathlib
import re
from cmath import isnan
from difflib import SequenceMatcher
from typing import List
import numpy as np
import pandas as pd
from dulwich import diff_tree
from dulwich.objects import Commit
from dulwich.repo import Repo
import GlobalConstants
from ConfigOperator import ConfigOperator
from GitOperator import GitOperator
from models.LineRelationInfo import LineRelationInfo
from models.MethodInfo import MethodInfo
from models.RepoInfo import RepoInfo
from MySQLOperator import MySQLOperator
class CommitLineRelationExtractor(object):
    """
    This class is used for extracting line number relations between parent commit and child commit.

    For every file-level change handed in through `tree_changes`, the old and
    new blob contents are aligned line by line with difflib.SequenceMatcher.
    The aligned rows of all files are returned by `parse()` as one DataFrame.
    """

    # Column order of the DataFrame returned by parse().
    _RESULT_COLUMNS = [
        "filepath_old",
        "filepath_new",
        "line_old",
        "line_new",
        "content_old",
        "content_new",
        "commit_id_old",
        "commit_id_new",
    ]

    def __init__(
        self,
        repoInfo: "RepoInfo",
        commit: "Commit",
        commit_id: int,
        parent: "Commit",
        parent_id: int,
        tree_changes: "List[diff_tree.TreeChange]",
        configOp: "ConfigOperator",
    ):
        """
        params:
        - repoInfo: repository description; its `bare_repo_path` is opened with dulwich
        - commit: the child Commit object
        - commit_id: id stored in the `commit_id_new` column of the result
        - parent: the parent Commit object
        - parent_id: id stored in the `commit_id_old` column of the result
        - tree_changes: the file-level changes between parent and child
        - configOp: used to decide whether a file's language is supported
        """
        self.repoInfo = repoInfo
        self.repo = Repo(self.repoInfo.bare_repo_path)
        self.commit = commit
        self.commit_id = commit_id
        self.parent = parent
        self.parent_id = parent_id
        self.tree_changes = tree_changes
        self.configOp = configOp

    @staticmethod
    def _localize_path(path):
        """
        Function: convert a dulwich path (bytes, always "/"-separated) into the
        separator style of the local platform; None is passed through.
        """
        if path is None:
            return None
        return str(pathlib.Path(path.decode())).encode()

    def _read_blob_lines(self, sha):
        """
        Function: return the lines of the blob `sha`, or [] when that side of
        the change does not exist (sha is None).
        """
        if sha is None:
            return []
        return self.repo.object_store[sha].data.splitlines()

    def _is_unsupported(self, path) -> bool:
        """
        Function: True when `path` exists and configOp rejects its language.
        """
        return path is not None and not self.configOp.is_lang_supported(
            filepath=self._localize_path(path)
        )

    def extract_diff(self, old_content: list, new_content: list):
        """
        Function: extract the differences between two contents (Only store new lines for insert and replace; only store old lines for delete).
        Every line of both contents appears in exactly one row:
        - unchanged line: (old line number, new line number, None, None)
        - removed line:   (old line number, NaN, old text, None)
        - added line:     (NaN, new line number, None, new text)
        A "replace" opcode is recorded as all removed lines followed by all
        added lines, because there is no strict one-to-one line relationship.
        Line numbers are 1-based.
        params:
        - old_content: a list of strings
        - new_content: a list of strings
        return:
        - df: pd.DataFrame: columns [line_old, line_new, content_old, content_new]
        """
        line_old_list = []
        line_new_list = []
        content_old_list = []
        content_new_list = []

        def add_row(line_old, line_new, content_old, content_new):
            line_old_list.append(line_old)
            line_new_list.append(line_new)
            content_old_list.append(content_old)
            content_new_list.append(content_new)

        def add_removed(start, stop):
            for idx in range(start, stop):
                add_row(idx + 1, np.nan, old_content[idx], None)

        def add_inserted(start, stop):
            for idx in range(start, stop):
                add_row(np.nan, idx + 1, None, new_content[idx])

        opcodes = SequenceMatcher(None, old_content, new_content).get_opcodes()
        for tag, i1, i2, j1, j2 in opcodes:
            if tag == "equal":
                # both ranges have the same length for an "equal" opcode
                for old_idx, new_idx in zip(range(i1, i2), range(j1, j2)):
                    add_row(old_idx + 1, new_idx + 1, None, None)
            elif tag == "insert":
                add_inserted(j1, j2)
            elif tag == "delete":
                add_removed(i1, i2)
            elif tag == "replace":
                add_removed(i1, i2)
                add_inserted(j1, j2)
            else:
                raise Exception("Function extract_diff Error: type error!")

        return pd.DataFrame.from_dict(
            {
                "line_old": line_old_list,
                "line_new": line_new_list,
                "content_old": content_old_list,
                "content_new": content_new_list,
            }
        )

    def handle_tree_change(self, tree_change: "diff_tree.TreeChange"):
        """
        Function: get the changed relative filepath and changed lines of a diff_tree.TreeChange object
        return:
        - df: pd.DataFrame: columns [line_old, line_new, content_old, content_new, filepath_old, filepath_new, commit_id_old, commit_id_new]
        change_types:
        CHANGE_ADD = "add"
        CHANGE_MODIFY = "modify"
        CHANGE_DELETE = "delete"
        CHANGE_RENAME = "rename"
        CHANGE_COPY = "copy"
        CHANGE_UNCHANGED = "unchanged"
        """
        # A side whose sha is None does not exist (the old side of an "add",
        # the new side of a "delete") and is diffed as an empty file. Driving
        # the diff by which blobs exist, rather than by the change type, means
        # an "unchanged" entry yields all-equal rows instead of failing on an
        # unassigned result.
        old_content = self._read_blob_lines(tree_change.old.sha)
        new_content = self._read_blob_lines(tree_change.new.sha)
        line_relation_df = self.extract_diff(
            old_content=old_content,
            new_content=new_content,
        )

        # dulwich's TreeChange's path is always in Linux mode, Windows is not supported
        # Therefore, we need to localize the filepath
        line_relation_df["filepath_old"] = self._localize_path(tree_change.old.path)
        line_relation_df["filepath_new"] = self._localize_path(tree_change.new.path)
        line_relation_df["commit_id_old"] = self.parent_id
        line_relation_df["commit_id_new"] = self.commit_id
        return line_relation_df

    def parse(self) -> pd.DataFrame:
        """
        Function: extracting changes for List[diff_tree.TreeChange]
        A change is skipped when either of its existing paths (new or old) is
        in a language that configOp does not support.
        return:
        - df: pd.DataFrame with the columns of handle_tree_change, ordered as
          [filepath_old, filepath_new, line_old, line_new, content_old,
          content_new, commit_id_old, commit_id_new]. Rows keep the per-file
          index produced by extract_diff, so index labels repeat across files.
        """
        frames = []
        for tree_change in self.tree_changes:
            if self._is_unsupported(tree_change.new.path) or self._is_unsupported(
                tree_change.old.path
            ):
                continue  # the file language is not supported
            line_relation_df = self.handle_tree_change(tree_change)
            # Empty frames contribute no rows; leaving them out keeps their
            # object-typed empty columns from influencing the result dtypes.
            if not line_relation_df.empty:
                frames.append(line_relation_df)

        if not frames:
            return pd.DataFrame(columns=self._RESULT_COLUMNS)
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # accumulated frame on every call; concatenate once instead.
        return pd.concat(frames)[self._RESULT_COLUMNS]
def _resolve_filepath_id(filepath, filepath_id_dict):
    """
    Function: map a localized filepath to its id in filepath_id_dict.
    return:
    - np.nan when there is no filepath (that side of the change does not exist)
    - None when the filepath is not a key of filepath_id_dict. Some filepaths
      gotten by dulwich are different with the real filepaths in the mysql
      database. Example: for git@github.com:apache/iotdb.git a key is
      'iotdb\\metrics\\interface\\src\\main\\java\\org\\apache\\iotdb\\metrics\\DoNothingMetricService.java'
      while the filepath obtained by dulwich will ignore "iotdb\\".
    - the id otherwise
    """
    if filepath is None:
        return np.nan
    return filepath_id_dict.get(filepath)


def _lookup_method_id(lineno_method_ids, commit_id, filepath_id, line_no):
    """
    Function: find the method id that owns line `line_no`.
    params:
    - lineno_method_ids: {commit_id: {filepath_id: {line no: method id}}}
    return:
    - the method id, or np.nan when the line does not exist (line_no is NaN)
      or is not related to a method
    """
    if np.isnan(line_no):
        return np.nan
    if commit_id not in lineno_method_ids:
        return np.nan
    per_file = lineno_method_ids[commit_id]
    if filepath_id not in per_file:
        return np.nan
    per_line = per_file[filepath_id]
    if line_no not in per_line:
        return np.nan  # this line is not related to a method
    return per_line[line_no]


def _collect_method_changes(df):
    """
    Function: get the method_function_changes according to the df
    params:
    - df: the line change dataframe, already carrying the columns
      method_id_old, method_name_old, method_id_new, method_name_new
    return:
    - List[{
        "method_id_1": int, the method id in the parent commit
        "method_id_2": int, the method id in the child commit
        "change": None when no line of either method was added or deleted,
                  otherwise the bytes b"ADD:", the added lines, b"DELETE:" and
                  the deleted lines, joined with b"\n"
    }]
    """
    # if functions are related, they should have one line of code that are not
    # modified; therefore, this line of code should have the same related
    # method name
    shared_lines = df.loc[
        (df["method_name_old"].notnull())
        & (df["method_name_new"].notnull())
        & (df["method_name_old"] == df["method_name_new"])
    ]
    result = []
    for pair, _ in shared_lines.groupby(["method_id_old", "method_id_new"]):
        method_id_old = int(pair[0])
        method_id_new = int(pair[1])
        # The changed contents are taken from the whole df, not only from the
        # shared lines: removed/added rows carry just one of the two method ids.
        removed = df[
            (df["method_id_old"] == method_id_old) & (df["content_old"].notnull())
        ]["content_old"]
        added = df[
            (df["method_id_new"] == method_id_new) & (df["content_new"].notnull())
        ]["content_new"]
        change_delete = list(removed)
        change_add = list(added)
        if not change_delete and not change_add:
            change = None
        else:
            change = b"\n".join([b"ADD:", *change_add, b"DELETE:", *change_delete])
        result.append(
            {
                "method_id_1": method_id_old,
                "method_id_2": method_id_new,
                "change": change,
            }
        )
    return result


def extract_method_function_relation(
    repoInfo: "RepoInfo", mysqlOp: "MySQLOperator", configOp: "ConfigOperator"
):
    """
    Function: relate every method of a commit to the method of the same name
    in its parent commits that shares at least one unchanged line, and store
    the relation together with the added/deleted lines in the
    "method_function_relations" table.
    The work is tracked in the "steps" table under the step name
    "method function extraction": when that step is already marked as handled
    nothing is done; otherwise the relation table is truncated, rebuilt for
    all commits, the step is marked handled and the connection is committed.
    params:
    - repoInfo: repository description; its `bare_repo_path` is opened with dulwich
    - mysqlOp: database access (cursor, connection, table names, lookup helpers)
    - configOp: passed to CommitLineRelationExtractor to filter files by language
    """
    step_name = "method function extraction"
    steps_tablename = mysqlOp.tablename_dict["steps"]
    mysqlOp.cursor.execute(
        "select handled from `{steps_tablename}` where step_name=%s".format(
            steps_tablename=steps_tablename
        ),
        # must be a 1-tuple: "(step_name)" is just the string itself
        (step_name,),
    )
    if mysqlOp.cursor.fetchone()["handled"]:
        return

    relations_tablename = mysqlOp.tablename_dict["method_function_relations"]
    mysqlOp.truncate_table(tablename=relations_tablename)
    # read filepath_id_dict
    filepath_id_dict = mysqlOp.get_filepath_id_dict()
    # read commit_sha_id_dict
    commit_sha_id_dict = mysqlOp.get_commit_sha_id_dict()
    # read method_id_name dict
    method_id_name_dict = mysqlOp.get_method_id_name_dict()
    repo = Repo(repoInfo.bare_repo_path)
    insert_sql = "insert into `{method_function_relation_tablename}` (method_id_1, method_id_2, `change`) values (%s, %s, %s)".format(
        method_function_relation_tablename=relations_tablename
    )

    def method_name_of(method_id):
        return None if np.isnan(method_id) else method_id_name_dict[method_id]

    def handle_commit(commit_id: int, commit: "Commit"):
        """
        Function: extract the commit line change relations
        params:
        - commit_id: the mysql id of Commit
        - commit: the Commit object
        """
        if not commit.parents:
            return  # no method change

        # Resolve every parent id once. A parent without a mysql id has no
        # line -> method mapping, so none of its lines could be related to a
        # method and no relation could come out of it; skip it before doing
        # the tree diff.
        known_parents = []
        for parent_sha in commit.parents:
            parent_id = mysqlOp.get_commit_id_by_sha(sha=parent_sha)
            if parent_id is not None:
                known_parents.append((parent_sha, parent_id))
        if not known_parents:
            return

        # read line no and method id relationship (commit_id -> filepath ->
        # line no -> method id), because we also need parent_commits' filepath
        # line method relationship
        related_commit_ids = [commit_id] + [pid for _, pid in known_parents]
        lineno_method_ids = {
            related_commit_id: mysqlOp.get_fp_lineno_method_id_dict(
                commit_id=related_commit_id,
            )
            for related_commit_id in related_commit_ids
        }

        def formatrow(row):
            method_id_old = _lookup_method_id(
                lineno_method_ids,
                row["commit_id_old"],
                _resolve_filepath_id(row["filepath_old"], filepath_id_dict),
                row["line_old"],
            )
            method_id_new = _lookup_method_id(
                lineno_method_ids,
                row["commit_id_new"],
                _resolve_filepath_id(row["filepath_new"], filepath_id_dict),
                row["line_new"],
            )
            return (
                method_id_old,
                method_name_of(method_id_old),
                method_id_new,
                method_name_of(method_id_new),
            )

        for parent_sha, parent_id in known_parents:
            parent_commit = repo.object_store[parent_sha]
            tree_changes = diff_tree.tree_changes(
                store=repo.object_store,
                tree1_id=parent_commit.tree,
                tree2_id=commit.tree,
            )
            line_relation_df = CommitLineRelationExtractor(
                repoInfo=repoInfo,
                commit=commit,
                commit_id=commit_id,
                parent=parent_commit,
                parent_id=parent_id,
                tree_changes=tree_changes,
                configOp=configOp,
            ).parse()
            if line_relation_df.empty:
                continue
            line_relation_df[
                [
                    "method_id_old",
                    "method_name_old",
                    "method_id_new",
                    "method_name_new",
                ]
            ] = line_relation_df.apply(formatrow, axis=1, result_type="expand")
            for change in _collect_method_changes(line_relation_df):
                mysqlOp.cursor.execute(
                    insert_sql,
                    (change["method_id_1"], change["method_id_2"], change["change"]),
                )

    for commit_sha, commit_id in commit_sha_id_dict.items():
        handle_commit(commit_id=commit_id, commit=repo.object_store[commit_sha])

    # update steps table
    mysqlOp.cursor.execute(
        "update `{steps_tablename}` set handled=%s where step_name=%s".format(
            steps_tablename=steps_tablename
        ),
        (1, step_name),
    )
    mysqlOp.connection.commit()