bad_clone_prediction/MethodFunctionRelationExtra...

472 lines
19 KiB
Python
Raw Permalink Normal View History

import json
2022-08-14 19:14:02 +08:00
import pathlib
2022-08-14 17:02:37 +08:00
import re
2022-08-17 18:40:20 +08:00
from cmath import isnan
2022-08-14 17:02:37 +08:00
from difflib import SequenceMatcher
from typing import List
2022-08-14 17:02:37 +08:00
import numpy as np
import pandas as pd
from dulwich import diff_tree
2022-08-08 09:08:18 +08:00
from dulwich.objects import Commit
2022-08-14 17:02:37 +08:00
from dulwich.repo import Repo
2022-08-08 09:08:18 +08:00
import GlobalConstants
2022-08-14 17:02:37 +08:00
from ConfigOperator import ConfigOperator
from GitOperator import GitOperator
from models.LineRelationInfo import LineRelationInfo
from models.MethodInfo import MethodInfo
from models.RepoInfo import RepoInfo
from MySQLOperator import MySQLOperator
2022-08-14 17:02:37 +08:00
class CommitLineRelationExtractor(object):
    """
    Extract line number relations between a parent commit and a child commit.

    For every supported file touched between ``parent`` and ``commit`` the old
    and new blob contents are diffed line by line. The result records which old
    line corresponds to which new line, together with the content of removed
    and added lines.
    """

    # Column order of the frame returned by parse().
    _RESULT_COLUMNS = [
        "filepath_old",
        "filepath_new",
        "line_old",
        "line_new",
        "content_old",
        "content_new",
        "commit_id_old",
        "commit_id_new",
    ]

    def __init__(
        self,
        repoInfo: "RepoInfo",
        commit: "Commit",
        commit_id: int,
        parent: "Commit",
        parent_id: int,
        tree_changes: "List[diff_tree.TreeChange]",
        configOp: "ConfigOperator",
    ):
        """
        params:
        - repoInfo: repository description; only ``bare_repo_path`` is read here
        - commit: the child commit object
        - commit_id: id stored in the ``commit_id_new`` column
        - parent: the parent commit object
        - parent_id: id stored in the ``commit_id_old`` column
        - tree_changes: the changes between the parent tree and the child tree
        - configOp: provides ``is_lang_supported(filepath=...)``
        """
        self.repoInfo = repoInfo
        # Every extractor opens its own Repo handle; the owner of the extractor
        # is responsible for closing ``self.repo`` when done.
        self.repo = Repo(self.repoInfo.bare_repo_path)
        self.commit = commit
        self.commit_id = commit_id
        self.parent = parent
        self.parent_id = parent_id
        self.tree_changes = tree_changes
        self.configOp = configOp

    @staticmethod
    def _localize_path(path):
        """
        Function: convert a dulwich path to the local OS form.
        dulwich's TreeChange paths are always '/'-separated, so they are passed
        through pathlib to obtain the platform separator.
        params:
        - path: bytes or None
        return:
        - bytes in local form, or None when path is None
        """
        if path is None:
            return None
        return str(pathlib.Path(path.decode())).encode()

    def _read_blob_lines(self, sha):
        """
        Function: read the lines of the blob ``sha`` from the object store.
        return:
        - a list of lines, or [] when sha is None (the file does not exist on
          that side of the change)
        """
        if sha is None:
            return []
        return self.repo.object_store[sha].data.splitlines()

    def extract_diff(self, old_content: list, new_content: list):
        """
        Function: extract the differences between two contents (only new lines
        are stored for insert/replace; only old lines are stored for delete/replace).
        Line numbers are 1-based. Unchanged lines produce one row carrying both
        line numbers and no content; a removed line produces a row with
        line_new = NaN; an added line produces a row with line_old = NaN.
        params:
        - old_content: a list of lines
        - new_content: a list of lines
        return:
        - df: pd.DataFrame: columns [line_old, line_new, content_old, content_new]
        """
        line_old_list = []
        line_new_list = []
        content_old_list = []
        content_new_list = []

        opcodes = SequenceMatcher(None, old_content, new_content).get_opcodes()
        for tag, i1, i2, j1, j2 in opcodes:
            if tag == "equal":
                for offset in range(i2 - i1):
                    line_old_list.append(i1 + offset + 1)
                    line_new_list.append(j1 + offset + 1)
                    content_old_list.append(None)
                    content_new_list.append(None)
            elif tag in ("insert", "delete", "replace"):
                # "insert" has an empty old range and "delete" an empty new
                # range, so one pair of loops covers all three tags: removed
                # lines first, then added lines.
                for old_line in range(i1 + 1, i2 + 1):
                    line_old_list.append(old_line)
                    line_new_list.append(np.nan)
                    content_old_list.append(old_content[old_line - 1])
                    content_new_list.append(None)
                for new_line in range(j1 + 1, j2 + 1):
                    line_old_list.append(np.nan)
                    line_new_list.append(new_line)
                    content_old_list.append(None)
                    content_new_list.append(new_content[new_line - 1])
            else:
                raise ValueError("Function extract_diff Error: type error!")

        return pd.DataFrame.from_dict(
            {
                "line_old": line_old_list,
                "line_new": line_new_list,
                "content_old": content_old_list,
                "content_new": content_new_list,
            }
        )

    def handle_tree_change(self, tree_change: "diff_tree.TreeChange"):
        """
        Function: get the changed relative filepath and changed lines of a diff_tree.TreeChange object
        A side whose sha is None (old side of an "add", new side of a "delete")
        is treated as empty content, so every change type is diffed the same way.
        return:
        - df: pd.DataFrame: columns [line_old, line_new, content_old, content_new, filepath_old, filepath_new, commit_id_old, commit_id_new]
        """
        line_relation_df = self.extract_diff(
            old_content=self._read_blob_lines(tree_change.old.sha),
            new_content=self._read_blob_lines(tree_change.new.sha),
        )
        line_relation_df["filepath_old"] = self._localize_path(tree_change.old.path)
        line_relation_df["filepath_new"] = self._localize_path(tree_change.new.path)
        line_relation_df["commit_id_old"] = self.parent_id
        line_relation_df["commit_id_new"] = self.commit_id
        return line_relation_df

    def parse(self) -> pd.DataFrame:
        """
        Function: extracting changes for List[diff_tree.TreeChange]
        A change is skipped when its new or old path is in a language that
        configOp does not support.
        return:
        - df: pd.DataFrame with the columns of handle_tree_change; row labels
          restart at 0 for every file
        """
        frames = []
        for tree_change in self.tree_changes:
            candidate_paths = (tree_change.new.path, tree_change.old.path)
            if any(
                path is not None
                and not self.configOp.is_lang_supported(
                    filepath=self._localize_path(path)
                )
                for path in candidate_paths
            ):
                continue  # the file language is not supported
            line_relation_df = self.handle_tree_change(tree_change)
            if not line_relation_df.empty:
                frames.append(line_relation_df)

        if not frames:
            return pd.DataFrame(columns=self._RESULT_COLUMNS)
        # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
        return pd.concat(frames).reindex(columns=self._RESULT_COLUMNS)
def _resolve_filepath_id(filepath, filepath_id_dict):
    """
    Function: map a localized filepath to its id in filepath_id_dict.
    return:
    - np.nan when filepath is None (the file does not exist on that side)
    - None when the filepath is unknown to filepath_id_dict. Some filepaths
      gotten by dulwich differ from the ones stored in the mysql database.
      Example: for git@github.com:apache/iotdb.git a key in filepath_id_dict is
      'iotdb\\metrics\\interface\\src\\main\\java\\org\\apache\\iotdb\\metrics\\DoNothingMetricService.java'
      while the filepath obtained by dulwich omits the leading "iotdb\\".
    - the id otherwise
    """
    if filepath is None:
        return np.nan
    return filepath_id_dict.get(filepath)


def _lookup_method_id(lineno_method_ids, commit_id, filepath_id, line_no):
    """
    Function: find the method that owns a line.
    params:
    - lineno_method_ids: {commit_id: {filepath_id: {line no: method id}}}
    return:
    - the method id, or np.nan when line_no is NaN or the line is not related
      to a method
    """
    if np.isnan(line_no):
        return np.nan
    by_filepath = lineno_method_ids.get(commit_id, {})
    by_line = by_filepath.get(filepath_id, {})
    return by_line.get(line_no, np.nan)


def _resolve_row_methods(
    row, filepath_id_dict, lineno_method_ids, method_id_name_dict
):
    """
    Function: resolve the methods related to the old and new line of one
    line-relation row.
    return:
    - (method_id_old, method_name_old, method_id_new, method_name_new);
      an id is np.nan and a name is None when no method is related
    """
    resolved = []
    for side in ("old", "new"):
        filepath_id = _resolve_filepath_id(
            row["filepath_" + side], filepath_id_dict
        )
        method_id = _lookup_method_id(
            lineno_method_ids,
            row["commit_id_" + side],
            filepath_id,
            row["line_" + side],
        )
        if np.isnan(method_id):
            method_name = None
        else:
            method_name = method_id_name_dict[method_id]
        resolved.extend([method_id, method_name])
    return tuple(resolved)


def _collect_method_changes(df):
    """
    Function: get the method_function_changes according to the df
    params:
    - df: the line change dataframe, already carrying the method_id_* and
      method_name_* columns
    return:
    - List[{
        "method_id_1": int, method id in the parent commit
        "method_id_2": int, method id in the child commit
        "change": bytes (b"ADD:" lines followed by b"DELETE:" lines) or None
          when nothing was added or deleted
    }]
    """
    result = []
    # If two methods are related they share at least one unmodified line, and
    # that line is related to the same method name on both sides.
    filtered_df = df.loc[
        (df["method_name_old"].notnull())
        & (df["method_name_new"].notnull())
        & (df["method_name_old"] == df["method_name_new"])
    ]
    groups = filtered_df.groupby(["method_id_old", "method_id_new"])
    for name, _ in groups:
        method_id_old = int(name[0])
        method_id_new = int(name[1])
        # get the changed contents
        change_delete = list(
            df[
                (df["method_id_old"] == method_id_old)
                & (df["content_old"].notnull())
            ]["content_old"]
        )
        change_add = list(
            df[
                (df["method_id_new"] == method_id_new)
                & (df["content_new"].notnull())
            ]["content_new"]
        )
        if not change_delete and not change_add:
            change = None
        else:
            change = b"\n".join(
                [b"ADD:"] + change_add + [b"DELETE:"] + change_delete
            )
        result.append(
            {
                "method_id_1": method_id_old,
                "method_id_2": method_id_new,
                "change": change,
            }
        )
    return result


def extract_method_function_relation(
    repoInfo: RepoInfo, mysqlOp: MySQLOperator, configOp: ConfigOperator
):
    """
    Function: for every commit, relate the methods of each parent commit to the
    methods of the commit itself and store the relations (with the added and
    deleted lines) in the method_function_relations table.
    The work is skipped when the "method function extraction" step is already
    marked as handled in the steps table; otherwise the relation table is
    truncated, refilled, and the step is marked as handled.
    params:
    - repoInfo: repository description; ``bare_repo_path`` is opened with dulwich
    - mysqlOp: database access (cursor, connection, table names, lookup dicts)
    - configOp: passed to CommitLineRelationExtractor for language filtering
    """
    step_name = "method function extraction"
    steps_tablename = mysqlOp.tablename_dict["steps"]

    mysqlOp.cursor.execute(
        "select handled from `{steps_tablename}` where step_name=%s".format(
            steps_tablename=steps_tablename
        ),
        (step_name,),  # one-element tuple; a bare "(step_name)" is just a str
    )
    handled = mysqlOp.cursor.fetchone()["handled"]
    if handled:
        return

    relation_tablename = mysqlOp.tablename_dict["method_function_relations"]
    mysqlOp.truncate_table(tablename=relation_tablename)

    # read filepath_id_dict
    filepath_id_dict = mysqlOp.get_filepath_id_dict()
    # read commit_sha_id_dict
    commit_sha_id_dict = mysqlOp.get_commit_sha_id_dict()
    # read method_id_name dict
    method_id_name_dict = mysqlOp.get_method_id_name_dict()

    insert_sql = "insert into `{method_function_relation_tablename}` (method_id_1, method_id_2, `change`) values (%s, %s, %s)".format(
        method_function_relation_tablename=relation_tablename
    )

    repo = Repo(repoInfo.bare_repo_path)

    def handle_commit(commit_id: int, commit: Commit):
        """
        Function: extract the commit line change relations
        params:
        - commit_id: the mysql id of Commit
        - commit: the Commit object
        """
        if not commit.parents:
            return  # no method change

        # Resolve every parent id once; it is needed both for loading the
        # line/method mapping and for labelling the old side of each diff.
        parent_ids = {
            parent_sha: mysqlOp.get_commit_id_by_sha(sha=parent_sha)
            for parent_sha in commit.parents
        }

        # read line no and method id relationship (commit_id -> filepath ->
        # line no -> method id); the parents' mapping is needed as well
        related_commit_ids = [commit_id]
        related_commit_ids.extend(
            parent_id
            for parent_id in parent_ids.values()
            if parent_id is not None
        )
        commit_filepath_lineno_method_id_dict = {
            related_commit_id: mysqlOp.get_fp_lineno_method_id_dict(
                commit_id=related_commit_id,
            )
            for related_commit_id in related_commit_ids
        }

        for parent_sha, parent_id in parent_ids.items():
            tree_changes = diff_tree.tree_changes(
                store=repo.object_store,
                tree1_id=repo.object_store[parent_sha].tree,
                tree2_id=repo.object_store[commit.id].tree,
            )
            extractor = CommitLineRelationExtractor(
                repoInfo=repoInfo,
                commit=commit,
                commit_id=commit_id,
                parent=repo.object_store[parent_sha],
                parent_id=parent_id,
                tree_changes=tree_changes,
                configOp=configOp,
            )
            try:
                line_relation_df = extractor.parse()
            finally:
                # The extractor opens its own Repo; release it per parent so
                # handles do not pile up over the whole history.
                extractor.repo.close()

            if line_relation_df.shape[0] == 0:
                continue

            line_relation_df[
                [
                    "method_id_old",
                    "method_name_old",
                    "method_id_new",
                    "method_name_new",
                ]
            ] = line_relation_df.apply(
                lambda row: _resolve_row_methods(
                    row,
                    filepath_id_dict,
                    commit_filepath_lineno_method_id_dict,
                    method_id_name_dict,
                ),
                axis=1,
                result_type="expand",
            )
            for change in _collect_method_changes(line_relation_df):
                mysqlOp.cursor.execute(
                    insert_sql,
                    (
                        change["method_id_1"],
                        change["method_id_2"],
                        change["change"],
                    ),
                )

    try:
        for commit_sha, commit_id in commit_sha_id_dict.items():
            handle_commit(commit_id=commit_id, commit=repo.object_store[commit_sha])
    finally:
        repo.close()

    # update steps table
    mysqlOp.cursor.execute(
        "update `{steps_tablename}` set handled=%s where step_name=%s".format(
            steps_tablename=steps_tablename
        ),
        (1, step_name),
    )
    mysqlOp.connection.commit()