472 lines
19 KiB
Python
472 lines
19 KiB
Python
import json
|
|
import pathlib
|
|
import re
|
|
from cmath import isnan
|
|
from difflib import SequenceMatcher
|
|
from typing import List
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from dulwich import diff_tree
|
|
from dulwich.objects import Commit
|
|
from dulwich.repo import Repo
|
|
|
|
import GlobalConstants
|
|
from ConfigOperator import ConfigOperator
|
|
from GitOperator import GitOperator
|
|
from models.LineRelationInfo import LineRelationInfo
|
|
from models.MethodInfo import MethodInfo
|
|
from models.RepoInfo import RepoInfo
|
|
from MySQLOperator import MySQLOperator
|
|
|
|
|
|
class CommitLineRelationExtractor(object):
    """
    Extract line-number relations between a parent commit and a child commit.

    For every changed file the extractor diffs the parent blob against the
    child blob and records, line by line, which old line corresponds to which
    new line (or NaN when a line exists on one side only).
    """

    # Column layout of the frame returned by parse().
    _RESULT_COLUMNS = [
        "filepath_old",
        "filepath_new",
        "line_old",
        "line_new",
        "content_old",
        "content_new",
        "commit_id_old",
        "commit_id_new",
    ]

    # Change types listed in handle_tree_change's contract that can be diffed.
    _DIFFABLE_CHANGE_TYPES = (
        "add",
        "delete",
        "modify",
        "rename",
        "copy",
        "unchanged",
    )

    def __init__(
        self,
        repoInfo: "RepoInfo",
        commit: "Commit",
        commit_id: int,
        parent: "Commit",
        parent_id: int,
        tree_changes: "List[diff_tree.TreeChange]",
        configOp: "ConfigOperator",
    ):
        """
        params:
            - repoInfo: repository description; only ``bare_repo_path`` is read here
            - commit: the child commit object
            - commit_id: database id of the child commit
            - parent: the parent commit object
            - parent_id: database id of the parent commit
            - tree_changes: iterable of TreeChange objects between parent and child
            - configOp: object providing ``is_lang_supported(filepath=...)``

        The annotations are written as strings (forward references) so they
        are not evaluated when the class is defined.
        """
        self.repoInfo = repoInfo
        # NOTE(review): this Repo handle is never closed inside the class.
        self.repo = Repo(self.repoInfo.bare_repo_path)
        self.commit = commit
        self.commit_id = commit_id
        self.parent = parent
        self.parent_id = parent_id
        self.tree_changes = tree_changes
        self.configOp = configOp

    @staticmethod
    def _localize_path(raw_path: bytes) -> bytes:
        """Convert a path reported by dulwich into the local OS path style.

        Per the original author's note, dulwich always reports TreeChange
        paths with "/" separators, so the path is round-tripped through
        pathlib to get the platform's own separator.
        """
        return str(pathlib.Path(raw_path.decode())).encode()

    def _read_blob_lines(self, sha) -> list:
        """Return the lines of the blob ``sha``, or [] when ``sha`` is None."""
        if sha is None:
            return []
        return self.repo.object_store[sha].data.splitlines()

    def _touches_unsupported_language(self, tree_change) -> bool:
        """Tell whether the new or the old path is in an unsupported language."""
        for raw_path in (tree_change.new.path, tree_change.old.path):
            if raw_path is not None and not self.configOp.is_lang_supported(
                filepath=self._localize_path(raw_path)
            ):
                return True
        return False

    def extract_diff(self, old_content: list, new_content: list):
        """
        Function: extract the differences between two contents.

        Unchanged lines yield one row carrying both line numbers and no
        content. A removed (or replaced) old line yields a row with its old
        line number and old content; an inserted (or replacing) new line
        yields a row with its new line number and new content. The missing
        side of such a row is NaN (line number) / None (content).
        Line numbers are 1-based.

        params:
            - old_content: a list of lines (must be hashable, e.g. bytes/str)
            - new_content: a list of lines
        return:
            - df: pd.DataFrame: columns [line_old, line_new, content_old, content_new]
        raises:
            - Exception: if SequenceMatcher reports an unknown opcode tag
        """
        line_old_list = []
        line_new_list = []
        content_old_list = []
        content_new_list = []

        opcodes = SequenceMatcher(None, old_content, new_content).get_opcodes()
        for tag, i1, i2, j1, j2 in opcodes:
            if tag == "equal":
                for offset in range(i2 - i1):
                    line_old_list.append(i1 + offset + 1)
                    line_new_list.append(j1 + offset + 1)
                    content_old_list.append(None)
                    content_new_list.append(None)
            elif tag in ("insert", "delete", "replace"):
                # "insert" has an empty old range and "delete" an empty new
                # range, so all three tags reduce to: emit the old lines,
                # then emit the new lines.
                for old_index in range(i1, i2):
                    line_old_list.append(old_index + 1)
                    line_new_list.append(np.nan)
                    content_old_list.append(old_content[old_index])
                    content_new_list.append(None)
                for new_index in range(j1, j2):
                    line_old_list.append(np.nan)
                    line_new_list.append(new_index + 1)
                    content_old_list.append(None)
                    content_new_list.append(new_content[new_index])
            else:
                raise Exception("Function extract_diff Error: type error!")

        return pd.DataFrame.from_dict(
            {
                "line_old": line_old_list,
                "line_new": line_new_list,
                "content_old": content_old_list,
                "content_new": content_new_list,
            }
        )

    def handle_tree_change(self, tree_change: "diff_tree.TreeChange"):
        """
        Function: get the changed relative filepath and changed lines of a diff_tree.TreeChange object

        return:
            - df: pd.DataFrame: columns [line_old, line_new, content_old, content_new, filepath_old, filepath_new, commit_id_old, commit_id_new]
        raises:
            - ValueError: if the change type is none of the types listed below

        change_types:
            CHANGE_ADD = "add"
            CHANGE_MODIFY = "modify"
            CHANGE_DELETE = "delete"
            CHANGE_RENAME = "rename"
            CHANGE_COPY = "copy"
            CHANGE_UNCHANGED = "unchanged"
        """
        change_type = tree_change.type
        if change_type not in self._DIFFABLE_CHANGE_TYPES:
            raise ValueError(
                "Function handle_tree_change Error: unknown change type {!r}".format(
                    change_type
                )
            )

        # A side without a sha has no blob; treat it as an empty file.
        old_content = self._read_blob_lines(tree_change.old.sha)
        new_content = self._read_blob_lines(tree_change.new.sha)
        if change_type == "add":
            old_content = []
        elif change_type == "delete":
            new_content = []

        line_relation_df = self.extract_diff(
            old_content=old_content,
            new_content=new_content,
        )

        # dulwich's TreeChange path is always in Linux mode, Windows is not
        # supported. Therefore, we need to localize the filepath.
        old_path = tree_change.old.path
        new_path = tree_change.new.path
        filepath_old = None if old_path is None else self._localize_path(old_path)
        filepath_new = None if new_path is None else self._localize_path(new_path)

        line_relation_df["filepath_old"] = filepath_old
        line_relation_df["filepath_new"] = filepath_new
        line_relation_df["commit_id_old"] = self.parent_id
        line_relation_df["commit_id_new"] = self.commit_id
        return line_relation_df

    def parse(self) -> pd.DataFrame:
        """
        Function: extracting changes for List[diff_tree.TreeChange]

        Tree changes whose new or old path is in an unsupported language are
        skipped. The per-file frames are concatenated once at the end; each
        file's rows keep their own 0-based index, so index labels repeat
        across files.

        return:
            - df: pd.DataFrame with the columns filepath_old, filepath_new,
              line_old, line_new, content_old, content_new, commit_id_old,
              commit_id_new (empty when nothing was extracted)
        """
        frames = []
        for tree_change in self.tree_changes:
            if self._touches_unsupported_language(tree_change):
                continue  # the file language is not supported
            line_relation_df = self.handle_tree_change(tree_change)
            if line_relation_df.shape[0] > 0:
                frames.append(line_relation_df)

        if not frames:
            return pd.DataFrame(columns=self._RESULT_COLUMNS)

        # DataFrame.append was removed in pandas 2.0; one concat also avoids
        # re-copying the accumulated rows for every file.
        return pd.concat(frames)[self._RESULT_COLUMNS]
|
|
|
|
|
|
def _lookup_method_id(lineno_index, commit_id, filepath_id, line_no):
    """Return the method id owning ``line_no``, or NaN when there is none.

    ``lineno_index`` maps commit id -> filepath id -> line number -> method id.
    A NaN line number (the line does not exist on that side of the diff) or a
    miss at any level means the line is not related to a method.
    """
    if np.isnan(line_no):
        return np.nan
    return (
        lineno_index.get(commit_id, {})
        .get(filepath_id, {})
        .get(line_no, np.nan)
    )


def _resolve_row_methods(row, filepath_id_dict, lineno_index, method_id_name_dict):
    """Map one line-relation row to (method_id_old, method_name_old, method_id_new, method_name_new).

    Some filepaths reported by dulwich differ from the filepaths stored in the
    mysql database (the keys of ``filepath_id_dict``). Example from the
    original author: for git@github.com:apache/iotdb.git the database holds
    'iotdb\\metrics\\interface\\...\\DoNothingMetricService.java' while dulwich
    omits the leading "iotdb\\". Such paths resolve to a filepath id of None,
    which in turn resolves to "no method" (NaN id, None name).
    """
    resolved = []
    for side in ("old", "new"):
        filepath = row["filepath_" + side]
        if filepath is None:
            filepath_id = np.nan
        else:
            filepath_id = filepath_id_dict.get(filepath)  # None when unknown

        method_id = _lookup_method_id(
            lineno_index,
            row["commit_id_" + side],
            filepath_id,
            row["line_" + side],
        )
        method_name = None if np.isnan(method_id) else method_id_name_dict[method_id]
        resolved.extend((method_id, method_name))
    return tuple(resolved)


def _collect_method_changes(df):
    """
    Function: get the method_function_changes according to the df

    Two methods are considered related when at least one row links them on
    both sides under the same method name (i.e. they share a line).

    params:
        - df: the line change dataframe, already carrying the columns
          method_id_old, method_name_old, method_id_new, method_name_new
    return:
        - List[{
            "method_id_1": int, id of the method in the parent commit
            "method_id_2": int, id of the method in the child commit
            "change": None when no content changed, otherwise the joined
                      changed lines: b"ADD:", the added lines, b"DELETE:",
                      the deleted lines, separated by b"\\n"
          }]
    """
    related = df.loc[
        (df["method_name_old"].notnull())
        & (df["method_name_new"].notnull())
        & (df["method_name_old"] == df["method_name_new"])
    ]

    result = []
    for id_pair, _ in related.groupby(["method_id_old", "method_id_new"]):
        method_id_old = int(id_pair[0])
        method_id_new = int(id_pair[1])

        # Changed contents are gathered from the full frame, not only from
        # the rows that established the relation.
        deleted_lines = list(
            df[(df["method_id_old"] == method_id_old) & (df["content_old"].notnull())][
                "content_old"
            ]
        )
        added_lines = list(
            df[(df["method_id_new"] == method_id_new) & (df["content_new"].notnull())][
                "content_new"
            ]
        )

        if not deleted_lines and not added_lines:
            change = None
        else:
            change = b"\n".join([b"ADD:", *added_lines, b"DELETE:", *deleted_lines])

        result.append(
            {
                "method_id_1": method_id_old,
                "method_id_2": method_id_new,
                "change": change,
            }
        )
    return result


def _relate_commit_methods(
    commit_id,
    commit,
    repo,
    repoInfo,
    mysqlOp,
    configOp,
    filepath_id_dict,
    method_id_name_dict,
    insert_sql,
):
    """
    Function: extract the commit line change relations and store the resulting
    method relations

    params:
        - commit_id: the mysql id of Commit
        - commit: the Commit object
        - repo: the opened repository the commit belongs to
        - insert_sql: the parameterized insert statement for one relation row
    """
    parent_shas = commit.parents
    if not parent_shas:
        return  # a root commit has no parent to relate methods to

    # Resolve each parent id once; it is needed both for the index below and
    # for the extractor.
    parent_id_by_sha = {
        parent_sha: mysqlOp.get_commit_id_by_sha(sha=parent_sha)
        for parent_sha in parent_shas
    }

    # commit_id -> filepath id -> line no -> method id, for the commit itself
    # and for every parent known to the database.
    lineno_index = {
        commit_id: mysqlOp.get_fp_lineno_method_id_dict(commit_id=commit_id)
    }
    for parent_id in parent_id_by_sha.values():
        if parent_id is not None:
            lineno_index[parent_id] = mysqlOp.get_fp_lineno_method_id_dict(
                commit_id=parent_id
            )

    for parent_sha, parent_id in parent_id_by_sha.items():
        parent = repo.object_store[parent_sha]
        tree_changes = diff_tree.tree_changes(
            store=repo.object_store,
            tree1_id=parent.tree,
            tree2_id=commit.tree,
        )
        line_relation_df = CommitLineRelationExtractor(
            repoInfo=repoInfo,
            commit=commit,
            commit_id=commit_id,
            parent=parent,
            parent_id=parent_id,
            tree_changes=tree_changes,
            configOp=configOp,
        ).parse()

        if line_relation_df.shape[0] == 0:
            continue

        # The extractor's index labels repeat across files; use a unique index
        # so the column assignment below aligns row by row.
        line_relation_df = line_relation_df.reset_index(drop=True)
        line_relation_df[
            [
                "method_id_old",
                "method_name_old",
                "method_id_new",
                "method_name_new",
            ]
        ] = line_relation_df.apply(
            lambda row: _resolve_row_methods(
                row, filepath_id_dict, lineno_index, method_id_name_dict
            ),
            axis=1,
            result_type="expand",
        )

        for change in _collect_method_changes(line_relation_df):
            mysqlOp.cursor.execute(
                insert_sql,
                (change["method_id_1"], change["method_id_2"], change["change"]),
            )


def extract_method_function_relation(
    repoInfo: RepoInfo, mysqlOp: MySQLOperator, configOp: ConfigOperator
):
    """
    Function: relate every method of a commit to the corresponding method of
    its parent commit(s) and store the relations in the
    "method_function_relations" table.

    The step is skipped when the steps table already marks
    "method function extraction" as handled. Otherwise the relations table is
    truncated, rebuilt from all commits known to the database, the step is
    marked as handled and the transaction is committed.

    params:
        - repoInfo: repository description; ``bare_repo_path`` is opened with dulwich
        - mysqlOp: database access object (cursor, connection, table names, lookup helpers)
        - configOp: passed to CommitLineRelationExtractor for language filtering
    """
    step_name = "method function extraction"
    steps_tablename = mysqlOp.tablename_dict["steps"]

    mysqlOp.cursor.execute(
        "select handled from `{steps_tablename}` where step_name=%s".format(
            steps_tablename=steps_tablename
        ),
        (step_name,),  # must be a 1-tuple: "(step_name)" is just a string
    )
    handled = mysqlOp.cursor.fetchone()["handled"]
    if handled:
        return

    relations_tablename = mysqlOp.tablename_dict["method_function_relations"]
    mysqlOp.truncate_table(tablename=relations_tablename)

    # read filepath_id_dict
    filepath_id_dict = mysqlOp.get_filepath_id_dict()
    # read commit_sha_id_dict
    commit_sha_id_dict = mysqlOp.get_commit_sha_id_dict()
    # read method_id_name dict
    method_id_name_dict = mysqlOp.get_method_id_name_dict()

    insert_sql = "insert into `{method_function_relation_tablename}` (method_id_1, method_id_2, `change`) values (%s, %s, %s)".format(
        method_function_relation_tablename=relations_tablename
    )

    repo = Repo(repoInfo.bare_repo_path)
    try:
        for commit_sha, commit_id in commit_sha_id_dict.items():
            _relate_commit_methods(
                commit_id=commit_id,
                commit=repo.object_store[commit_sha],
                repo=repo,
                repoInfo=repoInfo,
                mysqlOp=mysqlOp,
                configOp=configOp,
                filepath_id_dict=filepath_id_dict,
                method_id_name_dict=method_id_name_dict,
                insert_sql=insert_sql,
            )
    finally:
        # Release the repository's file handles even if a commit fails.
        repo.close()

    # update steps table
    mysqlOp.cursor.execute(
        "update `{steps_tablename}` set handled=%s where step_name=%s".format(
            steps_tablename=steps_tablename
        ),
        (1, step_name),
    )
    mysqlOp.connection.commit()
|