Merge pull request 'Tasks completed as of August 21' (#43) from zy into master

MillerEvan 2022-09-02 23:13:41 +08:00
commit f2d9408d74
12 changed files with 556 additions and 42 deletions

.gitignore vendored (1 change)

@@ -144,3 +144,4 @@ config.yml
#repos
repos
delete_repos
test/


@@ -84,12 +84,13 @@ class CloneOperator(object):
if not handled:
p = subprocess.Popen(
"cd {target_folder} && java -jar {NIL_path} -s ./ -mit {mit} -mil {mil} -t {thread_num} -o result.csv".format(
"cd {target_folder} && java -jar {NIL_path} -s ./ -mit {mit} -mil {mil} -t {thread_num} -o result.csv -p {partition_num}".format(
target_folder=target_folder,
NIL_path=NIL_path,
mit=int(nil_config["mit"]),
mil=int(nil_config["mil"]),
thread_num=int(nil_config["thread_num"]),
partition_num=int(nil_config["partition_num"]),
),
shell=True,
stdout=subprocess.PIPE,
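Note: the new -p flag feeds NIL's partition count from the nil section of config.yml. A minimal standalone sketch of the assembled command, with a hypothetical jar path and target folder (the config keys mirror the diff):

import subprocess

nil_config = {"mit": 10, "mil": 6, "thread_num": 8, "partition_num": 4}  # illustrative values
cmd = (
    "cd {target_folder} && java -jar {NIL_path} -s ./ "
    "-mit {mit} -mil {mil} -t {thread_num} -o result.csv -p {partition_num}"
).format(
    target_folder="repos/apache/ant",  # hypothetical checkout folder
    NIL_path="/opt/NIL/NIL.jar",       # hypothetical jar location
    mit=int(nil_config["mit"]),
    mil=int(nil_config["mil"]),
    thread_num=int(nil_config["thread_num"]),
    partition_num=int(nil_config["partition_num"]),
)
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
out, _ = p.communicate()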


@@ -18,9 +18,9 @@ class FileOperator(object):
with open(self.path, "r") as file:
list = file.read().strip().splitlines()
for line in list:
ownername, reponame, git_url = line.split(" ")
ownername, reponame, git_url, id = line.split(" ")
repoInfo = RepoInfo(
ownername=ownername, reponame=reponame, git_url=git_url
id=id, ownername=ownername, reponame=reponame, git_url=git_url
)
result.append(repoInfo)
return result
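Note: each line of the repos file now ends with a numeric repository id. A minimal sketch of the new four-field line format, assuming single-space separators as in load_repos:

line = "apache ant git@github.com:apache/ant.git 11"  # sample repos-file line
ownername, reponame, git_url, id = line.split(" ")
assert (ownername, reponame, git_url, id) == ("apache", "ant", "git@github.com:apache/ant.git", "11")
# id arrives as a string; MySQL coerces it when the row is inserted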


@@ -1,4 +1,5 @@
import os
import pathlib
import subprocess
from typing import List
@@ -39,6 +40,48 @@ class GitOperator(object):
-
"""
# def find_blobs_in_tree(
# repo: Repo, commit: Commit, tree: Tree, relpath: bytes = b""
# ) -> List[BlobInfo]:
# """
# Function: iteratively find all the blobs for the target commit
# params:
# - commit: the target commit
# - tree: current Tree object in iteration
# - relpath: the relative path before this iteration
# return:
# - a list of BlobInfo objects regarding this commit
# """
# result = []
# for entry in tree.items():
# if (not repo.object_store.contains_loose(entry.sha)) and (
# not repo.object_store.contains_packed(entry.sha)
# ):
# # the object cannot be found in the repo
# return result
# obj = repo.object_store[entry.sha]
# new_relpath = os.path.join(relpath, entry.path)
# if obj.type_name == b"blob":
# result.append(
# BlobInfo(
# repo=repo, commit=commit, filepath=new_relpath, blob=obj
# )
# )
# elif obj.type_name == b"tree":
# new_tree = obj
# result.extend(
# find_blobs_in_tree(
# repo=repo,
# commit=commit,
# tree=new_tree,
# relpath=new_relpath,
# )
# )
# else:
# # there is something wrong with this tree object
# return result
# return result
def find_blobs_in_tree(
repo: Repo, commit: Commit, tree: Tree, relpath: bytes = b""
) -> List[BlobInfo]:
@@ -52,33 +95,19 @@ class GitOperator(object):
- a list of BlobInfo objects regarding this commit
"""
result = []
for entry in tree.items():
for entry in Repo(
self.repoInfo.bare_repo_path
).object_store.iter_tree_contents(commit.tree):
if (not repo.object_store.contains_loose(entry.sha)) and (
not repo.object_store.contains_packed(entry.sha)
):
# the object cannot be found in the repo
return result
continue
obj = repo.object_store[entry.sha]
new_relpath = os.path.join(relpath, entry.path)
if obj.type_name == b"blob":
result.append(
BlobInfo(
repo=repo, commit=commit, filepath=new_relpath, blob=obj
)
)
elif obj.type_name == b"tree":
new_tree = obj
result.extend(
find_blobs_in_tree(
repo=repo,
commit=commit,
tree=new_tree,
relpath=new_relpath,
)
)
else:
# there is something wrong with this tree object
return result
new_relpath = str(pathlib.Path(entry.path.decode())).encode()
result.append(
BlobInfo(repo=repo, commit=commit, filepath=new_relpath, blob=obj)
)
return result
blobInfos = find_blobs_in_tree(
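Note: the rewrite swaps the hand-rolled recursion for dulwich's flat tree iterator. A minimal standalone sketch of iter_tree_contents, assuming a hypothetical bare-repo path; each yielded entry carries path, mode, and sha:

import pathlib
from dulwich.repo import Repo

repo = Repo("repos/apache/ant.git")  # hypothetical bare repository path
commit = repo[repo.head()]
for entry in repo.object_store.iter_tree_contents(commit.tree):
    obj = repo.object_store[entry.sha]
    # normalize separators to the platform convention, as the diff does
    relpath = str(pathlib.Path(entry.path.decode())).encode()
    print(relpath, obj.type_name)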


@@ -278,12 +278,31 @@ def extract_method_function_relation(
if row["filepath_old"] is None:
filepath_id_old = np.nan
else:
filepath_id_old = filepath_id_dict[row["filepath_old"]]
if row["filepath_old"] not in filepath_id_dict:
"""
Some filepaths gotten by dulwich are different with the real filepaths
in the mysql database and the key names in filepath_id_dict. When this bug
happened we set filepath_id_old = None.
"""
filepath_id_old = None
else:
filepath_id_old = filepath_id_dict[row["filepath_old"]]
if row["filepath_new"] is None:
filepath_id_new = np.nan
else:
filepath_id_new = filepath_id_dict[row["filepath_new"]]
if row["filepath_new"] not in filepath_id_dict:
"""
Some filepaths gotten by dulwich are different with the real filepaths
in the mysql database and the key names in filepath_id_dict. When this bug
happened we set filepath_id_old = None.
Example: When deal with the repository git@github.com:apache/iotdb.git, a filepath in
filepath_id_dict is 'iotdb\\metrics\\interface\\src\\main\\java\\org\\apache\\iotdb\\metrics\\DoNothingMetricService.java'
while the filepath obtained by dulwich will ignore "iotdb\\"
"""
filepath_id_new = None
else:
filepath_id_new = filepath_id_dict[row["filepath_new"]]
commit_id_old = row["commit_id_old"]
commit_id_new = row["commit_id_new"]
@@ -438,7 +457,6 @@ def extract_method_function_relation(
),
(method_id_1, method_id_2, change_content),
)
mysqlOp.connection.commit()
for commit_sha, commit_id in commit_sha_id_dict.items():
handle_commit(commit_id=commit_id, commit=repo.object_store[commit_sha])
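Note: a suffix-based fallback could recover ids for the mismatched paths described above; this is a hypothetical workaround sketch, not what the diff does (the diff simply stores None):

import pathlib

def lookup_filepath_id(filepath, filepath_id_dict):
    # exact lookup first, then a unique path-suffix match
    if filepath in filepath_id_dict:
        return filepath_id_dict[filepath]
    parts = pathlib.PureWindowsPath(filepath).parts
    hits = [
        fid for known, fid in filepath_id_dict.items()
        if pathlib.PureWindowsPath(known).parts[-len(parts):] == parts
    ]
    return hits[0] if len(hits) == 1 else None

d = {r"iotdb\metrics\src\Foo.java": 7}
print(lookup_filepath_id(r"metrics\src\Foo.java", d))  # -> 7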


@@ -85,14 +85,29 @@ class MySQLOperator(object):
- repoInfos: a list of RepoInfo objects
"""
for repoInfo in repoInfos:
id = repoInfo.id
ownername = repoInfo.ownername
reponame = repoInfo.reponame
self.cursor.execute(
"insert ignore into repositories (ownername, reponame, handled) values (%s, %s, %s)",
(ownername, reponame, 0),
"insert ignore into repositories (id, ownername, reponame, handled) values (%s, %s, %s, %s)",
(id, ownername, reponame, 0),
)
self.connection.commit()
def update_handled_repository(self, repoInfo: RepoInfo):
"""
Function: insert all the repositories into repositories table
params:
- repoInfos: a list of RepoInfo objects
"""
ownername = repoInfo.ownername
reponame = repoInfo.reponame
self.cursor.execute(
"update repositories set handled = 1 where ownername=%s and reponame=%s",
(ownername, reponame),
)
self.connection.commit()
def init_steps_table(self, repoInfo: RepoInfo):
"""
Function: initialize the handled_repositories table
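Note: together these give an idempotent workflow: insert ignore seeds rows with handled = 0, and update_handled_repository flips the flag once a repo finishes. A hedged usage sketch (the insert method's real name is not visible in this hunk, so it is hypothetical here):

from FileOperator import FileOperator
from MySQLOperator import MySQLOperator

for repoInfo in FileOperator("repos").load_repos():
    mysqlOp = MySQLOperator(config_path="config.yml", repoInfo=repoInfo)
    mysqlOp.insert_repositories(repoInfos=[repoInfo])  # hypothetical name; seeds handled = 0
    # ... clone, detect clones, extract relations ...
    mysqlOp.update_handled_repository(repoInfo=repoInfo)  # sets handled = 1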


@@ -13,7 +13,9 @@ This is a project for finding factors related to bad clones.
- this project uses [MySQL](https://dev.mysql.com/downloads/) 8.0.30
- copy the configuration template and rename it with the command `cp ./config.template.yml ./config.yml`
- fill in each section of the config following the hints in the template
- Java
- To run the clone detector NIL, [JDK](https://www.oracle.com/java/technologies/downloads/) 1.8+ is needed.
## run the project
1. Start collecting data for repositories by running the following commands:
```


@@ -15,6 +15,7 @@ from GitOperator import GitOperator
from MethodFunctionRelationExtractor import extract_method_function_relation
from models.RepoInfo import RepoInfo
from MySQLOperator import MySQLOperator
from RiskEvaluator import evaluate_all_pairs
class RepoExecutorThread(threading.Thread):
@@ -173,5 +174,20 @@ class RepoExecutorThread(threading.Thread):
)
)
# evaluate the risk of the clone pairs
evaluate_all_pairs(repoInfo=repoInfo)
print(
"[Info]: Thread: {thread_name} finish evaluating all clone pairs in the whole repo: {git_url}".format(
thread_name=self.name, git_url=repoInfo.git_url
)
)
# mark the handled repository
mysqlOp.update_handled_repository(repoInfo=repoInfo)
print(
"[Info]: Thread: {thread_name} finish handling the whole repo: {git_url}".format(
thread_name=self.name, git_url=repoInfo.git_url
)
)
self.q.task_done()
print("[Info]: Exist thread: " + self.name)

RiskEvaluator.py (new file, 432 additions)

@@ -0,0 +1,432 @@
import re
from typing import Dict, List, Tuple
import pandas as pd
from sqlalchemy import create_engine
import GlobalConstants
from FileOperator import FileOperator
from models.RepoInfo import RepoInfo
from MySQLOperator import MySQLOperator
class RiskEvaluator(object):
def __init__(
self,
function_id_1: int,
function_id_2: int,
repoInfo: RepoInfo,
):
self.function_id_1 = function_id_1
self.function_id_2 = function_id_2
self.repoInfo = repoInfo
def evaluate(self, mysqlOp: MySQLOperator):
"""
Function: evaluate the risk of a clone pair in five steps:
1. get the number of changes of the clone pair
2. get the number of consistent changes of the clone pair
3. get the latency interval of the consistent changes of the clone class
4. check the commit messages of the consistent changes for bug fixes
5. evaluate the risk
"""
method_function_relations = mysqlOp.tablename_dict["method_function_relations"]
clone_relations_function = mysqlOp.tablename_dict["clone_relations_function"]
commits = mysqlOp.tablename_dict["commits"]
blob_methods = mysqlOp.tablename_dict["blob_methods"]
blob_commit_relations = mysqlOp.tablename_dict["blob_commit_relations"]
commit_relations = mysqlOp.tablename_dict["commit_relations"]
# Fix: only consider the lifetime of the clone pair (i.e. the commits in which the clone pair exists)
relate_commits = []
sql_clone_pairs = """
select commit_id from `{tablename}`
where (function_id_1 = {function_id_1} and function_id_2 = {function_id_2}) or
(function_id_1 = {function_id_2} and function_id_2 = {function_id_1})
""".format(
tablename=clone_relations_function,
function_id_1=self.function_id_1,
function_id_2=self.function_id_2,
)
mysqlOp.cursor.execute(sql_clone_pairs)
clone_pairs = mysqlOp.cursor.fetchall()
for clone_pair in clone_pairs:
relate_commits.append(clone_pair.get("commit_id"))
# get the related commits of the clone pair
candidate_commits = [
commit_id
for commit_id in range(min(relate_commits), max(relate_commits) + 1)
]
start_commits = []
end_commits = []
middle_commits = []
commit_children_dict = {} # find children by parent_id
commit_parents_dict = {} # find parents by child_id
mysqlOp.cursor.execute(
"select id, parent_id from `{commit_relations}`".format(
commit_relations=commit_relations
)
)
commit_relation_results = mysqlOp.cursor.fetchall()
for relation in commit_relation_results:
commit_id = relation["id"]
parent_id = relation["parent_id"]
commit_children_dict.setdefault(parent_id, [])
commit_children_dict[parent_id].append(commit_id)
commit_parents_dict.setdefault(commit_id, [])
commit_parents_dict[commit_id].append(parent_id)
# if the clone pair lives in only one commit, that commit is both the start commit and the end commit
if len(candidate_commits) == 1:
start_commits.append(candidate_commits[0])
end_commits.append(candidate_commits[0])
else:
for commit_id in candidate_commits:
if commit_id not in commit_parents_dict:
parent_ids = []
else:
parent_ids = commit_parents_dict[commit_id]
if commit_id not in commit_children_dict:
son_ids = []
else:
son_ids = commit_children_dict[commit_id]
intersect_parents = list(set(parent_ids) & set(candidate_commits))
intersect_children = list(set(son_ids) & set(candidate_commits))
# if no parent in candidate_commits & at least one child in candidate_commits & there exists clone relationship in this commit -> candidate_start
if (
len(intersect_parents) == 0
and len(intersect_children) > 0
and commit_id in relate_commits
):
start_commits.append(commit_id)
# if at least one parent in candidate_commits & no child in candidate_commits & there exists clone relationship in this commit -> candidate_end
elif (
len(intersect_parents) > 0
and len(intersect_children) == 0
and commit_id in relate_commits
):
end_commits.append(commit_id)
# if no parent in candidate_commits & no child in candidate_commits -> ignore
elif len(intersect_parents) == 0 and len(intersect_children) == 0:
continue
# if at least one parent in candidate_commits & at least one child in candidate_commits -> middle_commit
elif len(intersect_parents) > 0 and len(intersect_children) > 0:
middle_commits.append(commit_id)
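# Example: with candidate_commits = [5, 6, 7], edges 5 -> 6 -> 7, and clone
# relations observed in commits 5 and 7, commit 5 becomes a start commit,
# commit 7 an end commit, and commit 6 a middle commit.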
for start_commit in start_commits:
commit_parents_dict.setdefault(start_commit, [])
for end_commit in end_commits:
commit_children_dict.setdefault(end_commit, [])
# get the CpI
# find related method ids in commits
def find_related_methods(function_id: int) -> List[Tuple[int, int]]:
result = []
sql = """
select bm.id as method_id, bcr.commit_id
from `{blob_methods}` bm, `{blob_commit_relations}` bcr
where bm.blob_id=bcr.blob_id
and bm.function_id=%s
""".format(
blob_methods=blob_methods, blob_commit_relations=blob_commit_relations
)
mysqlOp.cursor.execute(sql, (function_id,))
methods = mysqlOp.cursor.fetchall()
for method in methods:
method_id = method["method_id"]
commit_id = method["commit_id"]
result.append((method_id, commit_id))
return result
candidate_methods_1 = find_related_methods(function_id=self.function_id_1)
candidate_methods_2 = find_related_methods(function_id=self.function_id_2)
def filter_candidate_methods(
candidate_methods: List[Tuple[int, int]], commit_ids: List[int]
) -> Tuple[List[int], Dict[int, List[int]]]:
"""
return:
- method ids
- {
method_id: [commit_ids] # a method can be related to multiple commits
}
"""
method_ids = []
method_commit_dict = {}
for candidate_method in candidate_methods:
if candidate_method[1] in commit_ids:
method_ids.append(candidate_method[0])
method_commit_dict.setdefault(candidate_method[0], [])
method_commit_dict[candidate_method[0]].append(candidate_method[1])
return list(set(method_ids)), method_commit_dict
all_methods_1, method_commit_dict_1 = filter_candidate_methods(
candidate_methods=candidate_methods_1,
commit_ids=list(
set(start_commits) | set(end_commits) | set(middle_commits)
),
)
all_methods_2, method_commit_dict_2 = filter_candidate_methods(
candidate_methods=candidate_methods_2,
commit_ids=list(
set(start_commits) | set(end_commits) | set(middle_commits)
),
)
# Find all changes during clone pair evolution
def get_method_change(
all_methods: List[int],
) -> List[Tuple[int, int, bytes, bytes]]:
"""
result:
[(
method_old,
method_new,
add_change,
delete_change
)]
"""
result_changes = []
all_methods_str = [str(method_id) for method_id in all_methods]
method_ids = "(" + ",".join(all_methods_str) + ")"
sql_change = """
select method_id_1, method_id_2, GROUP_CONCAT(distinct `change`) as `change` from `{tablename}`
where method_id_1 in {method_ids}
and method_id_2 in {method_ids}
and `change` is not null
group by method_id_1, method_id_2
""".format(
tablename=method_function_relations, method_ids=method_ids
)
mysqlOp.cursor.execute(sql_change)
changes = mysqlOp.cursor.fetchall()
def extract_changes(content: bytes) -> Tuple[bytes, bytes]:
"""
return:
- add contents
- delete contents
"""
add_contents = b""
delete_content = b""
lines = content.splitlines()
add_flag = False
for line in lines:
if line == b"ADD:":
add_flag = True
elif line == b"DELETE:":
add_flag = False
else:
if add_flag:
add_contents += b"".join(line.split())
else:
delete_content += b"".join(line.split())
return (add_contents, delete_content)
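# e.g. b"ADD:\n x=1\nDELETE:\n y=2" -> (b"x=1", b"y=2")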
for change in changes:
method_id_1 = change["method_id_1"]
method_id_2 = change["method_id_2"]
change = change["change"]
add_change, delete_change = extract_changes(content=change)
result_changes.append(
(method_id_1, method_id_2, add_change, delete_change)
)
return result_changes
result_changes_1 = get_method_change(all_methods=all_methods_1)
result_changes_2 = get_method_change(all_methods=all_methods_2)
sum_changes = len(result_changes_1) + len(result_changes_2)
# get CCR
# Find consistent_changes in all changes
consistent_change_list1 = []
consistent_change_list2 = []
consistent_changes = 0
for change_1 in result_changes_1:
for change_2 in result_changes_2:
if change_1[2] == change_2[2] and change_1[3] == change_2[3]:
consistent_change_list1.append(change_1)
consistent_change_list2.append(change_2)
consistent_changes = consistent_changes + 2
# get CCL
# Find Latency in different commits
def get_commit_change_by_method_change(
method_old: int, method_new: int, method_commit_dict: dict
) -> List[Tuple[int, int]]:
"""
Function: get the change of commits via the change of methods
return:
- [(
commit_old,
commit_new
)]
"""
result = []
commits_old = method_commit_dict[method_old]
commits_new = method_commit_dict[method_new]
for commit_old in commits_old:
children_old = commit_children_dict[commit_old]
intersect_commits = set(children_old) & set(commits_new)
for commit_id in intersect_commits:
result.append((commit_old, commit_id))
return result
target_commits = []
if consistent_changes == 0:
CCL = 0
else:
CCL = 0
for i in range(len(consistent_change_list1)):
change_1 = consistent_change_list1[i]
change_2 = consistent_change_list2[i]
method_old_1 = change_1[0]
method_new_1 = change_1[1]
method_old_2 = change_2[0]
method_new_2 = change_2[1]
commit_changes_1 = get_commit_change_by_method_change(
method_old=method_old_1,
method_new=method_new_1,
method_commit_dict=method_commit_dict_1,
)
commit_changes_2 = get_commit_change_by_method_change(
method_old=method_old_2,
method_new=method_new_2,
method_commit_dict=method_commit_dict_2,
)
consistent_change_commit_paths = list(
set(commit_changes_1) | set(commit_changes_2)
)
CCL += len(consistent_change_commit_paths)
target_commits.extend(
list(set([path[1] for path in consistent_change_commit_paths]))
)
# get bug_fix_num
if CCL == 0:
bug_fix_num = 0
else:
bug_fix_num = 0
for commit in target_commits:
sql_message = """
select message from `{tablename1}` where id = {id}
""".format(
tablename1=commits, id=commit
)
mysqlOp.cursor.execute(sql_message)
message = mysqlOp.cursor.fetchone()["message"].lower()
if (
re.search(
rb"(close|closes|closed|fix|fixes|fixed|resolve|resolves|resolved)\s+.*?#\d+",
message,
)
is not None
):
bug_fix_num += 1
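# e.g. a lowered message like b"fixes #123: guard against null tree" matches and counts as a bug fix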
def Harmness_Evaluating(CpI: int, CCR: int, CCL: int, bug_fix_num: int) -> int:
"""
Function: evaluate the harmfulness of a clone pair
input:
- CpI: Changes per clone Instance
- CCR: Consistent Change Ratio
- CCL: Consistent Change Latency
- bug_fix_num: the number of bug-fix commits
output:
- risk_level:
- 0 -> the clone is harmless
- 1 -> the clone is low risk
- 2 -> the clone is medium risk
- 3 -> the clone is high risk
if CpI == 0 or (CpI > 0 and CCR == 0):
risk_level = 0
else:
if CCL == 0:
risk_level = 1
else:
if bug_fix_num == 0:
risk_level = 2
else:
risk_level = 3
return risk_level
return Harmness_Evaluating(sum_changes, consistent_changes, CCL, bug_fix_num)
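# Worked example (illustrative numbers): sum_changes = 4, consistent_changes = 2,
# CCL = 1, bug_fix_num = 1 falls through to risk_level = 3 (high risk); with
# consistent_changes = 0 the first branch returns 0 (harmless).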
# Only for test
# repoInfos: List[RepoInfo] = FileOperator("repos").load_repos()
# for repoInfo in repoInfos:
# mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml", repoInfo=repoInfo)
# clone_pair = RiskEvaluator(
# 10, 9, repoInfo
# )
# print(clone_pair.evaluate(mysqlOp))
def evaluate_all_pairs(repoInfo):
mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml", repoInfo=repoInfo)
# clone_pair = RiskEvaluator(
# 10, 9, repoInfo
# )
# print(clone_pair.evaluate(mysqlOp))
clone_relations_function = mysqlOp.tablename_dict["clone_relations_function"]
sql_all_clones = """
select function_id_1,function_id_2 from `{tablename}`
""".format(
tablename=clone_relations_function
)
mysqlOp.cursor.execute(sql_all_clones)
all_clone_pairs = mysqlOp.cursor.fetchall()
evaluate_list = []
for clone_pair in all_clone_pairs:
function_id_1 = clone_pair.get("function_id_1")
function_id_2 = clone_pair.get("function_id_2")
clone_pair = RiskEvaluator(function_id_1, function_id_2, repoInfo)
risklevel = clone_pair.evaluate(mysqlOp)
pair = {
"function_id_1": function_id_1,
"function_id_2": function_id_2,
"risk_level": risklevel,
}
evaluate_list.append(pair)
result = pd.DataFrame(evaluate_list)
result_of_evaluator = "{repo_id}{separator}result_of_evaluator".format(
repo_id=repoInfo.id, separator=GlobalConstants.SEPARATOR
)
sql_result = """
create table if not exists `{tablename}` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`function_id_1` int(11) NULL,
`function_id_2` int(11) NULL,
`risk_level` int(11) NULL,
PRIMARY KEY (`id`),
INDEX(`function_id_1`) USING BTREE,
INDEX(`function_id_2`) USING BTREE,
INDEX(`risk_level`) USING BTREE
)
""".format(
tablename=result_of_evaluator
)
mysqlOp.cursor.execute(sql_result)
mysqlOp.truncate_table(tablename=result_of_evaluator)
config = mysqlOp.config["mysql"]
engine = create_engine(
"mysql+pymysql://{username}:{password}@{host}:{port}/{database}".format(
username=config["user"],
password=config["passwd"],
host=config["host"],
port=config["port"],
database=config["database"],
)
)
result.to_sql(
result_of_evaluator,
engine,
index=False,
if_exists="append",
)
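Note: a minimal driver for the new module, mirroring the commented-out test above (the repos file name is the one used elsewhere in the project):

from FileOperator import FileOperator
from RiskEvaluator import evaluate_all_pairs

for repoInfo in FileOperator("repos").load_repos():
    evaluate_all_pairs(repoInfo=repoInfo)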


@@ -1,5 +1,5 @@
apache ant git@github.com:apache/ant.git
apache dubbo git@github.com:apache/dubbo.git
apache kafka git@github.com:apache/kafka.git
apache maven git@github.com:apache/maven.git
apache rocketmq git@github.com:apache/rocketmq.git
apache ant git@github.com:apache/ant.git 11
apache dubbo git@github.com:apache/dubbo.git 12
apache kafka git@github.com:apache/kafka.git 13
apache maven git@github.com:apache/maven.git 14
apache rocketmq git@github.com:apache/rocketmq.git 15


@@ -1,5 +1,5 @@
apache ant git@github.com:apache/ant.git
apache dubbo git@github.com:apache/dubbo.git
apache kafka git@github.com:apache/kafka.git
apache maven git@github.com:apache/maven.git
apache rocketmq git@github.com:apache/rocketmq.git
apache ant git@github.com:apache/ant.git 11
apache dubbo git@github.com:apache/dubbo.git 12
apache kafka git@github.com:apache/kafka.git 13
apache maven git@github.com:apache/maven.git 14
apache rocketmq git@github.com:apache/rocketmq.git 15


@@ -2,7 +2,7 @@ CREATE TABLE IF NOT EXISTS `{tablename}` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`method_id_1` int(11) NULL,
`method_id_2` int(11) NULL,
`change` blob NULL, # set(block1, block2) a set of change, no order, and no whitespace in each block
`change` longblob NULL, # set(block1, block2) a set of change, no order, and no whitespace in each block
PRIMARY KEY (`id`),
INDEX(`method_id_1`) USING BTREE,
INDEX(`method_id_2`) USING BTREE