Merge pull request '截至到8月21日完成的任务' (#43) from zy into master
This commit is contained in:
commit
f2d9408d74
|
@ -144,3 +144,4 @@ config.yml
|
|||
#repos
|
||||
repos
|
||||
delete_repos
|
||||
test/
|
||||
|
|
|
@ -84,12 +84,13 @@ class CloneOperator(object):
|
|||
|
||||
if not handled:
|
||||
p = subprocess.Popen(
|
||||
"cd {target_folder} && java -jar {NIL_path} -s ./ -mit {mit} -mil {mil} -t {thread_num} -o result.csv".format(
|
||||
"cd {target_folder} && java -jar {NIL_path} -s ./ -mit {mit} -mil {mil} -t {thread_num} -o result.csv -p {partition_num}".format(
|
||||
target_folder=target_folder,
|
||||
NIL_path=NIL_path,
|
||||
mit=int(nil_config["mit"]),
|
||||
mil=int(nil_config["mil"]),
|
||||
thread_num=int(nil_config["thread_num"]),
|
||||
partition_num=int(nil_config["partition_num"]),
|
||||
),
|
||||
shell=True,
|
||||
stdout=subprocess.PIPE,
|
||||
|
|
|
@ -18,9 +18,9 @@ class FileOperator(object):
|
|||
with open(self.path, "r") as file:
|
||||
list = file.read().strip().splitlines()
|
||||
for line in list:
|
||||
ownername, reponame, git_url = line.split(" ")
|
||||
ownername, reponame, git_url, id = line.split(" ")
|
||||
repoInfo = RepoInfo(
|
||||
ownername=ownername, reponame=reponame, git_url=git_url
|
||||
id=id, ownername=ownername, reponame=reponame, git_url=git_url
|
||||
)
|
||||
result.append(repoInfo)
|
||||
return result
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
from typing import List
|
||||
|
||||
|
@ -39,6 +40,48 @@ class GitOperator(object):
|
|||
-
|
||||
"""
|
||||
|
||||
# def find_blobs_in_tree(
|
||||
# repo: Repo, commit: Commit, tree: Tree, relpath: bytes = b""
|
||||
# ) -> List[BlobInfo]:
|
||||
# """
|
||||
# Function: iterately find all the blobs for the target commit
|
||||
# params:
|
||||
# - commit: the target commit
|
||||
# - tree: current Tree object in iteration
|
||||
# - relpath: the relative path before this iteration
|
||||
# return:
|
||||
# - a list of BlobInfo objects regarding this commit
|
||||
# """
|
||||
# result = []
|
||||
# for entry in tree.items():
|
||||
# if (not repo.object_store.contains_loose(entry.sha)) and (
|
||||
# not repo.object_store.contains_packed(entry.sha)
|
||||
# ):
|
||||
# # the object cannot be found in the repo
|
||||
# return result
|
||||
# obj = repo.object_store[entry.sha]
|
||||
# new_relpath = os.path.join(relpath, entry.path)
|
||||
# if obj.type_name == b"blob":
|
||||
# result.append(
|
||||
# BlobInfo(
|
||||
# repo=repo, commit=commit, filepath=new_relpath, blob=obj
|
||||
# )
|
||||
# )
|
||||
# elif obj.type_name == b"tree":
|
||||
# new_tree = obj
|
||||
# result.extend(
|
||||
# find_blobs_in_tree(
|
||||
# repo=repo,
|
||||
# commit=commit,
|
||||
# tree=new_tree,
|
||||
# relpath=new_relpath,
|
||||
# )
|
||||
# )
|
||||
# else:
|
||||
# # there is something wrong with this tree object
|
||||
# return result
|
||||
# return result
|
||||
|
||||
def find_blobs_in_tree(
|
||||
repo: Repo, commit: Commit, tree: Tree, relpath: bytes = b""
|
||||
) -> List[BlobInfo]:
|
||||
|
@ -52,33 +95,19 @@ class GitOperator(object):
|
|||
- a list of BlobInfo objects regarding this commit
|
||||
"""
|
||||
result = []
|
||||
for entry in tree.items():
|
||||
for entry in Repo(
|
||||
self.repoInfo.bare_repo_path
|
||||
).object_store.iter_tree_contents(commit.tree):
|
||||
if (not repo.object_store.contains_loose(entry.sha)) and (
|
||||
not repo.object_store.contains_packed(entry.sha)
|
||||
):
|
||||
# the object cannot be found in the repo
|
||||
return result
|
||||
continue
|
||||
obj = repo.object_store[entry.sha]
|
||||
new_relpath = os.path.join(relpath, entry.path)
|
||||
if obj.type_name == b"blob":
|
||||
result.append(
|
||||
BlobInfo(
|
||||
repo=repo, commit=commit, filepath=new_relpath, blob=obj
|
||||
)
|
||||
)
|
||||
elif obj.type_name == b"tree":
|
||||
new_tree = obj
|
||||
result.extend(
|
||||
find_blobs_in_tree(
|
||||
repo=repo,
|
||||
commit=commit,
|
||||
tree=new_tree,
|
||||
relpath=new_relpath,
|
||||
)
|
||||
)
|
||||
else:
|
||||
# there is something wrong with this tree object
|
||||
return result
|
||||
new_relpath = str(pathlib.Path(entry.path.decode())).encode()
|
||||
result.append(
|
||||
BlobInfo(repo=repo, commit=commit, filepath=new_relpath, blob=obj)
|
||||
)
|
||||
return result
|
||||
|
||||
blobInfos = find_blobs_in_tree(
|
||||
|
|
|
@ -278,12 +278,31 @@ def extract_method_function_relation(
|
|||
if row["filepath_old"] is None:
|
||||
filepath_id_old = np.nan
|
||||
else:
|
||||
filepath_id_old = filepath_id_dict[row["filepath_old"]]
|
||||
if row["filepath_old"] not in filepath_id_dict:
|
||||
"""
|
||||
Some filepaths gotten by dulwich are different with the real filepaths
|
||||
in the mysql database and the key names in filepath_id_dict. When this bug
|
||||
happened we set filepath_id_old = None.
|
||||
"""
|
||||
filepath_id_old = None
|
||||
else:
|
||||
filepath_id_old = filepath_id_dict[row["filepath_old"]]
|
||||
|
||||
if row["filepath_new"] is None:
|
||||
filepath_id_new = np.nan
|
||||
else:
|
||||
filepath_id_new = filepath_id_dict[row["filepath_new"]]
|
||||
if row["filepath_new"] not in filepath_id_dict:
|
||||
"""
|
||||
Some filepaths gotten by dulwich are different with the real filepaths
|
||||
in the mysql database and the key names in filepath_id_dict. When this bug
|
||||
happened we set filepath_id_new = None.
|
||||
Example: When deal with the repository git@github.com:apache/iotdb.git, a filepath in
|
||||
filepath_id_dict is 'iotdb\\metrics\\interface\\src\\main\\java\\org\\apache\\iotdb\\metrics\\DoNothingMetricService.java'
|
||||
while the filepath obtained by dulwich will ignore "iotdb\\"
|
||||
"""
|
||||
filepath_id_new = None
|
||||
else:
|
||||
filepath_id_new = filepath_id_dict[row["filepath_new"]]
|
||||
|
||||
commit_id_old = row["commit_id_old"]
|
||||
commit_id_new = row["commit_id_new"]
|
||||
|
@ -438,7 +457,6 @@ def extract_method_function_relation(
|
|||
),
|
||||
(method_id_1, method_id_2, change_content),
|
||||
)
|
||||
mysqlOp.connection.commit()
|
||||
|
||||
for commit_sha, commit_id in commit_sha_id_dict.items():
|
||||
handle_commit(commit_id=commit_id, commit=repo.object_store[commit_sha])
|
||||
|
|
|
@ -85,14 +85,29 @@ class MySQLOperator(object):
|
|||
- repoInfos: a list of RepoInfo objects
|
||||
"""
|
||||
for repoInfo in repoInfos:
|
||||
id = repoInfo.id
|
||||
ownername = repoInfo.ownername
|
||||
reponame = repoInfo.reponame
|
||||
self.cursor.execute(
|
||||
"insert ignore into repositories (ownername, reponame, handled) values (%s, %s, %s)",
|
||||
(ownername, reponame, 0),
|
||||
"insert ignore into repositories (id, ownername, reponame, handled) values (%s, %s, %s, %s)",
|
||||
(id, ownername, reponame, 0),
|
||||
)
|
||||
self.connection.commit()
|
||||
|
||||
def update_handled_repository(self, repoInfo: RepoInfo):
    """
    Function: mark one repository as handled in the repositories table
    params:
        - repoInfo: the RepoInfo object whose ownername and reponame
          select the row to update
    """
    statement = "update repositories set handled = 1 where ownername=%s and reponame=%s"
    owner_and_repo = (repoInfo.ownername, repoInfo.reponame)
    self.cursor.execute(statement, owner_and_repo)
    # commit immediately so the flag is stored as soon as the repo is done
    self.connection.commit()
|
||||
|
||||
def init_steps_table(self, repoInfo: RepoInfo):
|
||||
"""
|
||||
Function: initialize the handled_repositories table
|
||||
|
|
|
@ -13,7 +13,9 @@ This is a project for finding factors related to bad clones.
|
|||
- this project uses [MySQL](https://dev.mysql.com/downloads/) 8.0.30
|
||||
- copy the configuration template and rename it using the command `cp ./config.template.yml ./config.yml`
|
||||
- set the section of the config with the hints in the template
|
||||
|
||||
- Java
|
||||
- To run the clone detector NIL, [jdk](https://www.oracle.com/java/technologies/downloads/) 1.8+ is needed.
|
||||
|
||||
## run the project
|
||||
1. Start collecting data for repositories by running the following commands:
|
||||
```
|
||||
|
|
|
@ -15,6 +15,7 @@ from GitOperator import GitOperator
|
|||
from MethodFunctionRelationExtractor import extract_method_function_relation
|
||||
from models.RepoInfo import RepoInfo
|
||||
from MySQLOperator import MySQLOperator
|
||||
from RiskEvaluator import evaluate_all_pairs
|
||||
|
||||
|
||||
class RepoExecutorThread(threading.Thread):
|
||||
|
@ -173,5 +174,20 @@ class RepoExecutorThread(threading.Thread):
|
|||
)
|
||||
)
|
||||
|
||||
# evaluate the risk of the clone pairs
|
||||
evaluate_all_pairs(repoInfo=repoInfo)
|
||||
print(
|
||||
"[Info]: Thread: {thread_name} finish evaluating all clone pairs in the whole repo: {git_url}".format(
|
||||
thread_name=self.name, git_url=repoInfo.git_url
|
||||
)
|
||||
)
|
||||
|
||||
# mark the handled repository
|
||||
mysqlOp.update_handled_repository(repoInfo=repoInfo)
|
||||
print(
|
||||
"[Info]: Thread: {thread_name} finish handling the whole repo: {git_url}".format(
|
||||
thread_name=self.name, git_url=repoInfo.git_url
|
||||
)
|
||||
)
|
||||
self.q.task_done()
|
||||
print("[Info]: Exist thread: " + self.name)
|
||||
|
|
|
@ -0,0 +1,432 @@
|
|||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
import GlobalConstants
|
||||
from FileOperator import FileOperator
|
||||
from models.RepoInfo import RepoInfo
|
||||
from MySQLOperator import MySQLOperator
|
||||
|
||||
|
||||
class RiskEvaluator(object):
|
||||
def __init__(
|
||||
self,
|
||||
function_id_1: int,
|
||||
function_id_2: int,
|
||||
repoInfo: RepoInfo,
|
||||
):
|
||||
self.function_id_1 = function_id_1
|
||||
self.function_id_2 = function_id_2
|
||||
self.repoInfo = repoInfo
|
||||
|
||||
def evaluate(self, mysqlOp: MySQLOperator):
|
||||
"""
|
||||
Function: evaluate the risk of a clone pair in five steps:
|
||||
1.get the change num of clone pair
|
||||
2.get the consistent change num of clone pair
|
||||
3.get the interval of consistent change of clone class
|
||||
4.get the fix message of commit of consistent change
|
||||
5.evaluate the risk
|
||||
"""
|
||||
method_function_relations = mysqlOp.tablename_dict["method_function_relations"]
|
||||
clone_relations_function = mysqlOp.tablename_dict["clone_relations_function"]
|
||||
commits = mysqlOp.tablename_dict["commits"]
|
||||
blob_methods = mysqlOp.tablename_dict["blob_methods"]
|
||||
blob_commit_relations = mysqlOp.tablename_dict["blob_commit_relations"]
|
||||
commit_relations = mysqlOp.tablename_dict["commit_relations"]
|
||||
# Fix: only consider the life time of clone pair(clone pair existing commits)
|
||||
relate_commits = []
|
||||
sql_clone_pairs = """
|
||||
select commit_id from `{tablename}`
|
||||
where (function_id_1 = {function_id_1} and function_id_2 = {function_id_2}) or
|
||||
(function_id_1 = {function_id_2} and function_id_2 = {function_id_1})
|
||||
""".format(
|
||||
tablename=clone_relations_function,
|
||||
function_id_1=self.function_id_1,
|
||||
function_id_2=self.function_id_2,
|
||||
)
|
||||
mysqlOp.cursor.execute(sql_clone_pairs)
|
||||
clone_pairs = mysqlOp.cursor.fetchall()
|
||||
for clone_pair in clone_pairs:
|
||||
relate_commits.append(clone_pair.get("commit_id"))
|
||||
# get the related commits of clone pair
|
||||
candidate_commits = [
|
||||
commit_id
|
||||
for commit_id in range(min(relate_commits), max(relate_commits) + 1)
|
||||
]
|
||||
|
||||
start_commits = []
|
||||
end_commits = []
|
||||
middle_commits = []
|
||||
|
||||
commit_children_dict = {} # find children by parent_id
|
||||
commit_parents_dict = {} # find parents by child_id
|
||||
mysqlOp.cursor.execute(
|
||||
"select id, parent_id from `{commit_relations}`".format(
|
||||
commit_relations=commit_relations
|
||||
)
|
||||
)
|
||||
commit_relation_results = mysqlOp.cursor.fetchall()
|
||||
for relation in commit_relation_results:
|
||||
commit_id = relation["id"]
|
||||
parent_id = relation["parent_id"]
|
||||
commit_children_dict.setdefault(parent_id, [])
|
||||
commit_children_dict[parent_id].append(commit_id)
|
||||
commit_parents_dict.setdefault(commit_id, [])
|
||||
commit_parents_dict[commit_id].append(parent_id)
|
||||
|
||||
# if the clone pair only live in one commit, this commit belongs to both start commit and end commit
|
||||
if len(candidate_commits) == 1:
|
||||
start_commits.append(candidate_commits[0])
|
||||
end_commits.append(candidate_commits[0])
|
||||
else:
|
||||
for commit_id in candidate_commits:
|
||||
if commit_id not in commit_parents_dict:
|
||||
parent_ids = []
|
||||
else:
|
||||
parent_ids = commit_parents_dict[commit_id]
|
||||
|
||||
if commit_id not in commit_children_dict:
|
||||
son_ids = []
|
||||
else:
|
||||
son_ids = commit_children_dict[commit_id]
|
||||
|
||||
intersect_parents = list(set(parent_ids) & set(candidate_commits))
|
||||
intersect_children = list(set(son_ids) & set(candidate_commits))
|
||||
# if no parent in candidate_commits & at least one child in candidate_commits & there exists clone relationship in this commit -> candidate_start
|
||||
if (
|
||||
len(intersect_parents) == 0
|
||||
and len(intersect_children) > 0
|
||||
and commit_id in relate_commits
|
||||
):
|
||||
start_commits.append(commit_id)
|
||||
# if at least one parent in candidate_commits & no child in candidate_commits & there exists clone relationship in this commit -> candidate_end
|
||||
elif (
|
||||
len(intersect_parents) > 0
|
||||
and len(intersect_children) == 0
|
||||
and commit_id in relate_commits
|
||||
):
|
||||
end_commits.append(commit_id)
|
||||
# if no parent in candidate_commits & no child in candidate_commits -> ignore
|
||||
elif len(intersect_parents) == 0 and len(intersect_children) == 0:
|
||||
continue
|
||||
# if at least one parent in candidate_commits & at least one child in candidate_commits -> middle_commit
|
||||
elif len(intersect_parents) > 0 and len(intersect_children) > 0:
|
||||
middle_commits.append(commit_id)
|
||||
for start_commit in start_commits:
|
||||
commit_parents_dict.setdefault(start_commit, [])
|
||||
for end_commit in end_commits:
|
||||
commit_children_dict.setdefault(end_commit, [])
|
||||
|
||||
# get the CpI
|
||||
# find related method ids in commits
|
||||
def find_related_methods(function_id: int) -> List[Tuple[int, int]]:
    """
    Function: list every (method, commit) occurrence of one function
    params:
        - function_id: value matched against blob_methods.function_id
    return:
        - [(method_id, commit_id)], one tuple per row of the join between
          the blob_methods and blob_commit_relations tables
    """
    query = """
    select bm.id as method_id, bcr.commit_id
    from `{blob_methods}` bm, `{blob_commit_relations}` bcr
    where bm.blob_id=bcr.blob_id
    and bm.function_id=%s
    """.format(
        blob_methods=blob_methods, blob_commit_relations=blob_commit_relations
    )
    # table names are formatted in; the function id is passed as a bound parameter
    mysqlOp.cursor.execute(query, (function_id,))
    return [
        (row["method_id"], row["commit_id"]) for row in mysqlOp.cursor.fetchall()
    ]
|
||||
|
||||
candidate_methods_1 = find_related_methods(function_id=self.function_id_1)
|
||||
candidate_methods_2 = find_related_methods(function_id=self.function_id_2)
|
||||
|
||||
def filter_candidate_methods(
    candidate_methods: List[Tuple[int, int]], commit_ids: List[int]
) -> Tuple[List[int], Dict[int, List[int]]]:
    """
    Function: keep only the candidate methods that occur in the given commits
    params:
        - candidate_methods: (method_id, commit_id) tuples
        - commit_ids: the commits a method must occur in to be kept
    return:
        - method ids (each id once)
        - {
            method_id: [commit_ids]  # a method can be related to multiple commits
        }
    """
    # Build the lookup set once; testing against the list on every iteration
    # made the filter quadratic.
    wanted_commits = set(commit_ids)
    method_commit_dict: Dict[int, List[int]] = {}
    for method_id, commit_id in candidate_methods:
        if commit_id in wanted_commits:
            method_commit_dict.setdefault(method_id, []).append(commit_id)
    # The dict keys are exactly the kept method ids, already de-duplicated.
    return list(method_commit_dict), method_commit_dict
|
||||
|
||||
all_methods_1, method_commit_dict_1 = filter_candidate_methods(
|
||||
candidate_methods=candidate_methods_1,
|
||||
commit_ids=list(
|
||||
set(start_commits) | set(end_commits) | set(middle_commits)
|
||||
),
|
||||
)
|
||||
all_methods_2, method_commit_dict_2 = filter_candidate_methods(
|
||||
candidate_methods=candidate_methods_2,
|
||||
commit_ids=list(
|
||||
set(start_commits) | set(end_commits) | set(middle_commits)
|
||||
),
|
||||
)
|
||||
|
||||
# Find all changes during clone pair evolution
|
||||
def get_method_change(
    all_methods: List[int],
) -> List[Tuple[int, int, bytes, bytes]]:
    """
    Function: collect the recorded changes between the given methods
    params:
        - all_methods: method ids; only relations whose both ends are in
          this list are read
    return:
        [(
            method_old,
            method_new,
            add_change,
            delete_change
        )]
        An empty list is returned when all_methods is empty.
    """
    # An empty id list would be rendered as "in ()", which MySQL rejects as a
    # syntax error, so answer without querying.
    if not all_methods:
        return []

    method_ids = "(" + ",".join(str(method_id) for method_id in all_methods) + ")"
    # NOTE(review): MySQL truncates GROUP_CONCAT output at group_concat_max_len;
    # confirm the session limit is large enough for the stored changes.
    sql_change = """
    select method_id_1, method_id_2, GROUP_CONCAT(distinct `change`) as `change` from `{tablename}`
    where method_id_1 in {method_ids}
    and method_id_2 in {method_ids}
    and `change` is not null
    group by method_id_1, method_id_2
    """.format(
        tablename=method_function_relations, method_ids=method_ids
    )
    mysqlOp.cursor.execute(sql_change)
    changes = mysqlOp.cursor.fetchall()

    def extract_changes(content: bytes) -> Tuple[bytes, bytes]:
        """
        Function: split a stored change into its added and deleted parts
        return:
            - add contents
            - delete contents
        Lines after an "ADD:" marker go to the added part, every other line
        (including those after "DELETE:") to the deleted part; all whitespace
        is removed from each line.
        """
        add_parts = []
        delete_parts = []
        add_flag = False
        for line in content.splitlines():
            if line == b"ADD:":
                add_flag = True
            elif line == b"DELETE:":
                add_flag = False
            elif add_flag:
                add_parts.append(b"".join(line.split()))
            else:
                delete_parts.append(b"".join(line.split()))
        return (b"".join(add_parts), b"".join(delete_parts))

    result_changes = []
    for row in changes:
        add_change, delete_change = extract_changes(content=row["change"])
        result_changes.append(
            (row["method_id_1"], row["method_id_2"], add_change, delete_change)
        )
    return result_changes
|
||||
|
||||
result_changes_1 = get_method_change(all_methods=all_methods_1)
|
||||
result_changes_2 = get_method_change(all_methods=all_methods_2)
|
||||
|
||||
sum_changes = len(result_changes_1) + len(result_changes_2)
|
||||
|
||||
# get CCR
|
||||
# Find consistent_changes in all changes
|
||||
consistent_change_list1 = []
|
||||
consistent_change_list2 = []
|
||||
consistent_changes = 0
|
||||
for change_1 in result_changes_1:
|
||||
for change_2 in result_changes_2:
|
||||
if change_1[2] == change_2[2] and change_1[3] == change_2[3]:
|
||||
consistent_change_list1.append(change_1)
|
||||
consistent_change_list2.append(change_2)
|
||||
consistent_changes = consistent_changes + 2
|
||||
|
||||
# get CCL
|
||||
# Find Latency in different commits
|
||||
def get_commit_change_by_method_change(
    method_old: int, method_new: int, method_commit_dict: dict
) -> List[Tuple[int, int]]:
    """
    Function: translate a method-level change into commit-level changes
    params:
        - method_old: id of the method before the change
        - method_new: id of the method after the change
        - method_commit_dict: {method_id: [commit_ids]}
    return:
        - [(
            commit_old,
            commit_new
        )]
        one tuple for each commit of method_old and each of its children
        that is also a commit of method_new
    """
    commits_of_old = method_commit_dict[method_old]
    commits_of_new = method_commit_dict[method_new]
    return [
        (parent_commit, child_commit)
        for parent_commit in commits_of_old
        for child_commit in set(commit_children_dict[parent_commit])
        & set(commits_of_new)
    ]
|
||||
|
||||
target_commits = []
|
||||
if consistent_changes == 0:
|
||||
CCL = 0
|
||||
else:
|
||||
CCL = 0
|
||||
for i in range(len(consistent_change_list1)):
|
||||
change_1 = consistent_change_list1[i]
|
||||
change_2 = consistent_change_list2[i]
|
||||
method_old_1 = change_1[0]
|
||||
method_new_1 = change_1[1]
|
||||
method_old_2 = change_2[0]
|
||||
method_new_2 = change_2[1]
|
||||
|
||||
commit_changes_1 = get_commit_change_by_method_change(
|
||||
method_old=method_old_1,
|
||||
method_new=method_new_1,
|
||||
method_commit_dict=method_commit_dict_1,
|
||||
)
|
||||
commit_changes_2 = get_commit_change_by_method_change(
|
||||
method_old=method_old_2,
|
||||
method_new=method_new_2,
|
||||
method_commit_dict=method_commit_dict_2,
|
||||
)
|
||||
|
||||
consistent_change_commit_paths = list(
|
||||
set(commit_changes_1) | set(commit_changes_2)
|
||||
)
|
||||
CCL += len(consistent_change_commit_paths)
|
||||
|
||||
target_commits.extend(
|
||||
list(set([path[1] for path in consistent_change_commit_paths]))
|
||||
)
|
||||
|
||||
# get bug_fix_num
|
||||
if CCL == 0:
|
||||
bug_fix_num = 0
|
||||
else:
|
||||
bug_fix_num = 0
|
||||
for commit in target_commits:
|
||||
sql_message = """
|
||||
select message from `{tablename1}` where id = {id}
|
||||
""".format(
|
||||
tablename1=commits, id=commit
|
||||
)
|
||||
mysqlOp.cursor.execute(sql_message)
|
||||
message = mysqlOp.cursor.fetchone()["message"].lower()
|
||||
if (
|
||||
re.search(
|
||||
rb"(close|closes|closed|fix|fixes|fixed|resolve|resolves|resolved)\s+.*?#\d+",
|
||||
message,
|
||||
)
|
||||
is not None
|
||||
):
|
||||
bug_fix_num += 1
|
||||
|
||||
def Harmness_Evaluating(CpI: int, CCR: int, CCL: int, bug_fix_num: int) -> int:
    """
    Function: evaluate the harmness of a clone
    input:
        - CpI: Changes per clone Instance
        - CCR: Consistent Change Ratio
        - CCL: Consistent Change Latency
        - bug_fix_num: the number of bug_fix commit
    output:
        - risk_level:
            - 0 -> no change at all, or no consistent change
            - 1 -> consistent changes without latency
            - 2 -> consistent changes with latency, no bug-fix commit
            - 3 -> consistent changes with latency and a bug-fix commit
        NOTE(review): the earlier documentation numbered these levels 1-4
        (harmless / low / medium / high risky) while the code has always
        returned 0-3; the returned values are kept unchanged. Confirm which
        numbering consumers of the result expect.
    """
    # Logical operators are required here: the bitwise `|` and `&` bind tighter
    # than `==` and `>`, which turned the old test into "CpI is 0 or 1" and
    # ignored CCR altogether.
    if CpI == 0 or (CpI > 0 and CCR == 0):
        return 0
    if CCL == 0:
        return 1
    if bug_fix_num == 0:
        return 2
    return 3
|
||||
|
||||
return Harmness_Evaluating(sum_changes, consistent_changes, CCL, bug_fix_num)
|
||||
|
||||
|
||||
# Only for test
|
||||
# repoInfos: List[RepoInfo] = FileOperator("repos").load_repos()
|
||||
# for repoInfo in repoInfos:
|
||||
# mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml", repoInfo=repoInfo)
|
||||
# clone_pair = RiskEvaluator(
|
||||
# 10, 9, repoInfo
|
||||
# )
|
||||
# print(clone_pair.evaluate(mysqlOp))
|
||||
def evaluate_all_pairs(repoInfo):
    """
    Function: evaluate the risk level of every clone pair of a repository and
    store the results in the `<repo id><separator>result_of_evaluator` table
    params:
        - repoInfo: the RepoInfo object of the repository to evaluate
    The result table is created if missing and truncated before the new
    results are appended.
    """
    mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml", repoInfo=repoInfo)
    clone_relations_function = mysqlOp.tablename_dict["clone_relations_function"]

    # The clone relation table stores one row per commit in which a pair
    # exists; `distinct` makes sure each pair is evaluated and stored once.
    sql_all_clones = """
    select distinct function_id_1,function_id_2 from `{tablename}`
    """.format(
        tablename=clone_relations_function
    )
    mysqlOp.cursor.execute(sql_all_clones)
    all_clone_pairs = mysqlOp.cursor.fetchall()

    evaluate_list = []
    for pair_row in all_clone_pairs:
        function_id_1 = pair_row.get("function_id_1")
        function_id_2 = pair_row.get("function_id_2")
        evaluator = RiskEvaluator(function_id_1, function_id_2, repoInfo)
        evaluate_list.append(
            {
                "function_id_1": function_id_1,
                "function_id_2": function_id_2,
                "risk_level": evaluator.evaluate(mysqlOp),
            }
        )
    # Explicit columns keep the frame well-formed when there are no pairs.
    result = pd.DataFrame(
        evaluate_list, columns=["function_id_1", "function_id_2", "risk_level"]
    )

    result_of_evaluator = "{repo_id}{separator}result_of_evaluator".format(
        repo_id=repoInfo.id, separator=GlobalConstants.SEPARATOR
    )
    sql_result = """
    create table if not exists `{tablename}` (
        `id` int(11) NOT NULL AUTO_INCREMENT,
        `function_id_1` int(11) NULL,
        `function_id_2` int(11) NULL,
        `risk_level` int(11) NULL,
        PRIMARY KEY (`id`),
        INDEX(`function_id_1`) USING BTREE,
        INDEX(`function_id_2`) USING BTREE,
        INDEX(`risk_level`) USING BTREE
    )
    """.format(
        tablename=result_of_evaluator
    )
    mysqlOp.cursor.execute(sql_result)
    mysqlOp.truncate_table(tablename=result_of_evaluator)

    config = mysqlOp.config["mysql"]
    engine = create_engine(
        "mysql+pymysql://{username}:{password}@{host}:{port}/{database}".format(
            username=config["user"],
            password=config["passwd"],
            host=config["host"],
            port=config["port"],
            database=config["database"],
        )
    )
    try:
        result.to_sql(
            result_of_evaluator,
            engine,
            index=False,
            if_exists="append",
        )
    finally:
        # an engine is created on every call; release its pooled connections
        engine.dispose()
|
|
@ -1,5 +1,5 @@
|
|||
apache ant git@github.com:apache/ant.git
|
||||
apache dubbo git@github.com:apache/dubbo.git
|
||||
apache kafka git@github.com:apache/kafka.git
|
||||
apache maven git@github.com:apache/maven.git
|
||||
apache rocketmq git@github.com:apache/rocketmq.git
|
||||
apache ant git@github.com:apache/ant.git 11
|
||||
apache dubbo git@github.com:apache/dubbo.git 12
|
||||
apache kafka git@github.com:apache/kafka.git 13
|
||||
apache maven git@github.com:apache/maven.git 14
|
||||
apache rocketmq git@github.com:apache/rocketmq.git 15
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
apache ant git@github.com:apache/ant.git
|
||||
apache dubbo git@github.com:apache/dubbo.git
|
||||
apache kafka git@github.com:apache/kafka.git
|
||||
apache maven git@github.com:apache/maven.git
|
||||
apache rocketmq git@github.com:apache/rocketmq.git
|
||||
apache ant git@github.com:apache/ant.git 11
|
||||
apache dubbo git@github.com:apache/dubbo.git 12
|
||||
apache kafka git@github.com:apache/kafka.git 13
|
||||
apache maven git@github.com:apache/maven.git 14
|
||||
apache rocketmq git@github.com:apache/rocketmq.git 15
|
||||
|
|
|
@ -2,7 +2,7 @@ CREATE TABLE IF NOT EXISTS `{tablename}` (
|
|||
`id` int(11) NOT NULL AUTO_INCREMENT,
|
||||
`method_id_1` int(11) NULL,
|
||||
`method_id_2` int(11) NULL,
|
||||
`change` blob NULL, # set(block1, block2) a set of change, no order, and no whitespace in each block
|
||||
`change` longblob NULL, # set(block1, block2) a set of change, no order, and no whitespace in each block
|
||||
PRIMARY KEY (`id`),
|
||||
INDEX(`method_id_1`) USING BTREE,
|
||||
INDEX(`method_id_2`) USING BTREE
|
||||
|
|
Loading…
Reference in New Issue