From 9a413273b82eee99227792a55b2ba8524ce3080e Mon Sep 17 00:00:00 2001 From: zy Date: Thu, 18 Aug 2022 03:32:36 +0800 Subject: [PATCH 01/23] fix the bug from different support path --- MethodFunctionRelationExtractor.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/MethodFunctionRelationExtractor.py b/MethodFunctionRelationExtractor.py index ce0d9cc..1337067 100644 --- a/MethodFunctionRelationExtractor.py +++ b/MethodFunctionRelationExtractor.py @@ -1,4 +1,5 @@ import json +import pathlib import re from cmath import isnan from difflib import SequenceMatcher @@ -163,9 +164,22 @@ class CommitLineRelationExtractor(object): b"\r|\n|\r\n", self.repo.object_store[tree_change.new.sha].data ), ) + """ + dulwich's TreeChange's path is always in Linux mode, Windows is not supported + Therefore, we need to localize the filepath + """ + if tree_change.old.path is None: + filepath_old = None + else: + filepath_old = str(pathlib.Path(tree_change.old.path.decode())).encode() - line_relation_df["filepath_old"] = tree_change.old.path - line_relation_df["filepath_new"] = tree_change.new.path + if tree_change.new.path is None: + filepath_new = None + else: + filepath_new = str(pathlib.Path(tree_change.new.path.decode())).encode() + + line_relation_df["filepath_old"] = filepath_old + line_relation_df["filepath_new"] = filepath_new line_relation_df["commit_id_old"] = self.parent_id line_relation_df["commit_id_new"] = self.commit_id return line_relation_df From f758c968a8fa7ea23e7935e8f261bbc551c8c93b Mon Sep 17 00:00:00 2001 From: zy Date: Fri, 19 Aug 2022 01:00:38 +0800 Subject: [PATCH 02/23] fix the repository --- MySQLOperator.py | 34 ++++++++++++++++++++++++++++++++-- README.md | 4 +++- RepoExecutor.py | 7 +++++++ 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/MySQLOperator.py b/MySQLOperator.py index 11f485e..03b0e74 100644 --- a/MySQLOperator.py +++ b/MySQLOperator.py @@ -84,15 +84,45 @@ class MySQLOperator(object): params: - repoInfos: a list of RepoInfo objects """ + self.cursor.execute( + "select max(id) as max_id from repositories" + ) + max_id = self.cursor.fetchone()["max_id"] + if max_id is None: + max_id = 1 for repoInfo in repoInfos: ownername = repoInfo.ownername reponame = repoInfo.reponame self.cursor.execute( - "insert ignore into repositories (ownername, reponame, handled) values (%s, %s, %s)", - (ownername, reponame, 0), + "insert ignore into repositories (id, ownername, reponame, handled) values (%s, %s, %s, %s)", + (max_id, ownername, reponame, 0), ) + max_id = max_id + 1 self.connection.commit() + def update_handled_repository(self, repoInfo: RepoInfo): + """ + Function: insert all the repositories into repositories table + params: + - repoInfos: a list of RepoInfo objects + """ + ownername = repoInfo.ownername + reponame = repoInfo.reponame + self.cursor.execute( + "select handled from `{repo_id}{separator}steps`".format( + repo_id=repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) + ) + handled = self.cursor.fetchall() + handled = [step["handled"] for step in handled] + if 0 not in handled: + self.cursor.execute( + "update repositories set handled = 1 where ownername=%s and reponame=%s", + (ownername, reponame), + ) + self.connection.commit() + def init_steps_table(self, repoInfo: RepoInfo): """ Function: initialize the handled_repositories table diff --git a/README.md b/README.md index e65339b..4d86c75 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,9 @@ This is a project for finding factors related to bad clones. - this project uses [Mysql](https://dev.mysql.com/downloads/) 8.0.30 - copy the configuration template and rename it using command`cp ./config.template.yml ./config.yml` - set the section of the config with the hints in the template - +- Java + - To run the clone detector NIL, [jdk](https://www.oracle.com/java/technologies/downloads/) 1.8+ is needed. + ## run the project 1. Start collecting data for repositories by running the following commands: ``` diff --git a/RepoExecutor.py b/RepoExecutor.py index ca62659..342d82c 100644 --- a/RepoExecutor.py +++ b/RepoExecutor.py @@ -179,5 +179,12 @@ class RepoExecutorThread(threading.Thread): ) ) + # mark the handled repository + mysqlOp.update_handled_repository(repoInfo=repoInfo) + print( + "[Info]: Thread: {thread_name} finish handling the whole repo: {git_url}".format( + thread_name=self.name, git_url=repoInfo.git_url + ) + ) self.q.task_done() print("[Info]: Exist thread: " + self.name) From bb5fb2798724f4b30998fcea59ea04773a1de77d Mon Sep 17 00:00:00 2001 From: zy Date: Fri, 19 Aug 2022 13:08:19 +0800 Subject: [PATCH 03/23] insert id --- FileOperator.py | 4 ++-- MySQLOperator.py | 26 ++++++-------------------- delete_repos.example | 10 +++++----- repos.example | 10 +++++----- 4 files changed, 18 insertions(+), 32 deletions(-) diff --git a/FileOperator.py b/FileOperator.py index 39fad3d..6437e79 100644 --- a/FileOperator.py +++ b/FileOperator.py @@ -18,9 +18,9 @@ class FileOperator(object): with open(self.path, "r") as file: list = file.read().strip().splitlines() for line in list: - ownername, reponame, git_url = line.split(" ") + ownername, reponame, git_url, id = line.split(" ") repoInfo = RepoInfo( - ownername=ownername, reponame=reponame, git_url=git_url + id=id, ownername=ownername, reponame=reponame, git_url=git_url ) result.append(repoInfo) return result diff --git a/MySQLOperator.py b/MySQLOperator.py index 03b0e74..746850c 100644 --- a/MySQLOperator.py +++ b/MySQLOperator.py @@ -1,3 +1,4 @@ +import uuid from typing import List import pymysql @@ -84,20 +85,14 @@ class MySQLOperator(object): params: - repoInfos: a list of RepoInfo objects """ - self.cursor.execute( - "select max(id) as max_id from repositories" - ) - max_id = self.cursor.fetchone()["max_id"] - if max_id is None: - max_id = 1 for repoInfo in repoInfos: + id = repoInfo.id ownername = repoInfo.ownername reponame = repoInfo.reponame self.cursor.execute( "insert ignore into repositories (id, ownername, reponame, handled) values (%s, %s, %s, %s)", - (max_id, ownername, reponame, 0), + (id, ownername, reponame, 0), ) - max_id = max_id + 1 self.connection.commit() def update_handled_repository(self, repoInfo: RepoInfo): @@ -109,19 +104,10 @@ class MySQLOperator(object): ownername = repoInfo.ownername reponame = repoInfo.reponame self.cursor.execute( - "select handled from `{repo_id}{separator}steps`".format( - repo_id=repoInfo.id, - separator=GlobalConstants.SEPARATOR, + "update repositories set handled = 1 where ownername=%s and reponame=%s", + (ownername, reponame), ) - ) - handled = self.cursor.fetchall() - handled = [step["handled"] for step in handled] - if 0 not in handled: - self.cursor.execute( - "update repositories set handled = 1 where ownername=%s and reponame=%s", - (ownername, reponame), - ) - self.connection.commit() + self.connection.commit() def init_steps_table(self, repoInfo: RepoInfo): """ diff --git a/delete_repos.example b/delete_repos.example index 493edf3..ca503a5 100644 --- a/delete_repos.example +++ b/delete_repos.example @@ -1,5 +1,5 @@ -apache ant git@github.com:apache/ant.git -apache dubbo git@github.com:apache/dubbo.git -apache kafka git@github.com:apache/kafka.git -apache maven git@github.com:apache/maven.git -apache rocketmq git@github.com:apache/rocketmq.git +apache ant git@github.com:apache/ant.git 11 +apache dubbo git@github.com:apache/dubbo.git 12 +apache kafka git@github.com:apache/kafka.git 13 +apache maven git@github.com:apache/maven.git 14 +apache rocketmq git@github.com:apache/rocketmq.git 15 diff --git a/repos.example b/repos.example index 493edf3..ca503a5 100644 --- a/repos.example +++ b/repos.example @@ -1,5 +1,5 @@ -apache ant git@github.com:apache/ant.git -apache dubbo git@github.com:apache/dubbo.git -apache kafka git@github.com:apache/kafka.git -apache maven git@github.com:apache/maven.git -apache rocketmq git@github.com:apache/rocketmq.git +apache ant git@github.com:apache/ant.git 11 +apache dubbo git@github.com:apache/dubbo.git 12 +apache kafka git@github.com:apache/kafka.git 13 +apache maven git@github.com:apache/maven.git 14 +apache rocketmq git@github.com:apache/rocketmq.git 15 From dc451c0d4c371720aebe72c86a673c1f3ae56735 Mon Sep 17 00:00:00 2001 From: zy Date: Mon, 22 Aug 2022 02:43:14 +0800 Subject: [PATCH 04/23] finish Riskevalutator and change MYISAM --- MySQLOperator.py | 1 - RiskEvaluator.py | 233 ++++++++++++++++++++ sql_templates/blob_commit_relations.sql | 2 +- sql_templates/blob_methods.sql | 2 +- sql_templates/blobs.sql | 2 +- sql_templates/clone_relations_function.sql | 2 +- sql_templates/commit_relations.sql | 2 +- sql_templates/commits.sql | 2 +- sql_templates/filepaths.sql | 2 +- sql_templates/method_function_relations.sql | 2 +- sql_templates/method_geneology.sql | 2 +- sql_templates/repositories.sql | 2 +- sql_templates/steps.sql | 2 +- 13 files changed, 244 insertions(+), 12 deletions(-) create mode 100644 RiskEvaluator.py diff --git a/MySQLOperator.py b/MySQLOperator.py index 746850c..b2e25d8 100644 --- a/MySQLOperator.py +++ b/MySQLOperator.py @@ -1,4 +1,3 @@ -import uuid from typing import List import pymysql diff --git a/RiskEvaluator.py b/RiskEvaluator.py new file mode 100644 index 0000000..0b9846f --- /dev/null +++ b/RiskEvaluator.py @@ -0,0 +1,233 @@ +from typing import List + +import GlobalConstants +from FileOperator import FileOperator +from models.RepoInfo import RepoInfo +from MySQLOperator import MySQLOperator + +class RiskEvaluator(object): + def __init__( + self, + function_id_1: int, + function_id_2: int, + repoInfo: RepoInfo, + ): + self.function_id_1 = function_id_1 + self.function_id_2 = function_id_2 + self.repoInfo = repoInfo + + def evaluate(self, mysqlOp: MySQLOperator): + """ + Function: evaluate the risk of a clone pair in five steps: + 1.get the change num of clone pair + 2.get the consistent change num of clone pair + 3.get the interval of consistent change of clone class + 4.get the fix message of commit of consistent change + 5.evaluate the risk + """ + method_function_relations = "{id}{separator}method_function_relations".format( + id=self.repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) + clone_relations_function = "{id}{separator}clone_relations_function".format( + id=self.repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) + blob_commit_relations = "{id}{separator}blob_commit_relations".format( + id=self.repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) + blob_methods = "{id}{separator}blob_methods".format( + id=self.repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) + commits = "{id}{separator}commits".format( + id=self.repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) + commit_relations = "{id}{separator}commit_relations".format( + id=self.repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) + + # get the CpI + # Find changed methods in clone pairs + sql_change_1 = """ + select * from `{tablename1}` where method_id_1 in + (select method_id_1 from `{tablename2}` where function_id_1 = {function_id}) or method_id_2 in + (select method_id_2 from `{tablename2}` where function_id_2 = {function_id}) + """.format( + tablename1=method_function_relations, + tablename2=clone_relations_function, + function_id=self.function_id_1, + ) + + mysqlOp.cursor.execute(sql_change_1) + result_changes_1 = mysqlOp.cursor.fetchall() + num_changes_1 = len(result_changes_1) + + sql_change_2 = """ + select * from `{tablename1}` where method_id_1 in + (select method_id_1 from `{tablename2}` where function_id_1 = {function_id}) or method_id_2 in + (select method_id_2 from `{tablename2}` where function_id_2 = {function_id}) + """.format( + tablename1=method_function_relations, + tablename2=clone_relations_function, + function_id=self.function_id_2, + ) + + mysqlOp.cursor.execute(sql_change_2) + result_changes_2 = mysqlOp.cursor.fetchall() + num_changes_2 = len(result_changes_2) + sum_changes = num_changes_1 + num_changes_2 # The happened changes in clone pair + + # get CCR + # Find consistent_changes in all changes + change_pair_id_1 = [] + change_pair_id_2 = [] + if sum_changes <= 1: + CCR = 0 + else: + change_list_1 = [] + change_list_2 = [] + consistent_changes = 0 + for change in result_changes_1: + change_list_1.append(change.get("change")) + for change in result_changes_2: + change_list_2.append(change.get("change")) + for i in range(len(change_list_1)): + for j in range(len(change_list_2)): + if change_list_1[i] == change_list_2[j]: + change_pair_id_1.append(i) + change_pair_id_2.append(j) + consistent_changes = consistent_changes + 2 + CCR = consistent_changes / sum_changes + + # get CCL + # Find Latency in different commits + method_id_list1 = [] + method_id_list2 = [] + target_commits = [] + if CCR == 0: + CCL = 0 + else: + CCL = 0 + for id_1 in change_pair_id_1: + method_id_list1.append(result_changes_1[id_1].get("method_id_2")) + for id_2 in change_pair_id_2: + method_id_list2.append(result_changes_2[id_2].get("method_id_2")) + for i in range(len(change_pair_id_2)): + sql_commit_1 = """ + select commit_id from `{tablename1}` where blob_id = ( + select blob_id from `{tablename2}` where id = {id} + ) + """.format( + tablename1=blob_commit_relations, + tablename2=blob_methods, + id=method_id_list1[i] + ) + mysqlOp.cursor.execute(sql_commit_1) + commit_1 = mysqlOp.cursor.fetchall() + commit_line_1 = [] + for commit in commit_1: + commit_line_1.append(commit.get("commit_id")) + for j in range(len(commit_line_1)): + mysqlOp.cursor.execute( + "select parent_id from `{tablename}` where id = {id}".format( + tablename=commit_relations, + id=commit_line_1[j] + ) + ) + parent_id = mysqlOp.cursor.fetchall()[0].get("parent_id") + if parent_id not in commit_line_1: + commit_target_1 = commit_line_1[j] + break + + sql_commit_2 = """ + select commit_id from `{tablename1}` where blob_id = ( + select blob_id from `{tablename2}` where id = {id} + ) + """.format( + tablename1=blob_commit_relations, + tablename2=blob_methods, + id=method_id_list2[i] + ) + mysqlOp.cursor.execute(sql_commit_2) + commit_2 = mysqlOp.cursor.fetchall() + commit_line_2 = [] + for commit in commit_2: + commit_line_2.append(commit.get("commit_id")) + for j in range(len(commit_line_2)): + mysqlOp.cursor.execute( + "select parent_id from `{tablename}` where id = {id}".format( + tablename=commit_relations, + id=commit_line_2[j] + ) + ) + parent_id = mysqlOp.cursor.fetchall()[0].get("parent_id") + if parent_id not in commit_line_2: + commit_target_2 = commit_line_2[j] + break + + if commit_target_1 == commit_target_2: + CCL = 0 + else: + target_commits.append(commit_1[0].get("commit_id")) + target_commits.append(commit_2[0].get("commit_id")) + CCL = CCL + 1 + + # get bug_fix_num + if CCL == 0: + bug_fix_num = 0 + else: + bug_fix_num = 0 + for commit in target_commits: + sql_message = """ + select message from `{tablename1}` where id = {id} + """.format( + tablename1=commits, + id=commit + ) + mysqlOp.cursor.execute(sql_message) + messages = mysqlOp.cursor.fetchall() + for message in messages: + keywords = message.get("message").decode().replace('\n', ' ').split() + if "fix" in keywords or "Fix" in keywords: + bug_fix_num = bug_fix_num + 1 + + def Harmness_Evaluating( + CpI: float, CCR: float, CCL: float, bug_fix_num: int + ) -> int: + """ + Function : Evaluate the harmness of a clone + input: + - CpI: Changes per clone Instance + - CCR: Consistent Change Ratio + - CCl: Consistent Change Latency + - bug_fix_num: the number of bug_fix commit + output: + - risk_level: + - 1 -> Clone is harmless + - 2 -> Clone is low risky + - 3 -> Clone is medium risky + - 4 -> Clone is high risky + """ + if CpI == 0 | (CpI > 0 & CCR == 0): + risk_level = 0 + else: + if CCL == 0: + risk_level = 1 + else: + if bug_fix_num == 0: + risk_level = 2 + else: + risk_level = 3 + return risk_level + + return Harmness_Evaluating(sum_changes, CCR, CCL, bug_fix_num) + +repoInfos: List[RepoInfo] = FileOperator("repos").load_repos() +mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml") +for repoInfo in repoInfos: + method = RiskEvaluator(8, 10, repoInfo) + print(method.evaluate(mysqlOp)) diff --git a/sql_templates/blob_commit_relations.sql b/sql_templates/blob_commit_relations.sql index 765835d..ff327b1 100644 --- a/sql_templates/blob_commit_relations.sql +++ b/sql_templates/blob_commit_relations.sql @@ -7,4 +7,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( INDEX(`blob_id`) USING BTREE, INDEX(`commit_id`) USING BTREE, INDEX(`filepath_id`) USING BTREE -); +)engine=MyISAM; diff --git a/sql_templates/blob_methods.sql b/sql_templates/blob_methods.sql index 242171f..bae8259 100644 --- a/sql_templates/blob_methods.sql +++ b/sql_templates/blob_methods.sql @@ -10,4 +10,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( INDEX(`start`) USING BTREE, INDEX(`end`) USING BTREE, INDEX(`function_id`) USING BTREE -); +)engine=MyISAM; diff --git a/sql_templates/blobs.sql b/sql_templates/blobs.sql index 2f3a6ea..975d79a 100644 --- a/sql_templates/blobs.sql +++ b/sql_templates/blobs.sql @@ -3,4 +3,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `sha` varbinary(40) NULL, PRIMARY KEY (`id`), UNIQUE INDEX(`sha`) USING HASH -); +)engine=MyISAM; diff --git a/sql_templates/clone_relations_function.sql b/sql_templates/clone_relations_function.sql index 799d7ba..24add42 100644 --- a/sql_templates/clone_relations_function.sql +++ b/sql_templates/clone_relations_function.sql @@ -16,4 +16,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( INDEX(`blob_id_2`) USING BTREE, INDEX(`function_id_2`) USING BTREE, INDEX(`commit_id`) USING BTREE -); +)engine=MyISAM; diff --git a/sql_templates/commit_relations.sql b/sql_templates/commit_relations.sql index 9dd0fa2..58feed0 100644 --- a/sql_templates/commit_relations.sql +++ b/sql_templates/commit_relations.sql @@ -3,4 +3,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `parent_id` int(11) NULL, INDEX(`id`) USING BTREE, INDEX(`parent_id`) USING BTREE -); +)engine=MyISAM; diff --git a/sql_templates/commits.sql b/sql_templates/commits.sql index 8f1af6e..8f1bf7c 100644 --- a/sql_templates/commits.sql +++ b/sql_templates/commits.sql @@ -10,4 +10,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `message` longblob NULL, PRIMARY KEY (`id`), UNIQUE INDEX(`sha`) USING HASH -); +)engine=MyISAM; diff --git a/sql_templates/filepaths.sql b/sql_templates/filepaths.sql index d825f6b..b4c0281 100644 --- a/sql_templates/filepaths.sql +++ b/sql_templates/filepaths.sql @@ -4,4 +4,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `filepath` blob NULL, PRIMARY KEY (`id`), UNIQUE INDEX(`sha`) USING HASH -); +)engine=MyISAM; diff --git a/sql_templates/method_function_relations.sql b/sql_templates/method_function_relations.sql index 5ef7652..56f537e 100644 --- a/sql_templates/method_function_relations.sql +++ b/sql_templates/method_function_relations.sql @@ -6,4 +6,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( PRIMARY KEY (`id`), INDEX(`method_id_1`) USING BTREE, INDEX(`method_id_2`) USING BTREE -); +)engine=MyISAM; diff --git a/sql_templates/method_geneology.sql b/sql_templates/method_geneology.sql index b6aa887..d70dd88 100644 --- a/sql_templates/method_geneology.sql +++ b/sql_templates/method_geneology.sql @@ -15,4 +15,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( INDEX(`commit_sha_new`) USING BTREE, INDEX(`filepath_sha_new`) USING BTREE, INDEX(`method_id_new`) USING BTREE -); +)engine=MyISAM; diff --git a/sql_templates/repositories.sql b/sql_templates/repositories.sql index 82e47ac..7aecd6f 100644 --- a/sql_templates/repositories.sql +++ b/sql_templates/repositories.sql @@ -6,4 +6,4 @@ CREATE TABLE IF NOT EXISTS `repositories` ( PRIMARY KEY (`id`), UNIQUE INDEX(`ownername`, `reponame`) USING HASH, INDEX(`handled`) USING BTREE -); +)engine=MyISAM; diff --git a/sql_templates/steps.sql b/sql_templates/steps.sql index 2d8c24f..7bc3191 100644 --- a/sql_templates/steps.sql +++ b/sql_templates/steps.sql @@ -3,4 +3,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `step_name` varchar(255) NULL, `handled` int(1) NULL, UNIQUE INDEX(`step_id`) USING HASH -); +)engine=MyISAM; From a2521c74ef9bcc7501cf4b5149164e112df55ee5 Mon Sep 17 00:00:00 2001 From: zy Date: Mon, 22 Aug 2022 13:24:13 +0800 Subject: [PATCH 05/23] delete myisam --- sql_templates/blob_commit_relations.sql | 2 +- sql_templates/blob_methods.sql | 2 +- sql_templates/blobs.sql | 2 +- sql_templates/clone_relations_function.sql | 2 +- sql_templates/commit_relations.sql | 2 +- sql_templates/commits.sql | 2 +- sql_templates/filepaths.sql | 2 +- sql_templates/method_function_relations.sql | 2 +- sql_templates/method_geneology.sql | 2 +- sql_templates/repositories.sql | 2 +- sql_templates/steps.sql | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sql_templates/blob_commit_relations.sql b/sql_templates/blob_commit_relations.sql index ff327b1..765835d 100644 --- a/sql_templates/blob_commit_relations.sql +++ b/sql_templates/blob_commit_relations.sql @@ -7,4 +7,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( INDEX(`blob_id`) USING BTREE, INDEX(`commit_id`) USING BTREE, INDEX(`filepath_id`) USING BTREE -)engine=MyISAM; +); diff --git a/sql_templates/blob_methods.sql b/sql_templates/blob_methods.sql index bae8259..242171f 100644 --- a/sql_templates/blob_methods.sql +++ b/sql_templates/blob_methods.sql @@ -10,4 +10,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( INDEX(`start`) USING BTREE, INDEX(`end`) USING BTREE, INDEX(`function_id`) USING BTREE -)engine=MyISAM; +); diff --git a/sql_templates/blobs.sql b/sql_templates/blobs.sql index 975d79a..2f3a6ea 100644 --- a/sql_templates/blobs.sql +++ b/sql_templates/blobs.sql @@ -3,4 +3,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `sha` varbinary(40) NULL, PRIMARY KEY (`id`), UNIQUE INDEX(`sha`) USING HASH -)engine=MyISAM; +); diff --git a/sql_templates/clone_relations_function.sql b/sql_templates/clone_relations_function.sql index 24add42..799d7ba 100644 --- a/sql_templates/clone_relations_function.sql +++ b/sql_templates/clone_relations_function.sql @@ -16,4 +16,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( INDEX(`blob_id_2`) USING BTREE, INDEX(`function_id_2`) USING BTREE, INDEX(`commit_id`) USING BTREE -)engine=MyISAM; +); diff --git a/sql_templates/commit_relations.sql b/sql_templates/commit_relations.sql index 58feed0..9dd0fa2 100644 --- a/sql_templates/commit_relations.sql +++ b/sql_templates/commit_relations.sql @@ -3,4 +3,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `parent_id` int(11) NULL, INDEX(`id`) USING BTREE, INDEX(`parent_id`) USING BTREE -)engine=MyISAM; +); diff --git a/sql_templates/commits.sql b/sql_templates/commits.sql index 8f1bf7c..8f1af6e 100644 --- a/sql_templates/commits.sql +++ b/sql_templates/commits.sql @@ -10,4 +10,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `message` longblob NULL, PRIMARY KEY (`id`), UNIQUE INDEX(`sha`) USING HASH -)engine=MyISAM; +); diff --git a/sql_templates/filepaths.sql b/sql_templates/filepaths.sql index b4c0281..d825f6b 100644 --- a/sql_templates/filepaths.sql +++ b/sql_templates/filepaths.sql @@ -4,4 +4,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `filepath` blob NULL, PRIMARY KEY (`id`), UNIQUE INDEX(`sha`) USING HASH -)engine=MyISAM; +); diff --git a/sql_templates/method_function_relations.sql b/sql_templates/method_function_relations.sql index 56f537e..5ef7652 100644 --- a/sql_templates/method_function_relations.sql +++ b/sql_templates/method_function_relations.sql @@ -6,4 +6,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( PRIMARY KEY (`id`), INDEX(`method_id_1`) USING BTREE, INDEX(`method_id_2`) USING BTREE -)engine=MyISAM; +); diff --git a/sql_templates/method_geneology.sql b/sql_templates/method_geneology.sql index d70dd88..b6aa887 100644 --- a/sql_templates/method_geneology.sql +++ b/sql_templates/method_geneology.sql @@ -15,4 +15,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( INDEX(`commit_sha_new`) USING BTREE, INDEX(`filepath_sha_new`) USING BTREE, INDEX(`method_id_new`) USING BTREE -)engine=MyISAM; +); diff --git a/sql_templates/repositories.sql b/sql_templates/repositories.sql index 7aecd6f..82e47ac 100644 --- a/sql_templates/repositories.sql +++ b/sql_templates/repositories.sql @@ -6,4 +6,4 @@ CREATE TABLE IF NOT EXISTS `repositories` ( PRIMARY KEY (`id`), UNIQUE INDEX(`ownername`, `reponame`) USING HASH, INDEX(`handled`) USING BTREE -)engine=MyISAM; +); diff --git a/sql_templates/steps.sql b/sql_templates/steps.sql index 7bc3191..2d8c24f 100644 --- a/sql_templates/steps.sql +++ b/sql_templates/steps.sql @@ -3,4 +3,4 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `step_name` varchar(255) NULL, `handled` int(1) NULL, UNIQUE INDEX(`step_id`) USING HASH -)engine=MyISAM; +); From 84db793d1ab343ddd1a297e1c803941b6388f8ea Mon Sep 17 00:00:00 2001 From: zy Date: Tue, 23 Aug 2022 10:26:07 +0800 Subject: [PATCH 06/23] fix riskevaluator --- RiskEvaluator.py | 218 ++++++++++++++++----------------- sql_templates/repositories.sql | 2 +- 2 files changed, 104 insertions(+), 116 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 0b9846f..3273dce 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -1,3 +1,4 @@ +from collections import Counter from typing import List import GlobalConstants @@ -5,12 +6,13 @@ from FileOperator import FileOperator from models.RepoInfo import RepoInfo from MySQLOperator import MySQLOperator + class RiskEvaluator(object): def __init__( - self, - function_id_1: int, - function_id_2: int, - repoInfo: RepoInfo, + self, + function_id_1: int, + function_id_2: int, + repoInfo: RepoInfo, ): self.function_id_1 = function_id_1 self.function_id_2 = function_id_2 @@ -18,26 +20,18 @@ class RiskEvaluator(object): def evaluate(self, mysqlOp: MySQLOperator): """ - Function: evaluate the risk of a clone pair in five steps: - 1.get the change num of clone pair - 2.get the consistent change num of clone pair - 3.get the interval of consistent change of clone class - 4.get the fix message of commit of consistent change - 5.evaluate the risk + Function: evaluate the risk of a clone pair in five steps: + 1.get the change num of clone pair + 2.get the consistent change num of clone pair + 3.get the interval of consistent change of clone class + 4.get the fix message of commit of consistent change + 5.evaluate the risk """ method_function_relations = "{id}{separator}method_function_relations".format( id=self.repoInfo.id, separator=GlobalConstants.SEPARATOR, ) clone_relations_function = "{id}{separator}clone_relations_function".format( - id=self.repoInfo.id, - separator=GlobalConstants.SEPARATOR, - ) - blob_commit_relations = "{id}{separator}blob_commit_relations".format( - id=self.repoInfo.id, - separator=GlobalConstants.SEPARATOR, - ) - blob_methods = "{id}{separator}blob_methods".format( id=self.repoInfo.id, separator=GlobalConstants.SEPARATOR, ) @@ -45,48 +39,82 @@ class RiskEvaluator(object): id=self.repoInfo.id, separator=GlobalConstants.SEPARATOR, ) - commit_relations = "{id}{separator}commit_relations".format( - id=self.repoInfo.id, - separator=GlobalConstants.SEPARATOR, + + # Fix: only consider the life time of clone pair(clone pair existing commits) + method_ids_1 = [] + method_ids_2 = [] + clone_methods_1 = [] + clone_methods_2 = [] + clone_pairs = [] + sql_clone_pairs_1 = """ + select method_id_1, method_id_2, commit_id from `{tablename}` + where function_id_1 = {function_id_1} and function_id_2 = {function_id_2} + """.format( + tablename=clone_relations_function, + function_id_1=self.function_id_1, + function_id_2=self.function_id_2, ) + mysqlOp.cursor.execute(sql_clone_pairs_1) + clone_pairs_1 = mysqlOp.cursor.fetchall() + for clone_pair in clone_pairs_1: + method_ids_1.append(clone_pair.get("method_id_1")) + method_ids_2.append(clone_pair.get("method_id_2")) + clone_pairs.extend(clone_pairs_1) + + sql_clone_pairs_2 = """ + select method_id_1, method_id_2, commit_id from `{tablename}` + where function_id_1 = {function_id_2} and function_id_2 = {function_id_1} + """.format( + tablename=clone_relations_function, + function_id_1=self.function_id_1, + function_id_2=self.function_id_2, + ) + mysqlOp.cursor.execute(sql_clone_pairs_2) + clone_pairs_2 = mysqlOp.cursor.fetchall() + for clone_pair in clone_pairs_2: + method_ids_2.append(clone_pair.get("method_id_1")) + method_ids_1.append(clone_pair.get("method_id_2")) + clone_pairs.extend(clone_pairs_2) + + for method_id, count in Counter(method_ids_1).items(): + clone_methods_1.append(method_id) + for method_id, count in Counter(method_ids_2).items(): + clone_methods_2.append(method_id) # get the CpI # Find changed methods in clone pairs - sql_change_1 = """ - select * from `{tablename1}` where method_id_1 in - (select method_id_1 from `{tablename2}` where function_id_1 = {function_id}) or method_id_2 in - (select method_id_2 from `{tablename2}` where function_id_2 = {function_id}) - """.format( - tablename1=method_function_relations, - tablename2=clone_relations_function, - function_id=self.function_id_1, - ) + result_changes_1 = [] + result_changes_2 = [] + for method_id in clone_methods_1: + sql_change = """ + select * from `{tablename1}` where method_id_1 = {method_id} + """.format( + tablename1=method_function_relations, + method_id=method_id, + ) - mysqlOp.cursor.execute(sql_change_1) - result_changes_1 = mysqlOp.cursor.fetchall() - num_changes_1 = len(result_changes_1) + mysqlOp.cursor.execute(sql_change) + result_changes_1.extend(mysqlOp.cursor.fetchall()) + for method_id in clone_methods_2: + sql_change = """ + select * from `{tablename1}` where method_id_1 = {method_id} + """.format( + tablename1=method_function_relations, + method_id=method_id, + ) - sql_change_2 = """ - select * from `{tablename1}` where method_id_1 in - (select method_id_1 from `{tablename2}` where function_id_1 = {function_id}) or method_id_2 in - (select method_id_2 from `{tablename2}` where function_id_2 = {function_id}) - """.format( - tablename1=method_function_relations, - tablename2=clone_relations_function, - function_id=self.function_id_2, - ) + mysqlOp.cursor.execute(sql_change) + result_changes_2.extend(mysqlOp.cursor.fetchall()) - mysqlOp.cursor.execute(sql_change_2) - result_changes_2 = mysqlOp.cursor.fetchall() - num_changes_2 = len(result_changes_2) - sum_changes = num_changes_1 + num_changes_2 # The happened changes in clone pair + sum_changes = len(result_changes_1) + len(result_changes_2) + print(sum_changes) # get CCR # Find consistent_changes in all changes change_pair_id_1 = [] change_pair_id_2 = [] if sum_changes <= 1: - CCR = 0 + consistent_changes = 0 else: change_list_1 = [] change_list_2 = [] @@ -101,79 +129,37 @@ class RiskEvaluator(object): change_pair_id_1.append(i) change_pair_id_2.append(j) consistent_changes = consistent_changes + 2 - CCR = consistent_changes / sum_changes # get CCL # Find Latency in different commits - method_id_list1 = [] - method_id_list2 = [] + changed_method_list1 = [] + changed_method_list2 = [] target_commits = [] - if CCR == 0: + if consistent_changes == 0: CCL = 0 else: CCL = 0 - for id_1 in change_pair_id_1: - method_id_list1.append(result_changes_1[id_1].get("method_id_2")) - for id_2 in change_pair_id_2: - method_id_list2.append(result_changes_2[id_2].get("method_id_2")) - for i in range(len(change_pair_id_2)): - sql_commit_1 = """ - select commit_id from `{tablename1}` where blob_id = ( - select blob_id from `{tablename2}` where id = {id} - ) - """.format( - tablename1=blob_commit_relations, - tablename2=blob_methods, - id=method_id_list1[i] - ) - mysqlOp.cursor.execute(sql_commit_1) - commit_1 = mysqlOp.cursor.fetchall() - commit_line_1 = [] - for commit in commit_1: - commit_line_1.append(commit.get("commit_id")) - for j in range(len(commit_line_1)): - mysqlOp.cursor.execute( - "select parent_id from `{tablename}` where id = {id}".format( - tablename=commit_relations, - id=commit_line_1[j] - ) - ) - parent_id = mysqlOp.cursor.fetchall()[0].get("parent_id") - if parent_id not in commit_line_1: - commit_target_1 = commit_line_1[j] - break - - sql_commit_2 = """ - select commit_id from `{tablename1}` where blob_id = ( - select blob_id from `{tablename2}` where id = {id} - ) - """.format( - tablename1=blob_commit_relations, - tablename2=blob_methods, - id=method_id_list2[i] - ) - mysqlOp.cursor.execute(sql_commit_2) - commit_2 = mysqlOp.cursor.fetchall() - commit_line_2 = [] - for commit in commit_2: - commit_line_2.append(commit.get("commit_id")) - for j in range(len(commit_line_2)): - mysqlOp.cursor.execute( - "select parent_id from `{tablename}` where id = {id}".format( - tablename=commit_relations, - id=commit_line_2[j] - ) - ) - parent_id = mysqlOp.cursor.fetchall()[0].get("parent_id") - if parent_id not in commit_line_2: - commit_target_2 = commit_line_2[j] - break - + commits_1 = [] + commits_2 = [] + for id in change_pair_id_1: + changed_method_list1.append(result_changes_1[id].get("method_id_2")) + for id in change_pair_id_2: + changed_method_list2.append(result_changes_2[id].get("method_id_2")) + for i in range(len(changed_method_list1)): + changed_method_id_1 = changed_method_list1[i] + changed_method_id_2 = changed_method_list2[i] + for clone_pair in clone_pairs: + if clone_pair.get("method_id_1") == changed_method_id_1: + commits_1.append(clone_pair.get("commit_id")) + if clone_pair.get("method_id_2") == changed_method_id_2: + commits_2.append(clone_pair.get("commit_id")) + commit_target_1 = min(commits_1) + commit_target_2 = min(commits_2) if commit_target_1 == commit_target_2: CCL = 0 else: - target_commits.append(commit_1[0].get("commit_id")) - target_commits.append(commit_2[0].get("commit_id")) + target_commits.append(commit_target_1) + target_commits.append(commit_target_2) CCL = CCL + 1 # get bug_fix_num @@ -185,18 +171,19 @@ class RiskEvaluator(object): sql_message = """ select message from `{tablename1}` where id = {id} """.format( - tablename1=commits, - id=commit + tablename1=commits, id=commit ) mysqlOp.cursor.execute(sql_message) messages = mysqlOp.cursor.fetchall() for message in messages: - keywords = message.get("message").decode().replace('\n', ' ').split() + keywords = ( + message.get("message").decode().replace("\n", " ").split() + ) if "fix" in keywords or "Fix" in keywords: bug_fix_num = bug_fix_num + 1 def Harmness_Evaluating( - CpI: float, CCR: float, CCL: float, bug_fix_num: int + CpI: float, CCR: float, CCL: float, bug_fix_num: int ) -> int: """ Function : Evaluate the harmness of a clone @@ -224,7 +211,8 @@ class RiskEvaluator(object): risk_level = 3 return risk_level - return Harmness_Evaluating(sum_changes, CCR, CCL, bug_fix_num) + return Harmness_Evaluating(sum_changes, consistent_changes, CCL, bug_fix_num) + repoInfos: List[RepoInfo] = FileOperator("repos").load_repos() mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml") diff --git a/sql_templates/repositories.sql b/sql_templates/repositories.sql index 7aecd6f..82e47ac 100644 --- a/sql_templates/repositories.sql +++ b/sql_templates/repositories.sql @@ -6,4 +6,4 @@ CREATE TABLE IF NOT EXISTS `repositories` ( PRIMARY KEY (`id`), UNIQUE INDEX(`ownername`, `reponame`) USING HASH, INDEX(`handled`) USING BTREE -)engine=MyISAM; +); From a07b330531ce9f5436572c8a3261d10c7aafb27c Mon Sep 17 00:00:00 2001 From: zy Date: Tue, 23 Aug 2022 10:28:51 +0800 Subject: [PATCH 07/23] fix riskevaluator --- RiskEvaluator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 3273dce..15e314a 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -182,9 +182,7 @@ class RiskEvaluator(object): if "fix" in keywords or "Fix" in keywords: bug_fix_num = bug_fix_num + 1 - def Harmness_Evaluating( - CpI: float, CCR: float, CCL: float, bug_fix_num: int - ) -> int: + def Harmness_Evaluating(CpI: int, CCR: int, CCL: int, bug_fix_num: int) -> int: """ Function : Evaluate the harmness of a clone input: From b74212f5aa747f6b65db6696a1a5b7f2ad1109ef Mon Sep 17 00:00:00 2001 From: zy Date: Tue, 23 Aug 2022 10:31:44 +0800 Subject: [PATCH 08/23] fix riskevaluator --- RiskEvaluator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 15e314a..56d8a4e 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -215,5 +215,7 @@ class RiskEvaluator(object): repoInfos: List[RepoInfo] = FileOperator("repos").load_repos() mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml") for repoInfo in repoInfos: - method = RiskEvaluator(8, 10, repoInfo) - print(method.evaluate(mysqlOp)) + clone_pair = RiskEvaluator( + 8, 10, repoInfo + ) # Once evaluate the risk of one clone pair(function_id) + print(clone_pair.evaluate(mysqlOp)) From 2f2d1d110667ab16bb98b9c3a8e04dfa36f46948 Mon Sep 17 00:00:00 2001 From: zy Date: Wed, 24 Aug 2022 09:17:29 +0800 Subject: [PATCH 09/23] a bug happened with dulwich itself --- MethodFunctionRelationExtractor.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/MethodFunctionRelationExtractor.py b/MethodFunctionRelationExtractor.py index 8c5f4e8..fd545c5 100644 --- a/MethodFunctionRelationExtractor.py +++ b/MethodFunctionRelationExtractor.py @@ -278,12 +278,28 @@ def extract_method_function_relation( if row["filepath_old"] is None: filepath_id_old = np.nan else: - filepath_id_old = filepath_id_dict[row["filepath_old"]] + if row["filepath_old"] not in filepath_id_dict: + """ + Some filepaths gotten by dulwich are different with the real filepaths + in the mysql database and the key names in filepath_id_dict. When this bug + happened we set filepath_id_old = 0. + """ + filepath_id_old = 0 + else: + filepath_id_old = filepath_id_dict[row["filepath_old"]] if row["filepath_new"] is None: filepath_id_new = np.nan else: - filepath_id_new = filepath_id_dict[row["filepath_new"]] + if row["filepath_new"] not in filepath_id_dict: + """ + Some filepaths gotten by dulwich are different with the real filepaths + in the mysql database and the key names in filepath_id_dict. When this bug + happened we set filepath_id_old = 0. + """ + filepath_id_new = 0 + else: + filepath_id_new = filepath_id_dict[row["filepath_new"]] commit_id_old = row["commit_id_old"] commit_id_new = row["commit_id_new"] From d69c5381c4c3ce4f2afada157128363472357b06 Mon Sep 17 00:00:00 2001 From: zy Date: Wed, 24 Aug 2022 09:26:35 +0800 Subject: [PATCH 10/23] a bug from dulwich --- MethodFunctionRelationExtractor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MethodFunctionRelationExtractor.py b/MethodFunctionRelationExtractor.py index fd545c5..6e5e2e2 100644 --- a/MethodFunctionRelationExtractor.py +++ b/MethodFunctionRelationExtractor.py @@ -296,6 +296,9 @@ def extract_method_function_relation( Some filepaths gotten by dulwich are different with the real filepaths in the mysql database and the key names in filepath_id_dict. When this bug happened we set filepath_id_old = 0. + Example: When deal with the repository git@github.com:apache/iotdb.git, a filepath in + filepath_id_dict is 'iotdb\\metrics\\interface\\src\\main\\java\\org\\apache\\iotdb\\metrics\\DoNothingMetricService.java' + while the filepath obtained by dulwich will ignore "iotdb\\" """ filepath_id_new = 0 else: From fe0427e2bf73cfd95107aed99cfe6a186731fb47 Mon Sep 17 00:00:00 2001 From: zy Date: Wed, 24 Aug 2022 10:04:15 +0800 Subject: [PATCH 11/23] a bug from dulwich --- MethodFunctionRelationExtractor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MethodFunctionRelationExtractor.py b/MethodFunctionRelationExtractor.py index 6e5e2e2..a111d73 100644 --- a/MethodFunctionRelationExtractor.py +++ b/MethodFunctionRelationExtractor.py @@ -282,9 +282,9 @@ def extract_method_function_relation( """ Some filepaths gotten by dulwich are different with the real filepaths in the mysql database and the key names in filepath_id_dict. When this bug - happened we set filepath_id_old = 0. + happened we set filepath_id_old = -1. """ - filepath_id_old = 0 + filepath_id_old = -1 else: filepath_id_old = filepath_id_dict[row["filepath_old"]] @@ -295,12 +295,12 @@ def extract_method_function_relation( """ Some filepaths gotten by dulwich are different with the real filepaths in the mysql database and the key names in filepath_id_dict. When this bug - happened we set filepath_id_old = 0. + happened we set filepath_id_old = -1. Example: When deal with the repository git@github.com:apache/iotdb.git, a filepath in filepath_id_dict is 'iotdb\\metrics\\interface\\src\\main\\java\\org\\apache\\iotdb\\metrics\\DoNothingMetricService.java' while the filepath obtained by dulwich will ignore "iotdb\\" """ - filepath_id_new = 0 + filepath_id_new = -1 else: filepath_id_new = filepath_id_dict[row["filepath_new"]] From 03dd8c12cd7bd7960d982a8a64cc1744fbaa0219 Mon Sep 17 00:00:00 2001 From: zy Date: Wed, 24 Aug 2022 22:28:56 +0800 Subject: [PATCH 12/23] fix the get changed --- RiskEvaluator.py | 144 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 98 insertions(+), 46 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 56d8a4e..d4007b9 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -39,42 +39,40 @@ class RiskEvaluator(object): id=self.repoInfo.id, separator=GlobalConstants.SEPARATOR, ) - + blob_methods = "{id}{separator}blob_methods".format( + id=self.repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) + blob_commit_relations = "{id}{separator}blob_commit_relations".format( + id=self.repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) # Fix: only consider the life time of clone pair(clone pair existing commits) method_ids_1 = [] method_ids_2 = [] clone_methods_1 = [] clone_methods_2 = [] - clone_pairs = [] - sql_clone_pairs_1 = """ - select method_id_1, method_id_2, commit_id from `{tablename}` - where function_id_1 = {function_id_1} and function_id_2 = {function_id_2} + relate_commits = [] + sql_clone_pairs = """ + select method_id_1,function_id_1,method_id_2,function_id_2,commit_id from `{tablename}` + where function_id_1 = {function_id_1} and function_id_2 = {function_id_2} or + function_id_1 = {function_id_2} and function_id_2 = {function_id_1} """.format( tablename=clone_relations_function, function_id_1=self.function_id_1, function_id_2=self.function_id_2, ) - mysqlOp.cursor.execute(sql_clone_pairs_1) - clone_pairs_1 = mysqlOp.cursor.fetchall() - for clone_pair in clone_pairs_1: - method_ids_1.append(clone_pair.get("method_id_1")) - method_ids_2.append(clone_pair.get("method_id_2")) - clone_pairs.extend(clone_pairs_1) - - sql_clone_pairs_2 = """ - select method_id_1, method_id_2, commit_id from `{tablename}` - where function_id_1 = {function_id_2} and function_id_2 = {function_id_1} - """.format( - tablename=clone_relations_function, - function_id_1=self.function_id_1, - function_id_2=self.function_id_2, - ) - mysqlOp.cursor.execute(sql_clone_pairs_2) - clone_pairs_2 = mysqlOp.cursor.fetchall() - for clone_pair in clone_pairs_2: - method_ids_2.append(clone_pair.get("method_id_1")) - method_ids_1.append(clone_pair.get("method_id_2")) - clone_pairs.extend(clone_pairs_2) + mysqlOp.cursor.execute(sql_clone_pairs) + clone_pairs = mysqlOp.cursor.fetchall() + for clone_pair in clone_pairs: + if clone_pair.get("function_id_1") == self.function_id_1: + method_ids_1.append(clone_pair.get("method_id_1")) + method_ids_2.append(clone_pair.get("method_id_2")) + relate_commits.append(clone_pair.get("commit_id")) + else: + method_ids_1.append(clone_pair.get("method_id_2")) + method_ids_2.append(clone_pair.get("method_id_1")) + relate_commits.append(clone_pair.get("commit_id")) for method_id, count in Counter(method_ids_1).items(): clone_methods_1.append(method_id) @@ -83,28 +81,82 @@ class RiskEvaluator(object): # get the CpI # Find changed methods in clone pairs - result_changes_1 = [] - result_changes_2 = [] - for method_id in clone_methods_1: - sql_change = """ - select * from `{tablename1}` where method_id_1 = {method_id} - """.format( - tablename1=method_function_relations, - method_id=method_id, + def method_change(clone_methods, relate_commits, function_id): + result_changes = [] # 最终涉及到的所有的变化 + out_clone_methods = [] + all_method_id = [] + mysqlOp.cursor.execute( + "select id from `{tablename1}` where function_id = {function_id}".format( + tablename1=blob_methods, function_id=function_id + ) ) + all_methods = mysqlOp.cursor.fetchall() + for method in all_methods: # 该function下的所有method_id + all_method_id.append(method.get("id")) + for method_id in clone_methods: # 找到所有与克隆相关的change + sql_change = """ + select * from `{tablename1}` where method_id_1 = {method_id} + """.format( + tablename1=method_function_relations, + method_id=method_id, + ) + mysqlOp.cursor.execute(sql_change) + result_changes.extend(mysqlOp.cursor.fetchall()) + for change in result_changes: # 挑出发生变化后不属于克隆对但还是属于该function_id下的method_id + if ( + change.get("method_id_2") in all_method_id + and change.get("method_id_2") not in clone_methods + ): + out_clone_methods.append(change.get("method_id_2")) - mysqlOp.cursor.execute(sql_change) - result_changes_1.extend(mysqlOp.cursor.fetchall()) - for method_id in clone_methods_2: - sql_change = """ - select * from `{tablename1}` where method_id_1 = {method_id} - """.format( - tablename1=method_function_relations, - method_id=method_id, - ) + def out_clone_change(out_clone_methods): + next_methods = [] + for method_id in out_clone_methods: + check_commit = """ + select commit_id from `{tablename1}` where blob_id = ( + select blob_id from `{tablename2}` where id = {id} + ) + """.format( + tablename1=blob_commit_relations, + tablename2=blob_methods, + id=method_id, + ) + mysqlOp.cursor.execute(check_commit) + commit_id = mysqlOp.cursor.fetchone()[0] + if commit_id in relate_commits: # 检查这个method所在的commit是否属于克隆的生命周期内 + mysqlOp.cursor.execute( + "select * from `{tablename1}` where method_id_1 = {method_id}".format( + tablename1=method_function_relations, + method_id=method_id, + ) + ) + results = mysqlOp.cursor.fetchall() + if len(results) == 0: # 未产生变化就返回空值 + return [] + else: + for ( + result + ) in results: # 挑出发生变化后不属于克隆对但还是属于该function_id下的method_id + if ( + result.get("method_id_2") in all_method_id + and result.get("method_id_2") not in clone_methods + ): + next_methods.append(result.get("method_id_2")) + results.extend(out_clone_change(next_methods)) + return results + else: + return [] - mysqlOp.cursor.execute(sql_change) - result_changes_2.extend(mysqlOp.cursor.fetchall()) + if len(out_clone_methods) != 0: + result_changes.extend(out_clone_change(out_clone_methods)) + return result_changes + + result_changes_1 = method_change( + clone_methods_1, relate_commits, self.function_id_1 + ) + result_changes_2 = method_change( + clone_methods_2, relate_commits, self.function_id_2 + ) sum_changes = len(result_changes_1) + len(result_changes_2) print(sum_changes) @@ -216,6 +268,6 @@ repoInfos: List[RepoInfo] = FileOperator("repos").load_repos() mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml") for repoInfo in repoInfos: clone_pair = RiskEvaluator( - 8, 10, repoInfo + 10, 8, repoInfo ) # Once evaluate the risk of one clone pair(function_id) print(clone_pair.evaluate(mysqlOp)) From 78da04a2ae301b85c8e44901dfaae28a9450d131 Mon Sep 17 00:00:00 2001 From: zy Date: Sat, 27 Aug 2022 22:10:44 +0800 Subject: [PATCH 13/23] fix some problems --- GitOperator.py | 73 ++++++--- MethodFunctionRelationExtractor.py | 16 +- RiskEvaluator.py | 249 +++++++++++++++++------------ 3 files changed, 212 insertions(+), 126 deletions(-) diff --git a/GitOperator.py b/GitOperator.py index c6347e1..68d56e3 100644 --- a/GitOperator.py +++ b/GitOperator.py @@ -1,4 +1,5 @@ import os +import pathlib import subprocess from typing import List @@ -39,6 +40,48 @@ class GitOperator(object): - """ + # def find_blobs_in_tree( + # repo: Repo, commit: Commit, tree: Tree, relpath: bytes = b"" + # ) -> List[BlobInfo]: + # """ + # Function: iterately find all the blobs for the target commit + # params: + # - commit: the target commit + # - tree: current Tree object in iteration + # - relpath: the relative path before this iteration + # return: + # - a list of BlobInfo objects regarding this commit + # """ + # result = [] + # for entry in tree.items(): + # if (not repo.object_store.contains_loose(entry.sha)) and ( + # not repo.object_store.contains_packed(entry.sha) + # ): + # # the object cannot be found in the repo + # return result + # obj = repo.object_store[entry.sha] + # new_relpath = os.path.join(relpath, entry.path) + # if obj.type_name == b"blob": + # result.append( + # BlobInfo( + # repo=repo, commit=commit, filepath=new_relpath, blob=obj + # ) + # ) + # elif obj.type_name == b"tree": + # new_tree = obj + # result.extend( + # find_blobs_in_tree( + # repo=repo, + # commit=commit, + # tree=new_tree, + # relpath=new_relpath, + # ) + # ) + # else: + # # there is something wrong with this tree object + # return result + # return result + def find_blobs_in_tree( repo: Repo, commit: Commit, tree: Tree, relpath: bytes = b"" ) -> List[BlobInfo]: @@ -52,33 +95,19 @@ class GitOperator(object): - a list of BlobInfo objects regarding this commit """ result = [] - for entry in tree.items(): + for entry in Repo( + self.repoInfo.bare_repo_path + ).object_store.iter_tree_contents(commit.tree): if (not repo.object_store.contains_loose(entry.sha)) and ( not repo.object_store.contains_packed(entry.sha) ): # the object cannot be found in the repo - return result + continue obj = repo.object_store[entry.sha] - new_relpath = os.path.join(relpath, entry.path) - if obj.type_name == b"blob": - result.append( - BlobInfo( - repo=repo, commit=commit, filepath=new_relpath, blob=obj - ) - ) - elif obj.type_name == b"tree": - new_tree = obj - result.extend( - find_blobs_in_tree( - repo=repo, - commit=commit, - tree=new_tree, - relpath=new_relpath, - ) - ) - else: - # there is something wrong with this tree object - return result + new_relpath = str(pathlib.Path(entry.path.decode())).encode() + result.append( + BlobInfo(repo=repo, commit=commit, filepath=new_relpath, blob=obj) + ) return result blobInfos = find_blobs_in_tree( diff --git a/MethodFunctionRelationExtractor.py b/MethodFunctionRelationExtractor.py index a111d73..4618cf2 100644 --- a/MethodFunctionRelationExtractor.py +++ b/MethodFunctionRelationExtractor.py @@ -313,7 +313,9 @@ def extract_method_function_relation( if np.isnan(line_old): method_id_old = np.nan else: - if ( + if filepath_id_old == -1: + method_id_old = -1 + elif ( commit_id_old not in commit_filepath_lineno_method_id_dict or filepath_id_old not in commit_filepath_lineno_method_id_dict[commit_id_old] @@ -331,7 +333,9 @@ def extract_method_function_relation( if np.isnan(line_new): method_id_new = np.nan else: - if ( + if filepath_id_new == -1: + method_id_new = -1 + elif ( commit_id_new not in commit_filepath_lineno_method_id_dict or filepath_id_new not in commit_filepath_lineno_method_id_dict[commit_id_new] @@ -457,6 +461,14 @@ def extract_method_function_relation( ), (method_id_1, method_id_2, change_content), ) + # delete the commit where method_id_new= -1 or method_id_old = -1 + mysqlOp.cursor.execute( + "delete from `{method_function_relation_tablename}` where method_id_1 = -1 or method_id_2 = -1".format( + method_function_relation_tablename=mysqlOp.tablename_dict[ + "method_function_relations" + ] + ) + ) mysqlOp.connection.commit() for commit_sha, commit_id in commit_sha_id_dict.items(): diff --git a/RiskEvaluator.py b/RiskEvaluator.py index d4007b9..58c4bf9 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -47,16 +47,16 @@ class RiskEvaluator(object): id=self.repoInfo.id, separator=GlobalConstants.SEPARATOR, ) + commit_relations = "{id}{separator}commit_relations".format( + id=self.repoInfo.id, + separator=GlobalConstants.SEPARATOR, + ) # Fix: only consider the life time of clone pair(clone pair existing commits) - method_ids_1 = [] - method_ids_2 = [] - clone_methods_1 = [] - clone_methods_2 = [] relate_commits = [] sql_clone_pairs = """ select method_id_1,function_id_1,method_id_2,function_id_2,commit_id from `{tablename}` - where function_id_1 = {function_id_1} and function_id_2 = {function_id_2} or - function_id_1 = {function_id_2} and function_id_2 = {function_id_1} + where (function_id_1 = {function_id_1} and function_id_2 = {function_id_2}) or + (function_id_1 = {function_id_2} and function_id_2 = {function_id_1}) """.format( tablename=clone_relations_function, function_id_1=self.function_id_1, @@ -65,98 +65,150 @@ class RiskEvaluator(object): mysqlOp.cursor.execute(sql_clone_pairs) clone_pairs = mysqlOp.cursor.fetchall() for clone_pair in clone_pairs: - if clone_pair.get("function_id_1") == self.function_id_1: - method_ids_1.append(clone_pair.get("method_id_1")) - method_ids_2.append(clone_pair.get("method_id_2")) - relate_commits.append(clone_pair.get("commit_id")) + relate_commits.append(clone_pair.get("commit_id")) + # get the start and end commmits of clone pair + start_commits = [] + end_commits = [] + for commit_id in relate_commits: + sql_parent = """ + select parent_id from `{tablename}` where id={id} + """.format( + tablename=commit_relations, id=commit_id + ) + mysqlOp.cursor.execute(sql_parent) + parent_id = mysqlOp.cursor.fetchone() + sql_son = """ + select id from `{tablename}` where parent_id={id} + """.format( + tablename=commit_relations, id=commit_id + ) + mysqlOp.cursor.execute(sql_son) + son_id = mysqlOp.cursor.fetchone() + if parent_id is None: + start_commits.append(commit_id) + elif son_id is None: + end_commits.append(commit_id) + elif parent_id.get("parent_id") not in relate_commits: + start_commits.append(commit_id) + elif son_id.get("id") not in relate_commits: + end_commits.append(commit_id) else: - method_ids_1.append(clone_pair.get("method_id_2")) - method_ids_2.append(clone_pair.get("method_id_1")) - relate_commits.append(clone_pair.get("commit_id")) + continue + # get the clone pair evolution + clone_evolution = [] + clone_pair_related_commits = [] + start = [] + start.extend(start_commits) - for method_id, count in Counter(method_ids_1).items(): - clone_methods_1.append(method_id) - for method_id, count in Counter(method_ids_2).items(): - clone_methods_2.append(method_id) + def get_evolution_of_clone_pair(start_commits: List, end_commits: List): + clone_evolution = [] + clone_pair_related_commits.extend(start_commits) + for commit in start_commits: + mysqlOp.cursor.execute( + "select id from `{tablename}` where parent_id = {id}".format( + tablename=commit_relations, id=commit + ) + ) + result_commits = mysqlOp.cursor.fetchall() + for result in result_commits: + clone_evolution.append((commit, result.get("id"))) + if result.get("id") in clone_pair_related_commits: + continue + elif result.get("id") in end_commits: + end_commits.remove(result.get("id")) + clone_pair_related_commits.append(result.get("id")) + else: + start_commits.append(result.get("id")) + start_commits.remove(commit) + if len(end_commits) != 0: + clone_evolution.extend( + get_evolution_of_clone_pair(start_commits, end_commits) + ) + return clone_evolution + + clone_evolution = get_evolution_of_clone_pair(start_commits, end_commits) + print(clone_evolution) # get the CpI - # Find changed methods in clone pairs - def method_change(clone_methods, relate_commits, function_id): - result_changes = [] # 最终涉及到的所有的变化 - out_clone_methods = [] - all_method_id = [] - mysqlOp.cursor.execute( - "select id from `{tablename1}` where function_id = {function_id}".format( - tablename1=blob_methods, function_id=function_id - ) + # Find all changes during clone pair evolution + start_methods_1 = [] + start_methods_2 = [] + result_changes_1 = [] + result_changes_2 = [] + for start_commit in start: + sql_start_method = """ + select method_id_1, function_id_1, method_id_2, function_id_2 from `{tablename}` + where (function_id_1 = {function_id_1} and function_id_2 = {function_id_2} and commit_id={commit_id}) or + (function_id_1 = {function_id_2} and function_id_2 = {function_id_1} and commit_id={commit_id}) + """.format( + tablename=clone_relations_function, + function_id_1=self.function_id_1, + function_id_2=self.function_id_2, + commit_id=start_commit, ) - all_methods = mysqlOp.cursor.fetchall() - for method in all_methods: # 该function下的所有method_id - all_method_id.append(method.get("id")) - for method_id in clone_methods: # 找到所有与克隆相关的change - sql_change = """ - select * from `{tablename1}` where method_id_1 = {method_id} - """.format( - tablename1=method_function_relations, - method_id=method_id, - ) - mysqlOp.cursor.execute(sql_change) - result_changes.extend(mysqlOp.cursor.fetchall()) - for change in result_changes: # 挑出发生变化后不属于克隆对但还是属于该function_id下的method_id - if ( - change.get("method_id_2") in all_method_id - and change.get("method_id_2") not in clone_methods - ): - out_clone_methods.append(change.get("method_id_2")) + mysqlOp.cursor.execute(sql_start_method) + start_clone_pair = mysqlOp.cursor.fetchone() + if start_clone_pair.get("function_id_1") == self.function_id_1: + start_methods_1.append(start_clone_pair.get("method_id_1")) + start_methods_2.append(start_clone_pair.get("method_id_2")) + elif start_clone_pair.get("function_id_1") == self.function_id_2: + start_methods_1.append(start_clone_pair.get("method_id_2")) + start_methods_2.append(start_clone_pair.get("method_id_1")) - def out_clone_change(out_clone_methods): - next_methods = [] - for method_id in out_clone_methods: - check_commit = """ - select commit_id from `{tablename1}` where blob_id = ( - select blob_id from `{tablename2}` where id = {id} - ) + def method_change(start_method_id, clone_pair_related_commits, function_id): + result_changes = [] # 最终涉及到的所有的变化 + # 首先:找到该method_id发生的所有变化。没找到变化就返回空值 + sql_change = """ + select * from `{tablename}` where method_id_1 = {method_id} + """.format( + tablename=method_function_relations, method_id=start_method_id + ) + mysqlOp.cursor.execute(sql_change) + changes = mysqlOp.cursor.fetchall() + if len(changes) == 0: + return result_changes + else: # 其次:针对获取到的变化后的所有method_id,查看其是否在克隆的生命周期内。 + for change in changes: + changed_method_id = change.get("method_id_2") + sql_related_methods = """ + select commit_id from `{tablename1}` t1 ,`{tablename2}` t2 + where t1.blob_id = t2.blob_id and + function_id = {function_id} and + t2.id = {method_id} """.format( tablename1=blob_commit_relations, tablename2=blob_methods, - id=method_id, + function_id=function_id, + method_id=changed_method_id, ) - mysqlOp.cursor.execute(check_commit) - commit_id = mysqlOp.cursor.fetchone()[0] - if commit_id in relate_commits: # 检查这个method所在的commit是否属于克隆的生命周期内 - mysqlOp.cursor.execute( - "select * from `{tablename1}` where method_id_1 = {method_id}".format( - tablename1=method_function_relations, - method_id=method_id, - ) - ) - results = mysqlOp.cursor.fetchall() - if len(results) == 0: # 未产生变化就返回空值 - return [] - else: - for ( - result - ) in results: # 挑出发生变化后不属于克隆对但还是属于该function_id下的method_id - if ( - result.get("method_id_2") in all_method_id - and result.get("method_id_2") not in clone_methods - ): - next_methods.append(result.get("method_id_2")) - results.extend(out_clone_change(next_methods)) - return results + mysqlOp.cursor.execute(sql_related_methods) + commits = mysqlOp.cursor.fetchall() + related_commit_id = [] + if len(commits) == 0: + changes.remove(change) else: - return [] - - if len(out_clone_methods) != 0: - result_changes.extend(out_clone_change(out_clone_methods)) + for commit in commits: + related_commit_id.append(commit.get("commit_id")) + if set(related_commit_id) <= set(clone_pair_related_commits): + result_changes.extend( + method_change( + changed_method_id, + clone_pair_related_commits, + function_id, + ) + ) + changed_commit = min(related_commit_id) + change["commit_id"] = changed_commit + result_changes.extend(changes) return result_changes - result_changes_1 = method_change( - clone_methods_1, relate_commits, self.function_id_1 - ) - result_changes_2 = method_change( - clone_methods_2, relate_commits, self.function_id_2 - ) + for i in range(len(start_methods_1)): + result_changes_1.extend( + method_change(start_methods_1[i], relate_commits, self.function_id_1) + ) + result_changes_2.extend( + method_change(start_methods_2[i], relate_commits, self.function_id_2) + ) sum_changes = len(result_changes_1) + len(result_changes_2) print(sum_changes) @@ -184,29 +236,22 @@ class RiskEvaluator(object): # get CCL # Find Latency in different commits - changed_method_list1 = [] - changed_method_list2 = [] + consistent_change_list1 = [] + consistent_change_list2 = [] target_commits = [] if consistent_changes == 0: CCL = 0 else: CCL = 0 - commits_1 = [] - commits_2 = [] for id in change_pair_id_1: - changed_method_list1.append(result_changes_1[id].get("method_id_2")) + consistent_change_list1.append(result_changes_1[id]) for id in change_pair_id_2: - changed_method_list2.append(result_changes_2[id].get("method_id_2")) - for i in range(len(changed_method_list1)): - changed_method_id_1 = changed_method_list1[i] - changed_method_id_2 = changed_method_list2[i] - for clone_pair in clone_pairs: - if clone_pair.get("method_id_1") == changed_method_id_1: - commits_1.append(clone_pair.get("commit_id")) - if clone_pair.get("method_id_2") == changed_method_id_2: - commits_2.append(clone_pair.get("commit_id")) - commit_target_1 = min(commits_1) - commit_target_2 = min(commits_2) + consistent_change_list2.append(result_changes_2[id]) + for i in range(len(consistent_change_list1)): + consistent_changed_method_1 = consistent_change_list1[i] + consistent_changed_method_2 = consistent_change_list2[i] + commit_target_1 = consistent_changed_method_1.get("commmit_id") + commit_target_2 = consistent_changed_method_2.get("commmit_id") if commit_target_1 == commit_target_2: CCL = 0 else: @@ -268,6 +313,6 @@ repoInfos: List[RepoInfo] = FileOperator("repos").load_repos() mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml") for repoInfo in repoInfos: clone_pair = RiskEvaluator( - 10, 8, repoInfo + 14, 10, repoInfo ) # Once evaluate the risk of one clone pair(function_id) print(clone_pair.evaluate(mysqlOp)) From cc30fcd5595963870b84c97502914c7fb110c9cb Mon Sep 17 00:00:00 2001 From: zhangxunhui Date: Sun, 28 Aug 2022 07:45:53 +0800 Subject: [PATCH 14/23] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86tablename?= =?UTF-8?q?=E7=9A=84=E8=8E=B7=E5=8F=96=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RiskEvaluator.py | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 58c4bf9..5314d0f 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -27,30 +27,12 @@ class RiskEvaluator(object): 4.get the fix message of commit of consistent change 5.evaluate the risk """ - method_function_relations = "{id}{separator}method_function_relations".format( - id=self.repoInfo.id, - separator=GlobalConstants.SEPARATOR, - ) - clone_relations_function = "{id}{separator}clone_relations_function".format( - id=self.repoInfo.id, - separator=GlobalConstants.SEPARATOR, - ) - commits = "{id}{separator}commits".format( - id=self.repoInfo.id, - separator=GlobalConstants.SEPARATOR, - ) - blob_methods = "{id}{separator}blob_methods".format( - id=self.repoInfo.id, - separator=GlobalConstants.SEPARATOR, - ) - blob_commit_relations = "{id}{separator}blob_commit_relations".format( - id=self.repoInfo.id, - separator=GlobalConstants.SEPARATOR, - ) - commit_relations = "{id}{separator}commit_relations".format( - id=self.repoInfo.id, - separator=GlobalConstants.SEPARATOR, - ) + method_function_relations = mysqlOp.tablename_dict["method_function_relations"] + clone_relations_function = mysqlOp.tablename_dict["clone_relations_function"] + commits = mysqlOp.tablename_dict["commits"] + blob_methods = mysqlOp.tablename_dict["blob_methods"] + blob_commit_relations = mysqlOp.tablename_dict["blob_commit_relations"] + commit_relations = mysqlOp.tablename_dict["commit_relations"] # Fix: only consider the life time of clone pair(clone pair existing commits) relate_commits = [] sql_clone_pairs = """ @@ -310,9 +292,13 @@ class RiskEvaluator(object): repoInfos: List[RepoInfo] = FileOperator("repos").load_repos() -mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml") for repoInfo in repoInfos: + mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml", repoInfo=repoInfo) + # clone_pair = RiskEvaluator( + # 14, 10, repoInfo + # ) # Once evaluate the risk of one clone pair(function_id) + clone_pair = RiskEvaluator( - 14, 10, repoInfo + 10, 7, repoInfo ) # Once evaluate the risk of one clone pair(function_id) print(clone_pair.evaluate(mysqlOp)) From ca6d540b98d735abb3c7a28985bbb9d4e6d26790 Mon Sep 17 00:00:00 2001 From: zhangxunhui Date: Sun, 28 Aug 2022 08:15:48 +0800 Subject: [PATCH 15/23] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BA=86=E5=AF=B9relat?= =?UTF-8?q?ed=5Fcommits=E7=9A=84=E8=8E=B7=E5=8F=96=E6=93=8D=E4=BD=9C?= =?UTF-8?q?=EF=BC=8C=E5=8E=BB=E6=8E=89=E4=BA=86=E4=B8=80=E4=BA=9B=E6=97=A0?= =?UTF-8?q?=E7=94=A8=E5=AD=97=E6=AE=B5=E7=9A=84=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RiskEvaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 5314d0f..00ef9f4 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -36,7 +36,7 @@ class RiskEvaluator(object): # Fix: only consider the life time of clone pair(clone pair existing commits) relate_commits = [] sql_clone_pairs = """ - select method_id_1,function_id_1,method_id_2,function_id_2,commit_id from `{tablename}` + select commit_id from `{tablename}` where (function_id_1 = {function_id_1} and function_id_2 = {function_id_2}) or (function_id_1 = {function_id_2} and function_id_2 = {function_id_1}) """.format( From 4c64d3164ed185c6d32344989cbeabeac28f163f Mon Sep 17 00:00:00 2001 From: zhangxunhui Date: Sun, 28 Aug 2022 08:58:16 +0800 Subject: [PATCH 16/23] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86start=20end=20?= =?UTF-8?q?middle=20commit=E7=9A=84=E8=AF=86=E5=88=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RiskEvaluator.py | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 00ef9f4..1d0afa0 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -48,34 +48,56 @@ class RiskEvaluator(object): clone_pairs = mysqlOp.cursor.fetchall() for clone_pair in clone_pairs: relate_commits.append(clone_pair.get("commit_id")) - # get the start and end commmits of clone pair + # get the related commmits of clone pair + candidate_commits = [ + commit_id + for commit_id in range(min(relate_commits), max(relate_commits) + 1) + ] + start_commits = [] end_commits = [] - for commit_id in relate_commits: + middle_commits = [] + + for commit_id in candidate_commits: sql_parent = """ select parent_id from `{tablename}` where id={id} """.format( tablename=commit_relations, id=commit_id ) mysqlOp.cursor.execute(sql_parent) - parent_id = mysqlOp.cursor.fetchone() + parent_ids = mysqlOp.cursor.fetchall() + parent_ids = [parent_id["parent_id"] for parent_id in parent_ids] sql_son = """ select id from `{tablename}` where parent_id={id} """.format( tablename=commit_relations, id=commit_id ) mysqlOp.cursor.execute(sql_son) - son_id = mysqlOp.cursor.fetchone() - if parent_id is None: + son_ids = mysqlOp.cursor.fetchall() + son_ids = [son_id["id"] for son_id in son_ids] + + intersect_parents = list(set(parent_ids) & set(candidate_commits)) + intersect_children = list(set(son_ids) & set(candidate_commits)) + # if no parent in candidate_commits & at least one child in candidate_commits & there exists clone relationship in this commit -> candidate_start + if ( + len(intersect_parents) == 0 + and len(intersect_children) > 0 + and commit_id in relate_commits + ): start_commits.append(commit_id) - elif son_id is None: + # if at least one parent in candidate_commits & no child in candidate_commits & there exists clone relationship in this commit -> candidate_end + elif ( + len(intersect_parents) > 0 + and len(intersect_children) == 0 + and commit_id in relate_commits + ): end_commits.append(commit_id) - elif parent_id.get("parent_id") not in relate_commits: - start_commits.append(commit_id) - elif son_id.get("id") not in relate_commits: - end_commits.append(commit_id) - else: + # if no parent in candidate_commits & no child in candidate_commits -> ignore + elif len(intersect_parents) == 0 and len(intersect_children) == 0: continue + # if at least one parent in candidate_commits & at least one child in candidate_commits -> middle_commit + elif len(intersect_parents) > 0 and len(intersect_children) > 0: + middle_commits.append(commit_id) # get the clone pair evolution clone_evolution = [] clone_pair_related_commits = [] From 01412c897df76924860ec6c8dfcbf00c05bc8689 Mon Sep 17 00:00:00 2001 From: zhangxunhui Date: Sun, 28 Aug 2022 09:12:34 +0800 Subject: [PATCH 17/23] =?UTF-8?q?=E5=8E=BB=E6=8E=89=E4=BA=86clone=20evolut?= =?UTF-8?q?ion=E7=9A=84=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RiskEvaluator.py | 36 +----------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 1d0afa0..c8707ec 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -98,40 +98,6 @@ class RiskEvaluator(object): # if at least one parent in candidate_commits & at least one child in candidate_commits -> middle_commit elif len(intersect_parents) > 0 and len(intersect_children) > 0: middle_commits.append(commit_id) - # get the clone pair evolution - clone_evolution = [] - clone_pair_related_commits = [] - start = [] - start.extend(start_commits) - - def get_evolution_of_clone_pair(start_commits: List, end_commits: List): - clone_evolution = [] - clone_pair_related_commits.extend(start_commits) - for commit in start_commits: - mysqlOp.cursor.execute( - "select id from `{tablename}` where parent_id = {id}".format( - tablename=commit_relations, id=commit - ) - ) - result_commits = mysqlOp.cursor.fetchall() - for result in result_commits: - clone_evolution.append((commit, result.get("id"))) - if result.get("id") in clone_pair_related_commits: - continue - elif result.get("id") in end_commits: - end_commits.remove(result.get("id")) - clone_pair_related_commits.append(result.get("id")) - else: - start_commits.append(result.get("id")) - start_commits.remove(commit) - if len(end_commits) != 0: - clone_evolution.extend( - get_evolution_of_clone_pair(start_commits, end_commits) - ) - return clone_evolution - - clone_evolution = get_evolution_of_clone_pair(start_commits, end_commits) - print(clone_evolution) # get the CpI # Find all changes during clone pair evolution @@ -139,7 +105,7 @@ class RiskEvaluator(object): start_methods_2 = [] result_changes_1 = [] result_changes_2 = [] - for start_commit in start: + for start_commit in start_commits: sql_start_method = """ select method_id_1, function_id_1, method_id_2, function_id_2 from `{tablename}` where (function_id_1 = {function_id_1} and function_id_2 = {function_id_2} and commit_id={commit_id}) or From ba5436f367006832d906bc0b037adbe534fa580d Mon Sep 17 00:00:00 2001 From: zhangxunhui Date: Sun, 28 Aug 2022 10:04:19 +0800 Subject: [PATCH 18/23] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86son=E5=92=8Cpa?= =?UTF-8?q?rent=20commit=E7=9A=84=E8=8E=B7=E5=8F=96=E6=96=B9=E5=BC=8F=20&?= =?UTF-8?q?=20=E4=BF=AE=E6=94=B9=E4=BA=86=E6=BC=94=E5=8C=96=E8=B7=AF?= =?UTF-8?q?=E5=BE=84=E4=B8=AD=E7=9B=B8=E5=85=B3method=E7=9A=84=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RiskEvaluator.py | 97 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 17 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index c8707ec..78192dd 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import List +from typing import Dict, List, Tuple import GlobalConstants from FileOperator import FileOperator @@ -58,23 +58,32 @@ class RiskEvaluator(object): end_commits = [] middle_commits = [] + commit_children_dict = {} # find children by parent_id + commit_parents_dict = {} # find parents by child_id + mysqlOp.cursor.execute( + "select id, parent_id from `{commit_relations}`".format( + commit_relations=commit_relations + ) + ) + commit_relation_results = mysqlOp.cursor.fetchall() + for relation in commit_relation_results: + commit_id = relation["id"] + parent_id = relation["parent_id"] + commit_children_dict.setdefault(parent_id, []) + commit_children_dict[parent_id].append(commit_id) + commit_parents_dict.setdefault(commit_id, []) + commit_parents_dict[commit_id].append(parent_id) + for commit_id in candidate_commits: - sql_parent = """ - select parent_id from `{tablename}` where id={id} - """.format( - tablename=commit_relations, id=commit_id - ) - mysqlOp.cursor.execute(sql_parent) - parent_ids = mysqlOp.cursor.fetchall() - parent_ids = [parent_id["parent_id"] for parent_id in parent_ids] - sql_son = """ - select id from `{tablename}` where parent_id={id} - """.format( - tablename=commit_relations, id=commit_id - ) - mysqlOp.cursor.execute(sql_son) - son_ids = mysqlOp.cursor.fetchall() - son_ids = [son_id["id"] for son_id in son_ids] + if commit_id not in commit_parents_dict: + parent_ids = [] + else: + parent_ids = commit_parents_dict[commit_id] + + if commit_id not in commit_children_dict: + son_ids = [] + else: + son_ids = commit_children_dict[commit_id] intersect_parents = list(set(parent_ids) & set(candidate_commits)) intersect_children = list(set(son_ids) & set(candidate_commits)) @@ -100,6 +109,60 @@ class RiskEvaluator(object): middle_commits.append(commit_id) # get the CpI + # find related method ids in commits + def find_related_methods(function_id: int) -> List[Tuple[int, int]]: + result = [] + sql = """ + select bm.id as method_id, bcr.commit_id + from `{blob_methods}` bm, `{blob_commit_relations}` bcr + where bm.blob_id=bcr.blob_id + and bm.function_id=%s + """.format( + blob_methods=blob_methods, blob_commit_relations=blob_commit_relations + ) + mysqlOp.cursor.execute(sql, (function_id,)) + methods = mysqlOp.cursor.fetchall() + for method in methods: + method_id = method["method_id"] + commit_id = method["commit_id"] + result.append((method_id, commit_id)) + return result + + candidate_methods_1 = find_related_methods(function_id=self.function_id_1) + candidate_methods_2 = find_related_methods(function_id=self.function_id_2) + + def filter_candidate_methods( + candidate_methods: List[Tuple[int, int]], commit_ids: List[int] + ) -> Tuple[List[int], Dict[int, List[int]]]: + """ + return: + - method ids + - { + method_id: [commit_ids] # a method can be related to multiple commits + } + """ + method_ids = [] + method_commit_dict = {} + for candidate_method in candidate_methods: + if candidate_method[1] in commit_ids: + method_ids.append(candidate_method[0]) + method_commit_dict.setdefault(candidate_method[0], []) + method_commit_dict[candidate_method[0]].append(candidate_method[1]) + return list(set(method_ids)), method_commit_dict + + all_methods_1, method_commit_dict_1 = filter_candidate_methods( + candidate_methods=candidate_methods_1, + commit_ids=list( + set(start_commits) | set(end_commits) | set(middle_commits) + ), + ) + all_methods_2, method_commit_dict_2 = filter_candidate_methods( + candidate_methods=candidate_methods_2, + commit_ids=list( + set(start_commits) | set(end_commits) | set(middle_commits) + ), + ) + # Find all changes during clone pair evolution start_methods_1 = [] start_methods_2 = [] From ef9be527e6812b2403b5eccf55dceade16d07d77 Mon Sep 17 00:00:00 2001 From: zhangxunhui Date: Sun, 28 Aug 2022 10:51:44 +0800 Subject: [PATCH 19/23] =?UTF-8?q?=E5=8F=98=E6=9B=B4=E4=BA=86sum=5Fchange?= =?UTF-8?q?=E7=9A=84=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RiskEvaluator.py | 123 ++++++++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 70 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 78192dd..f5fa9f4 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -164,84 +164,67 @@ class RiskEvaluator(object): ) # Find all changes during clone pair evolution - start_methods_1 = [] - start_methods_2 = [] - result_changes_1 = [] - result_changes_2 = [] - for start_commit in start_commits: - sql_start_method = """ - select method_id_1, function_id_1, method_id_2, function_id_2 from `{tablename}` - where (function_id_1 = {function_id_1} and function_id_2 = {function_id_2} and commit_id={commit_id}) or - (function_id_1 = {function_id_2} and function_id_2 = {function_id_1} and commit_id={commit_id}) - """.format( - tablename=clone_relations_function, - function_id_1=self.function_id_1, - function_id_2=self.function_id_2, - commit_id=start_commit, - ) - mysqlOp.cursor.execute(sql_start_method) - start_clone_pair = mysqlOp.cursor.fetchone() - if start_clone_pair.get("function_id_1") == self.function_id_1: - start_methods_1.append(start_clone_pair.get("method_id_1")) - start_methods_2.append(start_clone_pair.get("method_id_2")) - elif start_clone_pair.get("function_id_1") == self.function_id_2: - start_methods_1.append(start_clone_pair.get("method_id_2")) - start_methods_2.append(start_clone_pair.get("method_id_1")) - - def method_change(start_method_id, clone_pair_related_commits, function_id): - result_changes = [] # 最终涉及到的所有的变化 - # 首先:找到该method_id发生的所有变化。没找到变化就返回空值 + def get_method_change( + all_methods: List[int], + ) -> List[Tuple[int, int, bytes, bytes]]: + """ + result: + [( + method_old, + method_new, + add_change, + delete_change + )] + """ + result_changes = [] + all_methods_str = [str(method_id) for method_id in all_methods] + method_ids = "(" + ",".join(all_methods_str) + ")" sql_change = """ - select * from `{tablename}` where method_id_1 = {method_id} + select method_id_1, method_id_2, GROUP_CONCAT(distinct `change`) as `change` from `{tablename}` + where method_id_1 in {method_ids} + and method_id_2 in {method_ids} + and `change` is not null + group by method_id_1, method_id_2 """.format( - tablename=method_function_relations, method_id=start_method_id + tablename=method_function_relations, method_ids=method_ids ) mysqlOp.cursor.execute(sql_change) changes = mysqlOp.cursor.fetchall() - if len(changes) == 0: - return result_changes - else: # 其次:针对获取到的变化后的所有method_id,查看其是否在克隆的生命周期内。 - for change in changes: - changed_method_id = change.get("method_id_2") - sql_related_methods = """ - select commit_id from `{tablename1}` t1 ,`{tablename2}` t2 - where t1.blob_id = t2.blob_id and - function_id = {function_id} and - t2.id = {method_id} - """.format( - tablename1=blob_commit_relations, - tablename2=blob_methods, - function_id=function_id, - method_id=changed_method_id, - ) - mysqlOp.cursor.execute(sql_related_methods) - commits = mysqlOp.cursor.fetchall() - related_commit_id = [] - if len(commits) == 0: - changes.remove(change) + + def extract_changes(content: bytes) -> Tuple[bytes, bytes]: + """ + return: + - add contents + - delete contents + """ + add_contents = b"" + delete_content = b"" + lines = content.splitlines() + add_flag = False + for line in lines: + if line == b"ADD:": + add_flag = True + elif line == b"DELETE:": + add_flag = False else: - for commit in commits: - related_commit_id.append(commit.get("commit_id")) - if set(related_commit_id) <= set(clone_pair_related_commits): - result_changes.extend( - method_change( - changed_method_id, - clone_pair_related_commits, - function_id, - ) - ) - changed_commit = min(related_commit_id) - change["commit_id"] = changed_commit - result_changes.extend(changes) + if add_flag: + add_contents += b"".join(line.split()) + else: + delete_content += b"".join(line.split()) + return (add_contents, delete_content) + + for change in changes: + method_id_1 = change["method_id_1"] + method_id_2 = change["method_id_2"] + change = change["change"] + add_change, delete_change = extract_changes(content=change) + result_changes.append( + (method_id_1, method_id_2, add_change, delete_change) + ) return result_changes - for i in range(len(start_methods_1)): - result_changes_1.extend( - method_change(start_methods_1[i], relate_commits, self.function_id_1) - ) - result_changes_2.extend( - method_change(start_methods_2[i], relate_commits, self.function_id_2) - ) + result_changes_1 = get_method_change(all_methods=all_methods_1) + result_changes_2 = get_method_change(all_methods=all_methods_2) sum_changes = len(result_changes_1) + len(result_changes_2) print(sum_changes) From bdc4b97a2ebb90a1018406f0f7779d4d8272aec9 Mon Sep 17 00:00:00 2001 From: zhangxunhui Date: Sun, 28 Aug 2022 10:56:31 +0800 Subject: [PATCH 20/23] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E4=B8=80?= =?UTF-8?q?=E8=87=B4=E6=80=A7=E5=8F=98=E6=9B=B4=E7=9A=84=E8=8E=B7=E5=8F=96?= =?UTF-8?q?=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RiskEvaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index f5fa9f4..26f0d40 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -240,9 +240,9 @@ class RiskEvaluator(object): change_list_2 = [] consistent_changes = 0 for change in result_changes_1: - change_list_1.append(change.get("change")) + change_list_1.append((change[2], change[3])) for change in result_changes_2: - change_list_2.append(change.get("change")) + change_list_2.append((change[2], change[3])) for i in range(len(change_list_1)): for j in range(len(change_list_2)): if change_list_1[i] == change_list_2[j]: From b05f05ca69892f772eb4e9e346fea8eec249bb81 Mon Sep 17 00:00:00 2001 From: zhangxunhui Date: Sun, 28 Aug 2022 11:46:32 +0800 Subject: [PATCH 21/23] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=BA=86=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E7=9A=84=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RiskEvaluator.py | 105 +++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 40 deletions(-) diff --git a/RiskEvaluator.py b/RiskEvaluator.py index 26f0d40..d424499 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -1,7 +1,6 @@ -from collections import Counter +import re from typing import Dict, List, Tuple -import GlobalConstants from FileOperator import FileOperator from models.RepoInfo import RepoInfo from MySQLOperator import MySQLOperator @@ -231,27 +230,39 @@ class RiskEvaluator(object): # get CCR # Find consistent_changes in all changes - change_pair_id_1 = [] - change_pair_id_2 = [] - if sum_changes <= 1: - consistent_changes = 0 - else: - change_list_1 = [] - change_list_2 = [] - consistent_changes = 0 - for change in result_changes_1: - change_list_1.append((change[2], change[3])) - for change in result_changes_2: - change_list_2.append((change[2], change[3])) - for i in range(len(change_list_1)): - for j in range(len(change_list_2)): - if change_list_1[i] == change_list_2[j]: - change_pair_id_1.append(i) - change_pair_id_2.append(j) - consistent_changes = consistent_changes + 2 + consistent_change_list1 = [] + consistent_change_list2 = [] + consistent_changes = 0 + for change_1 in result_changes_1: + for change_2 in result_changes_2: + if change_1[2] == change_2[2] and change_1[3] == change_2[3]: + consistent_change_list1.append(change_1) + consistent_change_list2.append(change_2) + consistent_changes = consistent_changes + 2 # get CCL # Find Latency in different commits + def get_commit_change_by_method_change( + method_old: int, method_new: int, method_commit_dict: dict + ) -> List[Tuple[int, int]]: + """ + Function: get the change of commits via the change of methods + return: + - [( + commit_old, + commit_new + )] + """ + result = [] + commits_old = method_commit_dict[method_old] + commits_new = method_commit_dict[method_new] + for commit_old in commits_old: + children_old = commit_children_dict[commit_old] + intersect_commits = set(children_old) & set(commits_new) + for commit_id in intersect_commits: + result.append((commit_old, commit_id)) + return result + consistent_change_list1 = [] consistent_change_list2 = [] target_commits = [] @@ -259,21 +270,33 @@ class RiskEvaluator(object): CCL = 0 else: CCL = 0 - for id in change_pair_id_1: - consistent_change_list1.append(result_changes_1[id]) - for id in change_pair_id_2: - consistent_change_list2.append(result_changes_2[id]) for i in range(len(consistent_change_list1)): - consistent_changed_method_1 = consistent_change_list1[i] - consistent_changed_method_2 = consistent_change_list2[i] - commit_target_1 = consistent_changed_method_1.get("commmit_id") - commit_target_2 = consistent_changed_method_2.get("commmit_id") - if commit_target_1 == commit_target_2: - CCL = 0 - else: - target_commits.append(commit_target_1) - target_commits.append(commit_target_2) - CCL = CCL + 1 + change_1 = consistent_change_list1[i] + change_2 = consistent_change_list2[i] + method_old_1 = change_1[0] + method_new_1 = change_1[1] + method_old_2 = change_2[0] + method_new_2 = change_2[1] + + commit_changes_1 = get_commit_change_by_method_change( + method_old=method_old_1, + method_new=method_new_1, + method_commit_dict=method_commit_dict_1, + ) + commit_changes_2 = get_commit_change_by_method_change( + method_old=method_old_2, + method_new=method_new_2, + method_commit_dict=method_commit_dict_2, + ) + + consistent_change_commit_paths = list( + set(commit_changes_1) & set(commit_changes_2) + ) + CCL += len(consistent_change_commit_paths) + + target_commits = list( + set([path[1] for path in consistent_change_commit_paths]) + ) # get bug_fix_num if CCL == 0: @@ -287,13 +310,15 @@ class RiskEvaluator(object): tablename1=commits, id=commit ) mysqlOp.cursor.execute(sql_message) - messages = mysqlOp.cursor.fetchall() - for message in messages: - keywords = ( - message.get("message").decode().replace("\n", " ").split() + message = mysqlOp.cursor.fetchone()["message"].lower() + if ( + re.search( + rb"(close|closes|closed|fix|fixes|fixed|resolve|resolves|resolved)\s+.*?#\d+", + message, ) - if "fix" in keywords or "Fix" in keywords: - bug_fix_num = bug_fix_num + 1 + is not None + ): + bug_fix_num += 1 def Harmness_Evaluating(CpI: int, CCR: int, CCL: int, bug_fix_num: int) -> int: """ From 2ba4cdf1a1bedc2d0266477d1aa50bc5b6287dc8 Mon Sep 17 00:00:00 2001 From: zy Date: Wed, 31 Aug 2022 22:51:34 +0800 Subject: [PATCH 22/23] finish fix the method function relations and get all popular java projects --- .gitignore | 1 + .isort.cfg | 2 +- MethodFunctionRelationExtractor.py | 25 ++------ getPopularJavaProjectsFromGithub.py | 63 +++++++++++++++++++++ sql_templates/method_function_relations.sql | 2 +- 5 files changed, 72 insertions(+), 21 deletions(-) create mode 100644 getPopularJavaProjectsFromGithub.py diff --git a/.gitignore b/.gitignore index 4d2beaa..863df2c 100644 --- a/.gitignore +++ b/.gitignore @@ -144,3 +144,4 @@ config.yml #repos repos delete_repos +test/ diff --git a/.isort.cfg b/.isort.cfg index c1cbfb0..a3dbb92 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,4 +1,4 @@ [settings] line_length = 79 multi_line_output = 3 -known_third_party =dulwich,numpy,pandas,pymysql,sqlalchemy,yaml +known_third_party =dulwich,numpy,pandas,pymysql,requests,sqlalchemy,yaml diff --git a/MethodFunctionRelationExtractor.py b/MethodFunctionRelationExtractor.py index 4618cf2..61d1586 100644 --- a/MethodFunctionRelationExtractor.py +++ b/MethodFunctionRelationExtractor.py @@ -282,9 +282,9 @@ def extract_method_function_relation( """ Some filepaths gotten by dulwich are different with the real filepaths in the mysql database and the key names in filepath_id_dict. When this bug - happened we set filepath_id_old = -1. + happened we set filepath_id_old = None. """ - filepath_id_old = -1 + filepath_id_old = None else: filepath_id_old = filepath_id_dict[row["filepath_old"]] @@ -295,12 +295,12 @@ def extract_method_function_relation( """ Some filepaths gotten by dulwich are different with the real filepaths in the mysql database and the key names in filepath_id_dict. When this bug - happened we set filepath_id_old = -1. + happened we set filepath_id_old = None. Example: When deal with the repository git@github.com:apache/iotdb.git, a filepath in filepath_id_dict is 'iotdb\\metrics\\interface\\src\\main\\java\\org\\apache\\iotdb\\metrics\\DoNothingMetricService.java' while the filepath obtained by dulwich will ignore "iotdb\\" """ - filepath_id_new = -1 + filepath_id_new = None else: filepath_id_new = filepath_id_dict[row["filepath_new"]] @@ -313,9 +313,7 @@ def extract_method_function_relation( if np.isnan(line_old): method_id_old = np.nan else: - if filepath_id_old == -1: - method_id_old = -1 - elif ( + if ( commit_id_old not in commit_filepath_lineno_method_id_dict or filepath_id_old not in commit_filepath_lineno_method_id_dict[commit_id_old] @@ -333,9 +331,7 @@ def extract_method_function_relation( if np.isnan(line_new): method_id_new = np.nan else: - if filepath_id_new == -1: - method_id_new = -1 - elif ( + if ( commit_id_new not in commit_filepath_lineno_method_id_dict or filepath_id_new not in commit_filepath_lineno_method_id_dict[commit_id_new] @@ -461,15 +457,6 @@ def extract_method_function_relation( ), (method_id_1, method_id_2, change_content), ) - # delete the commit where method_id_new= -1 or method_id_old = -1 - mysqlOp.cursor.execute( - "delete from `{method_function_relation_tablename}` where method_id_1 = -1 or method_id_2 = -1".format( - method_function_relation_tablename=mysqlOp.tablename_dict[ - "method_function_relations" - ] - ) - ) - mysqlOp.connection.commit() for commit_sha, commit_id in commit_sha_id_dict.items(): handle_commit(commit_id=commit_id, commit=repo.object_store[commit_sha]) diff --git a/getPopularJavaProjectsFromGithub.py b/getPopularJavaProjectsFromGithub.py new file mode 100644 index 0000000..7644d75 --- /dev/null +++ b/getPopularJavaProjectsFromGithub.py @@ -0,0 +1,63 @@ +# aim: to search java repositories that are popular and active in the last 24 hours; only select the top 100 projects as the simulator +# author: zhangxunhui +# date: 2022-06-22 + +import json +from datetime import datetime, timedelta + +import requests +import yaml + +with open("test/config.yml", "r") as f: + config = yaml.load(f, Loader=yaml.FullLoader) + +projects = [] + + +def query_projects(): + """ + url: https://api.github.com/search/repositories?q=language:java+pushed:>=2022-08-30T10:41:07Z+stars:>=1000&sort=stars&order=desc&per_page=100&page=1 + total_count: 329 + """ + end_time = datetime.utcnow() + start_time = end_time - timedelta(days=1) + pushed = ">=" + start_time.strftime("%Y-%m-%dT%H:%M:%SZ") + page = 1 + url = "https://api.github.com/search/repositories?q=language:{language}+pushed:{pushed}+stars:>={stars}&sort=stars&order=desc&per_page=100&page={page}".format( + language="java", pushed=pushed, stars=1000, page=page + ) + print(url) + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3941.4 Safari/537.36", + "Authorization": "token " + config.get("github_token"), + "Content-Type": "application/json", + "method": "GET", + "Accept": "application/vnd.github.squirrel-girl-preview+json", + } + + response = requests.get(url=url, headers=headers) + response.encoding = "utf-8" + result = json.loads(response.text) + total_count = result.get("total_count") + projects.extend(result["items"]) + if total_count > 100: + while total_count > 100: + page = page + 1 + url = "https://api.github.com/search/repositories?q=language:{language}+pushed:{pushed}+stars:>={stars}&sort=stars&order=desc&per_page=100&page={page}".format( + language="java", pushed=pushed, stars=1000, page=page + ) + response = requests.get(url=url, headers=headers) + response.encoding = "utf-8" + result = json.loads(response.text) + projects.extend(result["items"]) + total_count = total_count - 100 + + print(result["total_count"]) + + with open("test/1_search_popular_active_projects.json", "w") as f: + json.dump(projects, f) + + +if __name__ == "__main__": + query_projects() + print("finish") diff --git a/sql_templates/method_function_relations.sql b/sql_templates/method_function_relations.sql index 5ef7652..5a4a082 100644 --- a/sql_templates/method_function_relations.sql +++ b/sql_templates/method_function_relations.sql @@ -2,7 +2,7 @@ CREATE TABLE IF NOT EXISTS `{tablename}` ( `id` int(11) NOT NULL AUTO_INCREMENT, `method_id_1` int(11) NULL, `method_id_2` int(11) NULL, - `change` blob NULL, # set(block1, block2) a set of change, no order, and no whitespace in each block + `change` longblob NULL, # set(block1, block2) a set of change, no order, and no whitespace in each block PRIMARY KEY (`id`), INDEX(`method_id_1`) USING BTREE, INDEX(`method_id_2`) USING BTREE From 87702ded2517cca750033e0ca2e341dfcb88f0a2 Mon Sep 17 00:00:00 2001 From: zy Date: Fri, 2 Sep 2022 23:11:16 +0800 Subject: [PATCH 23/23] fix the riskevaluator --- .isort.cfg | 2 +- CloneOperator.py | 3 +- RepoExecutor.py | 9 ++ RiskEvaluator.py | 161 ++++++++++++++++++++-------- getPopularJavaProjectsFromGithub.py | 63 ----------- 5 files changed, 127 insertions(+), 111 deletions(-) delete mode 100644 getPopularJavaProjectsFromGithub.py diff --git a/.isort.cfg b/.isort.cfg index a3dbb92..c1cbfb0 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,4 +1,4 @@ [settings] line_length = 79 multi_line_output = 3 -known_third_party =dulwich,numpy,pandas,pymysql,requests,sqlalchemy,yaml +known_third_party =dulwich,numpy,pandas,pymysql,sqlalchemy,yaml diff --git a/CloneOperator.py b/CloneOperator.py index 2781ffa..32d664e 100644 --- a/CloneOperator.py +++ b/CloneOperator.py @@ -84,12 +84,13 @@ class CloneOperator(object): if not handled: p = subprocess.Popen( - "cd {target_folder} && java -jar {NIL_path} -s ./ -mit {mit} -mil {mil} -t {thread_num} -o result.csv".format( + "cd {target_folder} && java -jar {NIL_path} -s ./ -mit {mit} -mil {mil} -t {thread_num} -o result.csv -p {partition_num}".format( target_folder=target_folder, NIL_path=NIL_path, mit=int(nil_config["mit"]), mil=int(nil_config["mil"]), thread_num=int(nil_config["thread_num"]), + partition_num=int(nil_config["partition_num"]), ), shell=True, stdout=subprocess.PIPE, diff --git a/RepoExecutor.py b/RepoExecutor.py index 3a7ad88..5637fb9 100644 --- a/RepoExecutor.py +++ b/RepoExecutor.py @@ -15,6 +15,7 @@ from GitOperator import GitOperator from MethodFunctionRelationExtractor import extract_method_function_relation from models.RepoInfo import RepoInfo from MySQLOperator import MySQLOperator +from RiskEvaluator import evaluate_all_pairs class RepoExecutorThread(threading.Thread): @@ -173,6 +174,14 @@ class RepoExecutorThread(threading.Thread): ) ) + # evaluate the risk of the clone pairs + evaluate_all_pairs(repoInfo=repoInfo) + print( + "[Info]: Thread: {thread_name} finish evaluating all clone pairs in the whole repo: {git_url}".format( + thread_name=self.name, git_url=repoInfo.git_url + ) + ) + # mark the handled repository mysqlOp.update_handled_repository(repoInfo=repoInfo) print( diff --git a/RiskEvaluator.py b/RiskEvaluator.py index d424499..716c57e 100644 --- a/RiskEvaluator.py +++ b/RiskEvaluator.py @@ -1,6 +1,10 @@ import re from typing import Dict, List, Tuple +import pandas as pd +from sqlalchemy import create_engine + +import GlobalConstants from FileOperator import FileOperator from models.RepoInfo import RepoInfo from MySQLOperator import MySQLOperator @@ -73,39 +77,48 @@ class RiskEvaluator(object): commit_parents_dict.setdefault(commit_id, []) commit_parents_dict[commit_id].append(parent_id) - for commit_id in candidate_commits: - if commit_id not in commit_parents_dict: - parent_ids = [] - else: - parent_ids = commit_parents_dict[commit_id] + # if the clone pair only live in one commit, this commit belongs to both start commit and end commit + if len(candidate_commits) == 1: + start_commits.append(candidate_commits[0]) + end_commits.append(candidate_commits[0]) + else: + for commit_id in candidate_commits: + if commit_id not in commit_parents_dict: + parent_ids = [] + else: + parent_ids = commit_parents_dict[commit_id] - if commit_id not in commit_children_dict: - son_ids = [] - else: - son_ids = commit_children_dict[commit_id] + if commit_id not in commit_children_dict: + son_ids = [] + else: + son_ids = commit_children_dict[commit_id] - intersect_parents = list(set(parent_ids) & set(candidate_commits)) - intersect_children = list(set(son_ids) & set(candidate_commits)) - # if no parent in candidate_commits & at least one child in candidate_commits & there exists clone relationship in this commit -> candidate_start - if ( - len(intersect_parents) == 0 - and len(intersect_children) > 0 - and commit_id in relate_commits - ): - start_commits.append(commit_id) - # if at least one parent in candidate_commits & no child in candidate_commits & there exists clone relationship in this commit -> candidate_end - elif ( - len(intersect_parents) > 0 - and len(intersect_children) == 0 - and commit_id in relate_commits - ): - end_commits.append(commit_id) - # if no parent in candidate_commits & no child in candidate_commits -> ignore - elif len(intersect_parents) == 0 and len(intersect_children) == 0: - continue - # if at least one parent in candidate_commits & at least one child in candidate_commits -> middle_commit - elif len(intersect_parents) > 0 and len(intersect_children) > 0: - middle_commits.append(commit_id) + intersect_parents = list(set(parent_ids) & set(candidate_commits)) + intersect_children = list(set(son_ids) & set(candidate_commits)) + # if no parent in candidate_commits & at least one child in candidate_commits & there exists clone relationship in this commit -> candidate_start + if ( + len(intersect_parents) == 0 + and len(intersect_children) > 0 + and commit_id in relate_commits + ): + start_commits.append(commit_id) + # if at least one parent in candidate_commits & no child in candidate_commits & there exists clone relationship in this commit -> candidate_end + elif ( + len(intersect_parents) > 0 + and len(intersect_children) == 0 + and commit_id in relate_commits + ): + end_commits.append(commit_id) + # if no parent in candidate_commits & no child in candidate_commits -> ignore + elif len(intersect_parents) == 0 and len(intersect_children) == 0: + continue + # if at least one parent in candidate_commits & at least one child in candidate_commits -> middle_commit + elif len(intersect_parents) > 0 and len(intersect_children) > 0: + middle_commits.append(commit_id) + for start_commit in start_commits: + commit_parents_dict.setdefault(start_commit, []) + for end_commit in end_commits: + commit_children_dict.setdefault(end_commit, []) # get the CpI # find related method ids in commits @@ -226,7 +239,6 @@ class RiskEvaluator(object): result_changes_2 = get_method_change(all_methods=all_methods_2) sum_changes = len(result_changes_1) + len(result_changes_2) - print(sum_changes) # get CCR # Find consistent_changes in all changes @@ -263,8 +275,6 @@ class RiskEvaluator(object): result.append((commit_old, commit_id)) return result - consistent_change_list1 = [] - consistent_change_list2 = [] target_commits = [] if consistent_changes == 0: CCL = 0 @@ -290,12 +300,12 @@ class RiskEvaluator(object): ) consistent_change_commit_paths = list( - set(commit_changes_1) & set(commit_changes_2) + set(commit_changes_1) | set(commit_changes_2) ) CCL += len(consistent_change_commit_paths) - target_commits = list( - set([path[1] for path in consistent_change_commit_paths]) + target_commits.extend( + list(set([path[1] for path in consistent_change_commit_paths])) ) # get bug_fix_num @@ -350,14 +360,73 @@ class RiskEvaluator(object): return Harmness_Evaluating(sum_changes, consistent_changes, CCL, bug_fix_num) -repoInfos: List[RepoInfo] = FileOperator("repos").load_repos() -for repoInfo in repoInfos: +# Only for test +# repoInfos: List[RepoInfo] = FileOperator("repos").load_repos() +# for repoInfo in repoInfos: +# mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml", repoInfo=repoInfo) +# clone_pair = RiskEvaluator( +# 10, 9, repoInfo +# ) +# print(clone_pair.evaluate(mysqlOp)) +def evaluate_all_pairs(repoInfo): mysqlOp: MySQLOperator = MySQLOperator(config_path="config.yml", repoInfo=repoInfo) # clone_pair = RiskEvaluator( - # 14, 10, repoInfo - # ) # Once evaluate the risk of one clone pair(function_id) - - clone_pair = RiskEvaluator( - 10, 7, repoInfo - ) # Once evaluate the risk of one clone pair(function_id) - print(clone_pair.evaluate(mysqlOp)) + # 10, 9, repoInfo + # ) + # print(clone_pair.evaluate(mysqlOp)) + clone_relations_function = mysqlOp.tablename_dict["clone_relations_function"] + sql_all_clones = """ + select function_id_1,function_id_2 from `{tablename}` + """.format( + tablename=clone_relations_function + ) + mysqlOp.cursor.execute(sql_all_clones) + all_clone_pairs = mysqlOp.cursor.fetchall() + evaluate_list = [] + for clone_pair in all_clone_pairs: + function_id_1 = clone_pair.get("function_id_1") + function_id_2 = clone_pair.get("function_id_2") + clone_pair = RiskEvaluator(function_id_1, function_id_2, repoInfo) + risklevel = clone_pair.evaluate(mysqlOp) + pair = { + "function_id_1": function_id_1, + "function_id_2": function_id_2, + "risk_level": risklevel, + } + evaluate_list.append(pair) + result = pd.DataFrame(evaluate_list) + result_of_evaluator = "{repo_id}{separator}result_of_evaluator".format( + repo_id=repoInfo.id, separator=GlobalConstants.SEPARATOR + ) + sql_result = """ + create table if not exists `{tablename}` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `function_id_1` int(11) NULL, + `function_id_2` int(11) NULL, + `risk_level` int(11) NULL, + PRIMARY KEY (`id`), + INDEX(`function_id_1`) USING BTREE, + INDEX(`function_id_2`) USING BTREE, + INDEX(`risk_level`) USING BTREE + ) + """.format( + tablename=result_of_evaluator + ) + mysqlOp.cursor.execute(sql_result) + mysqlOp.truncate_table(tablename=result_of_evaluator) + config = mysqlOp.config["mysql"] + engine = create_engine( + "mysql+pymysql://{username}:{password}@{host}:{port}/{database}".format( + username=config["user"], + password=config["passwd"], + host=config["host"], + port=config["port"], + database=config["database"], + ) + ) + result.to_sql( + result_of_evaluator, + engine, + index=False, + if_exists="append", + ) diff --git a/getPopularJavaProjectsFromGithub.py b/getPopularJavaProjectsFromGithub.py deleted file mode 100644 index 7644d75..0000000 --- a/getPopularJavaProjectsFromGithub.py +++ /dev/null @@ -1,63 +0,0 @@ -# aim: to search java repositories that are popular and active in the last 24 hours; only select the top 100 projects as the simulator -# author: zhangxunhui -# date: 2022-06-22 - -import json -from datetime import datetime, timedelta - -import requests -import yaml - -with open("test/config.yml", "r") as f: - config = yaml.load(f, Loader=yaml.FullLoader) - -projects = [] - - -def query_projects(): - """ - url: https://api.github.com/search/repositories?q=language:java+pushed:>=2022-08-30T10:41:07Z+stars:>=1000&sort=stars&order=desc&per_page=100&page=1 - total_count: 329 - """ - end_time = datetime.utcnow() - start_time = end_time - timedelta(days=1) - pushed = ">=" + start_time.strftime("%Y-%m-%dT%H:%M:%SZ") - page = 1 - url = "https://api.github.com/search/repositories?q=language:{language}+pushed:{pushed}+stars:>={stars}&sort=stars&order=desc&per_page=100&page={page}".format( - language="java", pushed=pushed, stars=1000, page=page - ) - print(url) - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3941.4 Safari/537.36", - "Authorization": "token " + config.get("github_token"), - "Content-Type": "application/json", - "method": "GET", - "Accept": "application/vnd.github.squirrel-girl-preview+json", - } - - response = requests.get(url=url, headers=headers) - response.encoding = "utf-8" - result = json.loads(response.text) - total_count = result.get("total_count") - projects.extend(result["items"]) - if total_count > 100: - while total_count > 100: - page = page + 1 - url = "https://api.github.com/search/repositories?q=language:{language}+pushed:{pushed}+stars:>={stars}&sort=stars&order=desc&per_page=100&page={page}".format( - language="java", pushed=pushed, stars=1000, page=page - ) - response = requests.get(url=url, headers=headers) - response.encoding = "utf-8" - result = json.loads(response.text) - projects.extend(result["items"]) - total_count = total_count - 100 - - print(result["total_count"]) - - with open("test/1_search_popular_active_projects.json", "w") as f: - json.dump(projects, f) - - -if __name__ == "__main__": - query_projects() - print("finish")