diff --git a/factorExtractor.py b/factorExtractor.py index 669ae79..4c3ad59 100644 --- a/factorExtractor.py +++ b/factorExtractor.py @@ -1,6 +1,6 @@ -import json -import pathlib +import queue import re +import threading import time from collections import Counter from typing import Dict, List, Tuple @@ -9,6 +9,7 @@ import pandas as pd from sqlalchemy import create_engine import GlobalConstants +from ConfigOperator import ConfigOperator from FileOperator import FileOperator from models.RepoInfo import RepoInfo from MySQLOperator import MySQLOperator @@ -518,16 +519,16 @@ def factor_extractor(function_id_1: int, function_id_2: int, mysqlOp: MySQLOpera } -if __name__ == "__main__": - repoInfos: List[RepoInfo] = FileOperator("factor_repos").load_repos() - for repoInfo in repoInfos: +def run(q): + while not q.empty(): + repoInfo: RepoInfo = q.get() mysqlOp: MySQLOperator = MySQLOperator( config_path="config.yml", autocommit=True, repoInfo=repoInfo ) clone_relations_function = mysqlOp.tablename_dict["clone_relations_function"] sql_all_clones = """ - select distinct function_id_1,function_id_2 from `{tablename}` - """.format( + select distinct function_id_1,function_id_2 from `{tablename}` + """.format( tablename=clone_relations_function ) while True: @@ -548,35 +549,35 @@ if __name__ == "__main__": repo_id=repoInfo.id, separator=GlobalConstants.SEPARATOR ) sql_result = """ - create table if not exists `{tablename}` ( - `id` int(11) NOT NULL AUTO_INCREMENT, - `function_id_1` int(11) NULL, - `function_id_2` int(11) NULL, - `similarity` int(11) NULL, - `degree_diff` int(11) NULL, - `is_test` int(11) NULL, - `file_distance` int(11) NULL, - `method_name_same` int(11) NULL, - `history_change_sum` int(11) NULL, - `co_change` int(11) NULL, - `consistant_change` int(11) NULL, - `main_author_same` int(11) NULL, - `author_exp_sum` int(11) NULL, - PRIMARY KEY (`id`), - INDEX(`function_id_1`) USING BTREE, - INDEX(`function_id_2`) USING BTREE, - INDEX(`similarity`) USING BTREE, - INDEX(`degree_diff`) USING BTREE, - INDEX(`is_test`) USING BTREE, - INDEX(`file_distance`) USING BTREE, - INDEX(`method_name_same`) USING BTREE, - INDEX(`history_change_sum`) USING BTREE, - INDEX(`co_change`) USING BTREE, - INDEX(`consistant_change`) USING BTREE, - INDEX(`main_author_same`) USING BTREE, - INDEX(`author_exp_sum`) USING BTREE - ) - """.format( + create table if not exists `{tablename}` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `function_id_1` int(11) NULL, + `function_id_2` int(11) NULL, + `similarity` int(11) NULL, + `degree_diff` int(11) NULL, + `is_test` int(11) NULL, + `file_distance` int(11) NULL, + `method_name_same` int(11) NULL, + `history_change_sum` int(11) NULL, + `co_change` int(11) NULL, + `consistant_change` int(11) NULL, + `main_author_same` int(11) NULL, + `author_exp_sum` int(11) NULL, + PRIMARY KEY (`id`), + INDEX(`function_id_1`) USING BTREE, + INDEX(`function_id_2`) USING BTREE, + INDEX(`similarity`) USING BTREE, + INDEX(`degree_diff`) USING BTREE, + INDEX(`is_test`) USING BTREE, + INDEX(`file_distance`) USING BTREE, + INDEX(`method_name_same`) USING BTREE, + INDEX(`history_change_sum`) USING BTREE, + INDEX(`co_change`) USING BTREE, + INDEX(`consistant_change`) USING BTREE, + INDEX(`main_author_same`) USING BTREE, + INDEX(`author_exp_sum`) USING BTREE + ) + """.format( tablename=factors ) mysqlOp.cursor.execute(sql_result) @@ -597,6 +598,32 @@ if __name__ == "__main__": index=False, if_exists="append", ) + q.task_done() print( "finish extract factors in repo: {git_url}".format(git_url=repoInfo.git_url) ) + + +if __name__ == "__main__": + repoInfos: List[RepoInfo] = FileOperator("factor_repos").load_repos() + mysqlOp = MySQLOperator(config_path=GlobalConstants.CONFIG_PATH) + workQueue = queue.Queue() + for repoInfo in repoInfos: + # query the id of this repository + repo_id = mysqlOp.get_repo_id_by_names(repoInfo=repoInfo) + repoInfo.id = repo_id + workQueue.put(repoInfo) + threads = [] + for i in range( + int( + ConfigOperator(config_path=GlobalConstants.CONFIG_PATH).read_config()[ + "RCD" + ]["thread_num"] + ) + ): + t = threading.Thread(target=run, args=(workQueue,)) + t.start() + threads.append(t) + for t in threads: + t.join() + print("Finish")