Merge pull request '将特征提取修改为可以多线程运行的' (#61) from zy into master

This commit is contained in:
MillerEvan 2022-09-18 00:28:04 +08:00
commit 67453767a1
1 changed file with 63 additions and 36 deletions

View File

@ -1,6 +1,6 @@
import json import queue
import pathlib
import re import re
import threading
import time import time
from collections import Counter from collections import Counter
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
@ -9,6 +9,7 @@ import pandas as pd
from sqlalchemy import create_engine from sqlalchemy import create_engine
import GlobalConstants import GlobalConstants
from ConfigOperator import ConfigOperator
from FileOperator import FileOperator from FileOperator import FileOperator
from models.RepoInfo import RepoInfo from models.RepoInfo import RepoInfo
from MySQLOperator import MySQLOperator from MySQLOperator import MySQLOperator
@ -518,16 +519,16 @@ def factor_extractor(function_id_1: int, function_id_2: int, mysqlOp: MySQLOpera
} }
if __name__ == "__main__": def run(q):
repoInfos: List[RepoInfo] = FileOperator("factor_repos").load_repos() while not q.empty():
for repoInfo in repoInfos: repoInfo: RepoInfo = q.get()
mysqlOp: MySQLOperator = MySQLOperator( mysqlOp: MySQLOperator = MySQLOperator(
config_path="config.yml", autocommit=True, repoInfo=repoInfo config_path="config.yml", autocommit=True, repoInfo=repoInfo
) )
clone_relations_function = mysqlOp.tablename_dict["clone_relations_function"] clone_relations_function = mysqlOp.tablename_dict["clone_relations_function"]
sql_all_clones = """ sql_all_clones = """
select distinct function_id_1,function_id_2 from `{tablename}` select distinct function_id_1,function_id_2 from `{tablename}`
""".format( """.format(
tablename=clone_relations_function tablename=clone_relations_function
) )
while True: while True:
@ -548,35 +549,35 @@ if __name__ == "__main__":
repo_id=repoInfo.id, separator=GlobalConstants.SEPARATOR repo_id=repoInfo.id, separator=GlobalConstants.SEPARATOR
) )
sql_result = """ sql_result = """
create table if not exists `{tablename}` ( create table if not exists `{tablename}` (
`id` int(11) NOT NULL AUTO_INCREMENT, `id` int(11) NOT NULL AUTO_INCREMENT,
`function_id_1` int(11) NULL, `function_id_1` int(11) NULL,
`function_id_2` int(11) NULL, `function_id_2` int(11) NULL,
`similarity` int(11) NULL, `similarity` int(11) NULL,
`degree_diff` int(11) NULL, `degree_diff` int(11) NULL,
`is_test` int(11) NULL, `is_test` int(11) NULL,
`file_distance` int(11) NULL, `file_distance` int(11) NULL,
`method_name_same` int(11) NULL, `method_name_same` int(11) NULL,
`history_change_sum` int(11) NULL, `history_change_sum` int(11) NULL,
`co_change` int(11) NULL, `co_change` int(11) NULL,
`consistant_change` int(11) NULL, `consistant_change` int(11) NULL,
`main_author_same` int(11) NULL, `main_author_same` int(11) NULL,
`author_exp_sum` int(11) NULL, `author_exp_sum` int(11) NULL,
PRIMARY KEY (`id`), PRIMARY KEY (`id`),
INDEX(`function_id_1`) USING BTREE, INDEX(`function_id_1`) USING BTREE,
INDEX(`function_id_2`) USING BTREE, INDEX(`function_id_2`) USING BTREE,
INDEX(`similarity`) USING BTREE, INDEX(`similarity`) USING BTREE,
INDEX(`degree_diff`) USING BTREE, INDEX(`degree_diff`) USING BTREE,
INDEX(`is_test`) USING BTREE, INDEX(`is_test`) USING BTREE,
INDEX(`file_distance`) USING BTREE, INDEX(`file_distance`) USING BTREE,
INDEX(`method_name_same`) USING BTREE, INDEX(`method_name_same`) USING BTREE,
INDEX(`history_change_sum`) USING BTREE, INDEX(`history_change_sum`) USING BTREE,
INDEX(`co_change`) USING BTREE, INDEX(`co_change`) USING BTREE,
INDEX(`consistant_change`) USING BTREE, INDEX(`consistant_change`) USING BTREE,
INDEX(`main_author_same`) USING BTREE, INDEX(`main_author_same`) USING BTREE,
INDEX(`author_exp_sum`) USING BTREE INDEX(`author_exp_sum`) USING BTREE
) )
""".format( """.format(
tablename=factors tablename=factors
) )
mysqlOp.cursor.execute(sql_result) mysqlOp.cursor.execute(sql_result)
@ -597,6 +598,32 @@ if __name__ == "__main__":
index=False, index=False,
if_exists="append", if_exists="append",
) )
q.task_done()
print( print(
"finish extract factors in repo: {git_url}".format(git_url=repoInfo.git_url) "finish extract factors in repo: {git_url}".format(git_url=repoInfo.git_url)
) )
if __name__ == "__main__":
    # Entry point: queue every configured repository, then process the queue
    # with a set of worker threads that each execute run().
    config_path = GlobalConstants.CONFIG_PATH
    repoInfos: List[RepoInfo] = FileOperator("factor_repos").load_repos()
    mysqlOp = MySQLOperator(config_path=config_path)
    workQueue = queue.Queue()
    for repoInfo in repoInfos:
        # Look up the repository id and store it on the RepoInfo before
        # handing the repository to the workers.
        repoInfo.id = mysqlOp.get_repo_id_by_names(repoInfo=repoInfo)
        workQueue.put(repoInfo)

    # The number of worker threads comes from RCD.thread_num in the config.
    settings = ConfigOperator(config_path=config_path).read_config()
    thread_num = int(settings["RCD"]["thread_num"])

    threads = []
    for _ in range(thread_num):
        worker = threading.Thread(target=run, args=(workQueue,))
        worker.start()
        threads.append(worker)

    # Block until every worker has returned before reporting completion.
    for worker in threads:
        worker.join()
    print("Finish")