# bad_clone_prediction/BlobCommitRelationExtractor.py
#
# 87 lines
# 3.4 KiB
# Python

from typing import List
from dulwich.objects import Commit
from dulwich.repo import Repo
from ConfigOperator import ConfigOperator
from GitOperator import GitOperator
from models.BlobInfo import BlobInfo
from models.RepoInfo import RepoInfo
from MySQLOperator import MySQLOperator
def extract_blob_commit_relation(
    repoInfo: RepoInfo,
    mysqlOp: MySQLOperator,
    gitOp: GitOperator,
    configOp: ConfigOperator,
):
    """
    Function: fill the blob-commit relation table (and the filepath table)
        for every commit known to the database, then mark this step as
        handled in the steps table. Does nothing if the step is already
        marked as handled.
    params:
        - repoInfo: the RepoInfo object; its bare_repo_path is opened with dulwich
        - mysqlOp: the MySQLOperator object providing the cursor, the
          connection, the table names and the sha -> id dictionaries
        - gitOp: the GitOperator object used to list the blobs of a commit
        - configOp: the ConfigOperator object used to skip files whose
          language is not supported
    """
    step_name = "blob commit relation extraction"
    steps_tablename = mysqlOp.tablename_dict["steps"]

    mysqlOp.cursor.execute(
        "select handled from `{steps_tablename}` where step_name=%s".format(
            steps_tablename=steps_tablename
        ),
        # one-element tuple: DB-API drivers expect a sequence of parameters,
        # and a bare string is only tolerated by some of them
        (step_name,),
    )
    # NOTE(review): assumes the cursor yields dict-like rows and that a row
    # for this step exists in the steps table -- confirm against MySQLOperator
    handled = mysqlOp.cursor.fetchone()["handled"]
    if handled:
        return

    # this step has not been handled: start from empty target tables
    relation_tablename = mysqlOp.tablename_dict["blob_commit_relations"]
    filepath_tablename = mysqlOp.tablename_dict["filepaths"]
    mysqlOp.truncate_table(relation_tablename)
    mysqlOp.truncate_table(filepath_tablename)

    blob_sha_id_dict = mysqlOp.get_blob_sha_id_dict()
    commit_sha_id_dict = mysqlOp.get_commit_sha_id_dict()
    filepath_sha_id_dict = mysqlOp.get_filepath_sha_id_dict()

    # the statements do not change between rows, so build them once
    insert_filepath_sql = (
        "insert into `{filepath_tablename}` (id, sha, filepath) values (%s, %s, %s)".format(
            filepath_tablename=filepath_tablename
        )
    )
    insert_relation_sql = (
        "insert into `{blob_commit_relation_tablename}` (blob_id, commit_id, filepath_id) values (%s, %s, %s)".format(
            blob_commit_relation_tablename=relation_tablename
        )
    )

    # next free filepath id; kept as a running counter so the largest id does
    # not have to be recomputed from the whole dictionary for every new path
    next_filepath_id = max(filepath_sha_id_dict.values(), default=0) + 1

    def get_filepath_id(blobInfo: BlobInfo):
        """
        Function: get the filepath_id of the target blob, registering the
            filepath (in the dictionary and in the filepath table) first if
            it has not been seen yet
        params:
            - blobInfo: the BlobInfo object
        """
        nonlocal next_filepath_id

        known_id = filepath_sha_id_dict.get(blobInfo.filepath_sha)
        if known_id is not None:
            return known_id

        filepath_id = next_filepath_id
        next_filepath_id += 1
        filepath_sha_id_dict[blobInfo.filepath_sha] = filepath_id
        mysqlOp.cursor.execute(
            insert_filepath_sql,
            (filepath_id, blobInfo.filepath_sha, blobInfo.filepath),
        )  # transaction and commit at the end of handling this commit
        return filepath_id

    repo = Repo(repoInfo.bare_repo_path)
    try:
        for commit_sha, commit_id in commit_sha_id_dict.items():
            commit: Commit = repo.object_store[commit_sha]
            blobInfos: List[BlobInfo] = gitOp.extract_blobs_4_commit(commit=commit)
            for blobInfo in blobInfos:
                if not configOp.is_lang_supported(filepath=blobInfo.filepath):
                    continue
                blob_id = blob_sha_id_dict[blobInfo.blob.id]
                filepath_id = get_filepath_id(blobInfo=blobInfo)
                mysqlOp.cursor.execute(
                    insert_relation_sql,
                    (blob_id, commit_id, filepath_id),
                )
            # one database transaction per git commit
            mysqlOp.connection.commit()
    finally:
        # release the files held open by the dulwich repository
        repo.close()

    # update steps table
    mysqlOp.cursor.execute(
        "update `{steps_tablename}` set handled=%s where step_name=%s".format(
            steps_tablename=steps_tablename
        ),
        (1, step_name),
    )
    mysqlOp.connection.commit()