From 8ca21b61f1f14c5669ae26d9c96b3b8b171e7542 Mon Sep 17 00:00:00 2001 From: zhangxunhui Date: Tue, 2 Aug 2022 19:19:46 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B0=86loadBlob=E5=92=8CloadProject=E4=B8=AD?= =?UTF-8?q?=E7=9A=84=E5=87=BD=E6=95=B0=E8=BD=AC=E7=A7=BB=E5=88=B0=E4=BA=86?= =?UTF-8?q?utils=E6=96=87=E4=BB=B6=E4=B8=AD=EF=BC=8C=E5=B9=B6=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E4=BA=86=E5=8F=A6=E5=A4=96=E4=B8=A4=E4=B8=AA=E9=80=9A?= =?UTF-8?q?=E7=94=A8=E5=87=BD=E6=95=B0=E7=94=A8=E6=9D=A5=E6=8A=BD=E5=8F=96?= =?UTF-8?q?commit=E5=92=8Cblob=E7=9A=84=E5=AF=B9=E5=BA=94=E5=85=B3?= =?UTF-8?q?=E7=B3=BB=EF=BC=8C=E5=A2=9E=E5=8A=A0=E4=BA=86=E4=B8=80=E4=B8=AA?= =?UTF-8?q?=E5=AF=B9=E8=B1=A1=E7=B1=BB=E7=94=A8=E6=9D=A5=E5=AD=98=E5=82=A8?= =?UTF-8?q?Blob=E5=92=8CCommit=E7=9A=84=E5=AF=B9=E5=BA=94=E5=85=B3?= =?UTF-8?q?=E7=B3=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .isort.cfg | 2 +- loadBlob.py | 54 ----------------- loadProjects.py | 13 ----- models/BlobInfo.py | 20 +++++++ utils.py | 142 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 163 insertions(+), 68 deletions(-) delete mode 100644 loadBlob.py delete mode 100644 loadProjects.py create mode 100644 models/BlobInfo.py create mode 100644 utils.py diff --git a/.isort.cfg b/.isort.cfg index 84fff14..9b916c3 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,4 +1,4 @@ [settings] line_length = 79 multi_line_output = 3 -known_third_party =dulwich +known_third_party =dulwich,loadProjects diff --git a/loadBlob.py b/loadBlob.py deleted file mode 100644 index 80e2b3e..0000000 --- a/loadBlob.py +++ /dev/null @@ -1,54 +0,0 @@ -import os - -from dulwich.repo import Repo - - -def loadBlobObjects(dir): - """ - function: Find all blob objects in a given repo and make a list - input: the directory of a repo - output: a list of blob object - """ - repo = Repo(dir) - objectlist = list(repo.object_store) - bloblist = [] - for object in objectlist: - if ( - repo.object_store[object].type == 3 - ): # type == 1 -> commit; type == 2 -> tree ;type == 3 -> blob. - bloblist.append(repo.object_store[object].data) - repo.close() - return bloblist - - -def writeSourceCode(dir, code): - """ - function: Write source code to a new txt file - input: the directory of a new text file, the code in type byte - output: a file with source code - """ - file = open(dir, "wb") - file.write(code) - file.close() - - -def writeSourceFiles(dir, list): - """ - function: Write all source code to a new folder - input: the directory of a new folder, the code list in type byte - output: a folder with source files - """ - folderpath = os.path.join(dir, "sourceFiles") - os.mkdir(folderpath) - i = 0 - for code in list: - filepath = os.path.join(folderpath, str(i) + ".java") - writeSourceCode(filepath, code) - i = i + 1 - - -# a test for local repo -# list = loadBlobObjects("C:/Users/Administrator/redis") -# print(list[0]) -# writeSourceCode("C:/Users/Administrator/redis/1.txt", list[0]) -# writeSourceFiles("C:/Users/Administrator/redis", list) diff --git a/loadProjects.py b/loadProjects.py deleted file mode 100644 index 8c94986..0000000 --- a/loadProjects.py +++ /dev/null @@ -1,13 +0,0 @@ -import sys - - -def loadProjectList(dir): - """ - Fuction:load project list - input:a txt file including all the address - output: a list including all the address of files in the input file - """ - file = open(dir, "r") - list = file.read().splitlines() - file.close() - return list diff --git a/models/BlobInfo.py b/models/BlobInfo.py new file mode 100644 index 0000000..60ea5a3 --- /dev/null +++ b/models/BlobInfo.py @@ -0,0 +1,20 @@ +from dulwich.objects import Blob, Commit +from dulwich.repo import Repo + + +class BlobInfo(object): + def __init__( + self, + repo: Repo = None, + commit: Commit = None, + filepath: str = None, + blob: Blob = None, + ) -> None: + self.repo = repo + self.commit = commit + self.filepath = filepath + self.blob = blob + + +if __name__ == "__main__": + BlobInfo() diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..fa7337c --- /dev/null +++ b/utils.py @@ -0,0 +1,142 @@ +# aim: this is the utility file for other scripts +# author: zhongyan, zhangxunhui + +import os +from typing import List + +from dulwich.objects import Commit, Tree +from dulwich.repo import Repo + +from models.BlobInfo import BlobInfo + + +def loadBlobObjects(dir): + """ + function: Find all blob objects in a given repo and make a list + input: the directory of a repo + output: a list of blob object + """ + repo = Repo(dir) + objectlist = list(repo.object_store) + bloblist = [] + for object in objectlist: + if ( + repo.object_store[object].type == 3 + ): # type == 1 -> commit; type == 2 -> tree ;type == 3 -> blob. + bloblist.append(repo.object_store[object].data) + repo.close() + return bloblist + + +def writeSourceCode(dir, code): + """ + function: Write source code to a new txt file + input: the directory of a new text file, the code in type byte + output: a file with source code + """ + file = open(dir, "wb") + file.write(code) + file.close() + + +def writeSourceFiles(dir, list): + """ + function: Write all source code to a new folder + input: the directory of a new folder, the code list in type byte + output: a folder with source files + """ + folderpath = os.path.join(dir, "sourceFiles") + os.mkdir(folderpath) + i = 0 + for code in list: + filepath = os.path.join(folderpath, str(i) + ".java") + writeSourceCode(filepath, code) + i = i + 1 + + +def loadProjectList(dir): + """ + Fuction:load project list + input:a txt file including all the address + output: a list including all the address of files in the input file + """ + file = open(dir, "r") + list = file.read().splitlines() + file.close() + return list + + +def find_blobs_in_tree( + repo: Repo, commit: Commit, tree: Tree, relpath: bytes = b"" +) -> List[BlobInfo]: + """ + Function: find all the blobs in a Tree directory + input: + - repo: Repo object, which indicates the repository that the Tree belongs to. + - commit: Commit object, which indicates the commit that the Tree belongs to. + - tree: Tree object, which is the target directory. + - relpath: the relative path for the Tree object. + return: + - A list of BlobInfo objects + """ + result = [] + for entry in tree.items(): + obj = repo.object_store[entry.sha] + new_relpath = os.path.join(relpath, entry.path) + if obj.type_name == b"blob": + result.append( + BlobInfo(repo=repo, commit=commit, filepath=new_relpath, blob=obj) + ) + else: + new_tree = obj + result.extend( + find_blobs_in_tree( + repo=repo, commit=commit, tree=new_tree, relpath=new_relpath + ) + ) + return result + + +def extract_commit_blob_relation(repo_path: str) -> List[dict]: + """ + Function: extract the commit and blob relationship from a repository + input: + - repo_path: the string path of a git repository + return: + - A list of dict: { + key: Commit object + value: [BlobInfo object] + } + """ + result = {} + commits: Commit = [] + repo = Repo(repo_path) + object_shas = list(repo.object_store) + for object_sha in object_shas: + obj = repo.object_store[object_sha] + if obj.type_name == b"commit": + commits.append(obj) + for commit in commits: + blobs = find_blobs_in_tree( + repo=repo, commit=commit, tree=repo.object_store[commit.tree] + ) + result[commit] = blobs + return result + + +""" +test extract_commit_blob_relation function +1. firstly clone a repository to the root directory of this project, e.g., https://gitlink.org.cn/MillerEvan/bad_clone_prediction.git +2. invoke function extract_commit_blob_relation(repo_path="bad_clone_prediction") +""" +result = extract_commit_blob_relation(repo_path="bad_clone_prediction") +print(result) + + +# a test for local repo +# list = loadBlobObjects("C:/Users/Administrator/redis") +# print(list[0]) +# writeSourceCode("C:/Users/Administrator/redis/1.txt", list[0]) +# writeSourceFiles("C:/Users/Administrator/redis", list) + +# print(loadProjectList("testForLoadProjects.txt"))