将loadBlob和loadProject中的函数转移到了utils文件中,并增加了另外两个通用函数用来抽取commit和blob的对应关系,增加了一个对象类用来存储Blob和Commit的对应关系

This commit is contained in:
zhangxunhui 2022-08-02 19:19:46 +08:00
parent e7a5bb83b3
commit 8ca21b61f1
5 changed files with 163 additions and 68 deletions

View File

@ -1,4 +1,4 @@
[settings]
line_length = 79
multi_line_output = 3
known_third_party =dulwich
known_third_party =dulwich,loadProjects

View File

@ -1,54 +0,0 @@
import os
from dulwich.repo import Repo
def loadBlobObjects(dir):
    """
    function: Collect the content of every blob object stored in a repo
    input: the directory of a repo
    output: a list holding the data of each blob found in the object store
    """
    repo = Repo(dir)
    try:
        bloblist = []
        # Take a snapshot of the object ids, then load each object only once
        # instead of fetching it again for the append.
        for sha in list(repo.object_store):
            obj = repo.object_store[sha]
            # type_name is b"commit", b"tree", b"blob" or b"tag".
            if obj.type_name == b"blob":
                bloblist.append(obj.data)
        return bloblist
    finally:
        # Release the repository even when reading an object fails.
        repo.close()
def writeSourceCode(dir, code):
    """
    function: Write source code to a file, replacing any existing content
    input: the path of the file to write, the code as bytes
    output: None; the file at the given path contains exactly `code`
    """
    # The context manager closes the handle even if write() raises.
    with open(dir, "wb") as file:
        file.write(code)
def writeSourceFiles(dir, list):
    """
    function: Write every piece of source code to its own file in a new
        "sourceFiles" folder
    input: the directory in which the folder is created, the code list in
        type byte
    output: a folder with files named 0.java, 1.java, ... in list order
    raises: FileExistsError if the "sourceFiles" folder already exists
    """
    folderpath = os.path.join(dir, "sourceFiles")
    os.mkdir(folderpath)
    # enumerate supplies the running file number.
    for index, code in enumerate(list):
        filepath = os.path.join(folderpath, str(index) + ".java")
        writeSourceCode(filepath, code)
# a test for local repo
# list = loadBlobObjects("C:/Users/Administrator/redis")
# print(list[0])
# writeSourceCode("C:/Users/Administrator/redis/1.txt", list[0])
# writeSourceFiles("C:/Users/Administrator/redis", list)

View File

@ -1,13 +0,0 @@
import sys
def loadProjectList(dir):
    """
    Function: load the project list
    input: the path of a text file holding one address per line
    output: a list with every line of the file, line endings removed
    """
    # The context manager closes the handle even if read() raises.
    with open(dir, "r") as file:
        lines = file.read().splitlines()
    return lines

20
models/BlobInfo.py Normal file
View File

@ -0,0 +1,20 @@
from dulwich.objects import Blob, Commit
from dulwich.repo import Repo
class BlobInfo(object):
    """Plain record linking one blob to the commit and path it was found at.

    The constructor stores its arguments unchanged; each one defaults to
    None, so an empty record can be created and filled in later.
    """

    def __init__(
        self,
        repo: Repo = None,
        commit: Commit = None,
        filepath: bytes = None,
        blob: Blob = None,
    ) -> None:
        # repo: repository the blob belongs to
        self.repo = repo
        # commit: commit whose tree contains the blob
        self.commit = commit
        # filepath: path of the blob relative to the root of the commit tree;
        # utils.find_blobs_in_tree passes it as bytes
        self.filepath = filepath
        # blob: the blob object itself
        self.blob = blob


if __name__ == "__main__":
    # Smoke test: the record can be built with all defaults.
    BlobInfo()

142
utils.py Normal file
View File

@ -0,0 +1,142 @@
# aim: this is the utility file for other scripts
# author: zhongyan, zhangxunhui
import os
from typing import List
from dulwich.objects import Commit, Tree
from dulwich.repo import Repo
from models.BlobInfo import BlobInfo
def loadBlobObjects(dir):
    """
    function: Collect the content of every blob object stored in a repo
    input: the directory of a repo
    output: a list holding the data of each blob found in the object store
    """
    repo = Repo(dir)
    try:
        bloblist = []
        # Take a snapshot of the object ids, then load each object only once
        # instead of fetching it again for the append.
        for sha in list(repo.object_store):
            obj = repo.object_store[sha]
            # type_name is b"commit", b"tree", b"blob" or b"tag".
            if obj.type_name == b"blob":
                bloblist.append(obj.data)
        return bloblist
    finally:
        # Release the repository even when reading an object fails.
        repo.close()
def writeSourceCode(dir, code):
    """
    function: Write source code to a file, replacing any existing content
    input: the path of the file to write, the code as bytes
    output: None; the file at the given path contains exactly `code`
    """
    # The context manager closes the handle even if write() raises.
    with open(dir, "wb") as file:
        file.write(code)
def writeSourceFiles(dir, list):
    """
    function: Write every piece of source code to its own file in a new
        "sourceFiles" folder
    input: the directory in which the folder is created, the code list in
        type byte
    output: a folder with files named 0.java, 1.java, ... in list order
    raises: FileExistsError if the "sourceFiles" folder already exists
    """
    folderpath = os.path.join(dir, "sourceFiles")
    os.mkdir(folderpath)
    # enumerate supplies the running file number.
    for index, code in enumerate(list):
        filepath = os.path.join(folderpath, str(index) + ".java")
        writeSourceCode(filepath, code)
def loadProjectList(dir):
    """
    Function: load the project list
    input: the path of a text file holding one address per line
    output: a list with every line of the file, line endings removed
    """
    # The context manager closes the handle even if read() raises.
    with open(dir, "r") as file:
        lines = file.read().splitlines()
    return lines
def find_blobs_in_tree(
    repo: Repo, commit: Commit, tree: Tree, relpath: bytes = b""
) -> List[BlobInfo]:
    """
    Function: recursively find all the blobs in a Tree directory
    input:
        - repo: Repo object, which indicates the repository that the Tree
          belongs to.
        - commit: Commit object, which indicates the commit that the Tree
          belongs to.
        - tree: Tree object, which is the target directory.
        - relpath: the relative path (bytes) for the Tree object.
    return:
        - A list of BlobInfo objects, one per blob found in the tree and its
          sub-trees; each filepath is relpath joined with the entry path
          using "/".
    """
    # Imported locally so the function does not rely on new file-level imports.
    import posixpath
    import stat

    # File-type bits git uses for submodule ("gitlink") tree entries.
    gitlink_mode = 0o160000

    result = []
    for entry in tree.items():
        if stat.S_IFMT(entry.mode) == gitlink_mode:
            # A submodule entry names a commit of another repository, which
            # is normally absent from this object store; it holds no blobs
            # of this repository, so skip it instead of failing the lookup.
            continue
        obj = repo.object_store[entry.sha]
        # Join with "/" on every platform (os.path.join would use "\\" on
        # Windows); presumably git tree paths are "/"-separated.
        new_relpath = posixpath.join(relpath, entry.path)
        if obj.type_name == b"blob":
            result.append(
                BlobInfo(
                    repo=repo, commit=commit, filepath=new_relpath, blob=obj
                )
            )
        else:
            # Anything that is not a blob is treated as a sub-directory.
            result.extend(
                find_blobs_in_tree(
                    repo=repo, commit=commit, tree=obj, relpath=new_relpath
                )
            )
    return result
def extract_commit_blob_relation(repo_path: str) -> dict:
    """
    Function: extract the commit and blob relationship from a repository
    input:
        - repo_path: the string path of a git repository
    return:
        - A dict: {
            key: Commit object
            value: [BlobInfo object], the blobs found in that commit's tree
        }
    """
    result = {}
    # NOTE(review): the repo is left open, as before, because every returned
    # BlobInfo keeps a reference to it - confirm who is expected to close it.
    repo = Repo(repo_path)
    # Snapshot the object ids before loading the objects one by one.
    for object_sha in list(repo.object_store):
        obj = repo.object_store[object_sha]
        if obj.type_name != b"commit":
            continue
        # Walk the commit's root tree to collect every blob it contains.
        result[obj] = find_blobs_in_tree(
            repo=repo, commit=obj, tree=repo.object_store[obj.tree]
        )
    return result
"""
test extract_commit_blob_relation function
1. firstly clone a repository to the root directory of this project, e.g., https://gitlink.org.cn/MillerEvan/bad_clone_prediction.git
2. invoke function extract_commit_blob_relation(repo_path="bad_clone_prediction")
"""
if __name__ == "__main__":
    # Run the manual test only when this file is executed as a script, so
    # that importing utils neither opens a repository nor prints anything.
    result = extract_commit_blob_relation(repo_path="bad_clone_prediction")
    print(result)
# a test for local repo
# list = loadBlobObjects("C:/Users/Administrator/redis")
# print(list[0])
# writeSourceCode("C:/Users/Administrator/redis/1.txt", list[0])
# writeSourceFiles("C:/Users/Administrator/redis", list)
# print(loadProjectList("testForLoadProjects.txt"))