将loadBlob和loadProject中的函数转移到了utils文件中,并增加了另外两个通用函数用来抽取commit和blob的对应关系,增加了一个对象类用来存储Blob和Commit的对应关系
This commit is contained in:
parent
e7a5bb83b3
commit
8ca21b61f1
|
@ -1,4 +1,4 @@
|
|||
[settings]
|
||||
line_length = 79
|
||||
multi_line_output = 3
|
||||
known_third_party =dulwich
|
||||
known_third_party =dulwich,loadProjects
|
||||
|
|
54
loadBlob.py
54
loadBlob.py
|
@ -1,54 +0,0 @@
|
|||
import os
|
||||
|
||||
from dulwich.repo import Repo
|
||||
|
||||
|
||||
def loadBlobObjects(dir):
    """
    function: Find all blob objects in a given repo and make a list
    input: the directory of a repo
    output: a list holding the raw content (``.data``) of every blob object
    """
    repo = Repo(dir)
    try:
        store = repo.object_store
        bloblist = []
        # Snapshot the object ids first, then fetch each object exactly once.
        for sha in list(store):
            obj = store[sha]
            # Select blobs by type name, the same check find_blobs_in_tree
            # uses, instead of the magic type number 3.
            if obj.type_name == b"blob":
                bloblist.append(obj.data)
        return bloblist
    finally:
        # Release the repository even when reading an object fails.
        repo.close()
|
||||
|
||||
|
||||
def writeSourceCode(dir, code):
    """
    function: Write source code to a new file
    input: the path of the file to create, the code as bytes
    output: a file with source code (an existing file is overwritten)
    """
    # The context manager closes the handle even if the write fails.
    with open(dir, "wb") as target:
        target.write(code)
|
||||
|
||||
|
||||
def writeSourceFiles(dir, list):
    """
    function: Write all source code to a new folder
    input: the parent directory, the list of code contents as bytes
    output: a folder "sourceFiles" under the parent directory containing
            one file per list item, named 0.java, 1.java, ...
    """
    folderpath = os.path.join(dir, "sourceFiles")
    # exist_ok lets the function be re-run without failing on the folder
    # created by a previous run.
    os.makedirs(folderpath, exist_ok=True)
    for index, code in enumerate(list):
        filepath = os.path.join(folderpath, str(index) + ".java")
        writeSourceCode(filepath, code)
|
||||
|
||||
|
||||
# a test for local repo
|
||||
# list = loadBlobObjects("C:/Users/Administrator/redis")
|
||||
# print(list[0])
|
||||
# writeSourceCode("C:/Users/Administrator/redis/1.txt", list[0])
|
||||
# writeSourceFiles("C:/Users/Administrator/redis", list)
|
|
@ -1,13 +0,0 @@
|
|||
import sys
|
||||
|
||||
|
||||
def loadProjectList(dir):
    """
    Function: load project list
    input: a txt file with one address per line
    output: a list with every line of the input file (line breaks removed)
    """
    # The context manager closes the file even if reading fails.
    with open(dir, "r") as source:
        return source.read().splitlines()
|
|
@ -0,0 +1,20 @@
|
|||
from dulwich.objects import Blob, Commit
|
||||
from dulwich.repo import Repo
|
||||
|
||||
|
||||
class BlobInfo(object):
    """Record that ties one blob to its repository, commit and file path."""

    def __init__(
        self,
        repo: Repo = None,
        commit: Commit = None,
        filepath: str = None,
        blob: Blob = None,
    ) -> None:
        # Repository the blob was read from.
        self.repo = repo
        # Commit whose tree contains the blob.
        self.commit = commit
        # Path of the blob relative to the commit's root tree.
        # NOTE(review): find_blobs_in_tree passes a bytes path here although
        # the annotation says str -- confirm which one is intended.
        self.filepath = filepath
        # The blob object itself.
        self.blob = blob

    def __repr__(self) -> str:
        # Show object ids rather than whole objects so that printing a
        # collection of BlobInfo stays readable; fields left at None are
        # shown as None.
        commit_id = getattr(self.commit, "id", None)
        blob_id = getattr(self.blob, "id", None)
        return (
            f"BlobInfo(filepath={self.filepath!r}, "
            f"commit={commit_id!r}, blob={blob_id!r})"
        )


if __name__ == "__main__":
    BlobInfo()
|
|
@ -0,0 +1,142 @@
|
|||
# aim: this is the utility file for other scripts
|
||||
# author: zhongyan, zhangxunhui
|
||||
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from dulwich.objects import Commit, Tree
|
||||
from dulwich.repo import Repo
|
||||
|
||||
from models.BlobInfo import BlobInfo
|
||||
|
||||
|
||||
def loadBlobObjects(dir):
    """
    function: Find all blob objects in a given repo and make a list
    input: the directory of a repo
    output: a list holding the raw content (``.data``) of every blob object
    """
    repo = Repo(dir)
    try:
        store = repo.object_store
        bloblist = []
        # Snapshot the object ids first, then fetch each object exactly once.
        for sha in list(store):
            obj = store[sha]
            # Select blobs by type name, the same check find_blobs_in_tree
            # uses, instead of the magic type number 3.
            if obj.type_name == b"blob":
                bloblist.append(obj.data)
        return bloblist
    finally:
        # Release the repository even when reading an object fails.
        repo.close()
|
||||
|
||||
|
||||
def writeSourceCode(dir, code):
    """
    function: Write source code to a new file
    input: the path of the file to create, the code as bytes
    output: a file with source code (an existing file is overwritten)
    """
    # The context manager closes the handle even if the write fails.
    with open(dir, "wb") as target:
        target.write(code)
|
||||
|
||||
|
||||
def writeSourceFiles(dir, list):
    """
    function: Write all source code to a new folder
    input: the parent directory, the list of code contents as bytes
    output: a folder "sourceFiles" under the parent directory containing
            one file per list item, named 0.java, 1.java, ...
    """
    folderpath = os.path.join(dir, "sourceFiles")
    # exist_ok lets the function be re-run without failing on the folder
    # created by a previous run.
    os.makedirs(folderpath, exist_ok=True)
    for index, code in enumerate(list):
        filepath = os.path.join(folderpath, str(index) + ".java")
        writeSourceCode(filepath, code)
|
||||
|
||||
|
||||
def loadProjectList(dir):
    """
    Function: load project list
    input: a txt file with one address per line
    output: a list with every line of the input file (line breaks removed)
    """
    # The context manager closes the file even if reading fails.
    with open(dir, "r") as source:
        return source.read().splitlines()
|
||||
|
||||
|
||||
def find_blobs_in_tree(
    repo: Repo, commit: Commit, tree: Tree, relpath: bytes = b""
) -> List[BlobInfo]:
    """
    Function: find all the blobs in a Tree directory (recursively)
    input:
        - repo: Repo object, the repository that the Tree belongs to.
        - commit: Commit object, the commit that the Tree belongs to.
        - tree: Tree object, which is the target directory.
        - relpath: the relative path (bytes) of the Tree object.
    return:
        - A list of BlobInfo objects, one per blob found under the tree.
          Submodule entries are skipped.
    """
    import posixpath
    import stat

    # Tree-entry mode git uses for submodules ("gitlinks").
    gitlink_mode = 0o160000

    result = []
    for entry in tree.items():
        # A gitlink points at a commit of another repository; that object
        # is normally not in this object store, so looking it up would
        # raise KeyError.
        if stat.S_IFMT(entry.mode) == gitlink_mode:
            continue
        obj = repo.object_store[entry.sha]
        # Paths inside git trees always use "/", whatever the host OS is,
        # so join with posixpath instead of os.path.
        new_relpath = posixpath.join(relpath, entry.path)
        if obj.type_name == b"blob":
            result.append(
                BlobInfo(
                    repo=repo, commit=commit, filepath=new_relpath, blob=obj
                )
            )
        elif obj.type_name == b"tree":
            # Descend into the sub-directory.
            result.extend(
                find_blobs_in_tree(
                    repo=repo, commit=commit, tree=obj, relpath=new_relpath
                )
            )
    return result
|
||||
|
||||
|
||||
def extract_commit_blob_relation(repo_path: str) -> dict:
    """
    Function: extract the commit and blob relationship from a repository
    input:
        - repo_path: the string path of a git repository
    return:
        - A dict: {
            key: Commit object
            value: [BlobInfo object] for every blob in that commit's tree
        }
    """
    # NOTE(review): the repository is not closed here; every BlobInfo in the
    # result keeps a reference to it. Confirm who is expected to close it.
    repo = Repo(repo_path)
    store = repo.object_store

    # First pass: collect every commit object found in the object store.
    commits: List[Commit] = []
    for object_sha in list(store):
        obj = store[object_sha]
        if obj.type_name == b"commit":
            commits.append(obj)

    # Second pass: walk the root tree of each commit.
    return {
        commit: find_blobs_in_tree(
            repo=repo, commit=commit, tree=store[commit.tree]
        )
        for commit in commits
    }
|
||||
|
||||
|
||||
"""
|
||||
test extract_commit_blob_relation function
|
||||
1. firstly clone a repository to the root directory of this project, e.g., https://gitlink.org.cn/MillerEvan/bad_clone_prediction.git
|
||||
2. invoke function extract_commit_blob_relation(repo_path="bad_clone_prediction")
|
||||
"""
|
||||
result = extract_commit_blob_relation(repo_path="bad_clone_prediction")
|
||||
print(result)
|
||||
|
||||
|
||||
# a test for local repo
|
||||
# list = loadBlobObjects("C:/Users/Administrator/redis")
|
||||
# print(list[0])
|
||||
# writeSourceCode("C:/Users/Administrator/redis/1.txt", list[0])
|
||||
# writeSourceFiles("C:/Users/Administrator/redis", list)
|
||||
|
||||
# print(loadProjectList("testForLoadProjects.txt"))
|
Loading…
Reference in New Issue