2022-08-07 08:33:18 +08:00
import json
2022-08-14 19:14:02 +08:00
import pathlib
2022-08-14 17:02:37 +08:00
import re
2022-08-17 18:40:20 +08:00
from cmath import isnan
2022-08-14 17:02:37 +08:00
from difflib import SequenceMatcher
2022-08-07 08:33:18 +08:00
from typing import List
2022-08-14 17:02:37 +08:00
import numpy as np
import pandas as pd
from dulwich import diff_tree
2022-08-08 09:08:18 +08:00
from dulwich . objects import Commit
2022-08-14 17:02:37 +08:00
from dulwich . repo import Repo
2022-08-08 09:08:18 +08:00
2022-08-07 08:33:18 +08:00
import GlobalConstants
2022-08-14 17:02:37 +08:00
from ConfigOperator import ConfigOperator
from GitOperator import GitOperator
from models . LineRelationInfo import LineRelationInfo
2022-08-07 08:33:18 +08:00
from models . MethodInfo import MethodInfo
from models . RepoInfo import RepoInfo
from MySQLOperator import MySQLOperator
2022-08-14 17:02:37 +08:00
class CommitLineRelationExtractor(object):
    """
    Extract line-number relations between a parent commit and a child commit.

    Every file-level tree change between the two commits is diffed line by
    line. Each resulting row records how one old line maps to one new line
    (or that the line exists on one side only).
    """

    # Column order of the frame returned by parse().
    _RESULT_COLUMNS = [
        " filepath_old ",
        " filepath_new ",
        " line_old ",
        " line_new ",
        " content_old ",
        " content_new ",
        " commit_id_old ",
        " commit_id_new ",
    ]

    # Annotations that name project / dulwich types are quoted so that they
    # are not evaluated when the class body is executed.
    def __init__(
        self,
        repoInfo: "RepoInfo",
        commit: "Commit",
        commit_id: int,
        parent: "Commit",
        parent_id: int,
        tree_changes: "List[diff_tree.TreeChange]",
        configOp: "ConfigOperator",
    ):
        """
        params:
            - repoInfo: repository description; its bare_repo_path is opened with dulwich
            - commit: the child commit
            - commit_id: identifier of the child commit, written to the commit_id_new column
            - parent: the parent commit
            - parent_id: identifier of the parent commit, written to the commit_id_old column
            - tree_changes: iterable of dulwich TreeChange objects between parent and child
            - configOp: provides is_lang_supported(filepath=...) used to filter files
        """
        self.repoInfo = repoInfo
        # Blob contents are read from this repository's object store.
        self.repo = Repo(self.repoInfo.bare_repo_path)
        self.commit = commit
        self.commit_id = commit_id
        self.parent = parent
        self.parent_id = parent_id
        self.tree_changes = tree_changes
        self.configOp = configOp

    @staticmethod
    def _localize_path(path):
        """
        Re-encode a dulwich path (bytes) with the separators of the local OS.

        dulwich always reports paths in Linux style, so they are passed through
        pathlib to obtain the local form. None is returned unchanged.
        """
        if path is None:
            return None
        return str(pathlib.Path(path.decode())).encode()

    def _read_lines(self, sha):
        """Return the lines of the blob stored under sha, or [] when sha is None."""
        if sha is None:
            return []
        return self.repo.object_store[sha].data.splitlines()

    def extract_diff(self, old_content: list, new_content: list):
        """
        Function: diff two contents line by line and record the line relations.
        Unchanged lines yield one row holding both line numbers and no content.
        Removed lines yield a row with only the old line number and old content;
        added lines yield a row with only the new line number and new content.
        A replaced region is emitted as its removed lines followed by its added
        lines, because there is no strict one-to-one relation inside it.
        Line numbers are 1-based; a missing side is np.nan (content: None).
        params:
            - old_content: a list of lines of the old file
            - new_content: a list of lines of the new file
        return:
            - df: pd.DataFrame: columns [line_old, line_new, content_old, content_new]
        raises:
            - Exception: when the matcher reports an opcode tag that is not handled
        """
        line_old_list = []
        line_new_list = []
        content_old_list = []
        content_new_list = []

        matcher = SequenceMatcher(None, old_content, new_content)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == "equal":
                for offset in range(i2 - i1):
                    line_old_list.append(i1 + offset + 1)
                    line_new_list.append(j1 + offset + 1)
                    content_old_list.append(None)
                    content_new_list.append(None)
            elif tag in ("insert", "delete", "replace"):
                # "insert" has an empty old range and "delete" an empty new
                # range, so all three tags share the same two loops.
                for old_line in range(i1 + 1, i2 + 1):
                    line_old_list.append(old_line)
                    line_new_list.append(np.nan)
                    content_old_list.append(old_content[old_line - 1])
                    content_new_list.append(None)
                for new_line in range(j1 + 1, j2 + 1):
                    line_old_list.append(np.nan)
                    line_new_list.append(new_line)
                    content_old_list.append(None)
                    content_new_list.append(new_content[new_line - 1])
            else:
                raise Exception(" Function extract_diff Error: type error! ")

        return pd.DataFrame.from_dict(
            {
                " line_old ": line_old_list,
                " line_new ": line_new_list,
                " content_old ": content_old_list,
                " content_new ": content_new_list,
            }
        )

    def handle_tree_change(self, tree_change: "diff_tree.TreeChange"):
        """
        Function: get the changed relative filepaths and changed lines of a diff_tree.TreeChange object
        A side whose sha is None (the old side of an added file, the new side
        of a deleted file) is treated as empty content, so every change type is
        handled by the same diff.
        return:
            - df: pd.DataFrame: columns [line_old, line_new, content_old, content_new, filepath_old, filepath_new, commit_id_old, commit_id_new]
        """
        line_relation_df = self.extract_diff(
            old_content=self._read_lines(tree_change.old.sha),
            new_content=self._read_lines(tree_change.new.sha),
        )
        line_relation_df[" filepath_old "] = self._localize_path(tree_change.old.path)
        line_relation_df[" filepath_new "] = self._localize_path(tree_change.new.path)
        line_relation_df[" commit_id_old "] = self.parent_id
        line_relation_df[" commit_id_new "] = self.commit_id
        return line_relation_df

    def parse(self) -> pd.DataFrame:
        """
        Function: extract the line relations for every tree change of this commit pair
        A tree change is skipped when its old or its new path is reported as
        unsupported by configOp.is_lang_supported.
        return:
            - pd.DataFrame with the columns filepath_old, filepath_new, line_old,
              line_new, content_old, content_new, commit_id_old, commit_id_new;
              empty when nothing was extracted. Row labels restart for every
              file, so they are not unique across files.
        """
        frames = []
        for tree_change in self.tree_changes:
            paths = (tree_change.new.path, tree_change.old.path)
            if any(
                path is not None
                and not self.configOp.is_lang_supported(
                    filepath=self._localize_path(path)
                )
                for path in paths
            ):
                continue  # the file language is not supported
            frame = self.handle_tree_change(tree_change)
            if not frame.empty:
                frames.append(frame)

        if not frames:
            return pd.DataFrame(columns=self._RESULT_COLUMNS)
        # One concatenation instead of growing a frame inside the loop.
        return pd.concat(frames).reindex(columns=self._RESULT_COLUMNS)
def extract_method_function_relation(
    repoInfo: "RepoInfo", mysqlOp: "MySQLOperator", configOp: "ConfigOperator"
):
    """
    Function: relate the methods of every commit to the methods of its parent
    commits and store the relations in the method_function_relations table.

    Two methods are related when at least one unchanged line belongs to a
    method of the same name on both sides of a parent/child diff. For every
    related pair the added and removed lines of the two methods are stored
    with it.

    The step is skipped when the steps table already marks it as handled.
    Otherwise the relation table is truncated, rebuilt from all commits, the
    step is marked as handled and the connection is committed.

    params:
        - repoInfo: repository description; its bare_repo_path is opened with dulwich
        - mysqlOp: database access (cursor, connection, table names, lookup helpers)
        - configOp: forwarded to CommitLineRelationExtractor for language filtering
    """
    step_name = " method function extraction "
    steps_tablename = mysqlOp.tablename_dict[" steps "]

    # The parameters must be a sequence, hence the one-element tuple.
    mysqlOp.cursor.execute(
        " select handled from ` {steps_tablename} ` where step_name= %s ".format(
            steps_tablename=steps_tablename
        ),
        (step_name,),
    )
    # NOTE(review): assumes the steps table already holds a row for this step
    # and that the cursor returns mappings; a missing row would raise here.
    handled = mysqlOp.cursor.fetchone()[" handled "]
    if handled:
        return

    relation_tablename = mysqlOp.tablename_dict[" method_function_relations "]
    mysqlOp.truncate_table(tablename=relation_tablename)

    # Lookup tables read once for the whole run.
    filepath_id_dict = mysqlOp.get_filepath_id_dict()
    commit_sha_id_dict = mysqlOp.get_commit_sha_id_dict()
    method_id_name_dict = mysqlOp.get_method_id_name_dict()

    repo = Repo(repoInfo.bare_repo_path)

    insert_sql = " insert into ` {method_function_relation_tablename} ` (method_id_1, method_id_2, `change`) values ( %s , %s , %s ) ".format(
        method_function_relation_tablename=relation_tablename
    )

    def get_changes(df):
        """
        Function: get the method_function_changes according to the df
        params:
            - df: the line change dataframe, already holding the method id
              and method name columns of both sides
        return:
            - List[{
                "method_id_1": id of the method in the parent commit
                "method_id_2": id of the method in the child commit
                "change": bytes listing the added lines, then the removed
                          lines, or None when neither method has changed lines
            }]
        """
        # If methods are related they share at least one unmodified line, and
        # that line maps to the same method name on both sides.
        related_rows = df.loc[
            df[" method_name_old "].notnull()
            & df[" method_name_new "].notnull()
            & (df[" method_name_old "] == df[" method_name_new "])
        ]
        result = []
        for (old_key, new_key), _ in related_rows.groupby(
            [" method_id_old ", " method_id_new "]
        ):
            method_id_old = int(old_key)
            method_id_new = int(new_key)
            # Collect the changed contents from the full frame, not only from
            # the related rows: changed lines exist on one side only.
            removed = df.loc[
                (df[" method_id_old "] == method_id_old)
                & df[" content_old "].notnull(),
                " content_old ",
            ]
            added = df.loc[
                (df[" method_id_new "] == method_id_new)
                & df[" content_new "].notnull(),
                " content_new ",
            ]
            change_delete = list(removed)
            change_add = list(added)
            if not change_delete and not change_add:
                change = None
            else:
                change = b" \n ".join(
                    [b" ADD: ", *change_add, b" DELETE: ", *change_delete]
                )
            result.append(
                {
                    " method_id_1 ": method_id_old,
                    " method_id_2 ": method_id_new,
                    " change ": change,
                }
            )
        return result

    def handle_commit(commit_id: int, commit: "Commit"):
        """
        Function: extract the commit line change relations
        params:
            - commit_id: the mysql id of Commit
            - commit: the Commit object
        """
        # Resolve every parent id once. A parent that is unknown to the
        # database has no line -> method mapping, so none of its lines can be
        # attributed to a method and it cannot yield a relation; it is skipped.
        known_parents = []
        for parent_sha in commit.parents:
            parent_id = mysqlOp.get_commit_id_by_sha(sha=parent_sha)
            if parent_id is not None:
                known_parents.append((parent_sha, parent_id))
        if not known_parents:
            return  # no method change

        # commit_id -> filepath id -> line no -> method id, for the commit
        # itself and for its parents.
        commit_filepath_lineno_method_id_dict = {
            related_commit_id: mysqlOp.get_fp_lineno_method_id_dict(
                commit_id=related_commit_id,
            )
            for related_commit_id in [commit_id] + [pid for _, pid in known_parents]
        }

        def lookup_method(related_commit_id, filepath, line):
            """Return (method_id, method_name) of one side of a row, or (nan, None)."""
            if np.isnan(line) or filepath is None:
                return np.nan, None
            # Some filepaths reported by dulwich differ from the filepaths
            # stored in the database (the keys of filepath_id_dict). Example:
            # for git@github.com:apache/iotdb.git the stored path starts with
            # 'iotdb\\' while the path obtained from dulwich omits that prefix.
            # Such a path gets no filepath id and therefore no method.
            if filepath in filepath_id_dict:
                filepath_id = filepath_id_dict[filepath]
            else:
                filepath_id = None
            per_commit = commit_filepath_lineno_method_id_dict.get(related_commit_id)
            if (
                per_commit is None
                or filepath_id not in per_commit
                or line not in per_commit[filepath_id]
            ):
                return np.nan, None  # this line is not related to a method
            method_id = per_commit[filepath_id][line]
            if np.isnan(method_id):
                return method_id, None
            return method_id, method_id_name_dict[method_id]

        def formatrow(row):
            """Return (method_id_old, method_name_old, method_id_new, method_name_new) of a row."""
            method_id_old, method_name_old = lookup_method(
                row[" commit_id_old "], row[" filepath_old "], row[" line_old "]
            )
            method_id_new, method_name_new = lookup_method(
                row[" commit_id_new "], row[" filepath_new "], row[" line_new "]
            )
            return method_id_old, method_name_old, method_id_new, method_name_new

        for parent_sha, parent_id in known_parents:
            parent = repo.object_store[parent_sha]
            tree_changes = diff_tree.tree_changes(
                store=repo.object_store,
                tree1_id=parent.tree,
                tree2_id=repo.object_store[commit.id].tree,
            )
            line_relation_df = CommitLineRelationExtractor(
                repoInfo=repoInfo,
                commit=commit,
                commit_id=commit_id,
                parent=parent,
                parent_id=parent_id,
                tree_changes=tree_changes,
                configOp=configOp,
            ).parse()
            if line_relation_df.shape[0] == 0:
                continue
            line_relation_df[
                [
                    " method_id_old ",
                    " method_name_old ",
                    " method_id_new ",
                    " method_name_new ",
                ]
            ] = line_relation_df.apply(formatrow, axis=1, result_type="expand")
            for change in get_changes(line_relation_df):
                mysqlOp.cursor.execute(
                    insert_sql,
                    (
                        change[" method_id_1 "],
                        change[" method_id_2 "],
                        change[" change "],
                    ),
                )

    for commit_sha, commit_id in commit_sha_id_dict.items():
        handle_commit(commit_id=commit_id, commit=repo.object_store[commit_sha])

    # update steps table
    mysqlOp.cursor.execute(
        " update ` {steps_tablename} ` set handled= %s where step_name= %s ".format(
            steps_tablename=steps_tablename
        ),
        (1, step_name),
    )
    mysqlOp.connection.commit()