371 lines
17 KiB
Python
371 lines
17 KiB
Python
# collect data for the trend analysis
|
|
# aim: find whether sponsor create or first sponse will influence the commit or discussion number
|
|
# date: 2021-05-31
|
|
# author: zxh
|
|
|
|
# collect before and after time point (6 months)
|
|
|
|
import pymysql, yaml, math, datetime, json
|
|
from utils import *
|
|
import seaborn as sns
|
|
import pandas as pd
|
|
from matplotlib import pyplot as plt
|
|
import numpy as np
|
|
import threading, queue # 多线程搜集数据
|
|
|
|
f = open('config.yaml', 'r')
|
|
config = yaml.load(f.read(), Loader=yaml.BaseLoader)
|
|
conn = connectMysqlDB(config, autocommit = True)
|
|
cur = conn.cursor(pymysql.cursors.DictCursor)
|
|
|
|
# read all the sponsors from database (0 sponsor, >=1 sponsor all considered)
|
|
cur.execute("select login from github_sponsor_listing where deleted=0")
|
|
users = cur.fetchall()
|
|
users = [u["login"] for u in users]
|
|
|
|
cur.execute("select login from middle_data_trend")
|
|
handled_users = cur.fetchall()
|
|
handled_users = [u["login"] for u in handled_users]
|
|
|
|
users = list(set(users) - set(handled_users))
|
|
|
|
unstable_n = 0 # 前后多少天是不稳定时间,趋势分析中没有不稳定时间
|
|
step = 30 # 一个gap多少天
|
|
interval_n = 6 # 前后区间数量
|
|
|
|
|
|
|
|
class myThread(threading.Thread):
|
|
def __init__(self, q):
|
|
threading.Thread.__init__(self)
|
|
self.q = q
|
|
self.conn = connectMysqlDB(config, autocommit = False)
|
|
self.cur = self.conn.cursor(pymysql.cursors.DictCursor)
|
|
|
|
'''
|
|
params: {
|
|
login,
|
|
mid_time_point,
|
|
month_index,
|
|
user_account_createtime,
|
|
mid_time_point_type
|
|
}
|
|
'''
|
|
def collect_factors(self, params):
|
|
|
|
login = params['login']
|
|
mid_time_point = params['mid_time_point']
|
|
month_index = params['month_index']
|
|
user_account_createtime = params['user_account_createtime']
|
|
mid_time_point_type = params['mid_time_point_type']
|
|
|
|
result = {}
|
|
result['login'] = login
|
|
result['month_index'] = month_index
|
|
result['mid_time_point_type'] = mid_time_point_type
|
|
|
|
# 计算起止时间
|
|
if month_index > 0:
|
|
start_at = mid_time_point + datetime.timedelta(days=unstable_n) + datetime.timedelta(days=(month_index-1)*step)
|
|
end_at = start_at + datetime.timedelta(days=step)
|
|
elif month_index < 0:
|
|
start_at = mid_time_point - datetime.timedelta(days=unstable_n) + datetime.timedelta(days=month_index*step)
|
|
end_at = start_at + datetime.timedelta(days=step)
|
|
else:
|
|
start_at = mid_time_point - datetime.timedelta(days=unstable_n)
|
|
end_at = mid_time_point + datetime.timedelta(days=unstable_n)
|
|
result['start_at'] = start_at
|
|
result['end_at'] = end_at
|
|
|
|
# user age: 表示当前这个区间的user age
|
|
result['user_age'] = int((start_at - user_account_createtime).days / 30)
|
|
|
|
# history commit num: 表示这个区间起始时间之前的commit总数量
|
|
self.cur.execute("select sum(contribution_count) as history_commit_num from github_user_commits_per_day where date<%s and login=%s", (start_at,login))
|
|
history_commit_num = self.cur.fetchone()['history_commit_num']
|
|
if history_commit_num is None:
|
|
history_commit_num = 0
|
|
else:
|
|
history_commit_num = int(history_commit_num)
|
|
result['history_commit_num'] = history_commit_num
|
|
|
|
# history pr num: 表示这个区间之前的提出pr的总量
|
|
self.cur.execute("select count(*) as history_pr_num from github_user_pr where created_at<%s and login=%s", (start_at, login))
|
|
history_pr_num = self.cur.fetchone()['history_pr_num']
|
|
result['history_pr_num'] = history_pr_num
|
|
|
|
# history issue num: 区间之前的issue总量
|
|
self.cur.execute("select count(*) as history_issue_num from github_user_issue where created_at<%s and login=%s", (start_at, login))
|
|
history_issue_num = self.cur.fetchone()['history_issue_num']
|
|
result['history_issue_num'] = history_issue_num
|
|
|
|
# history discussion num: 区间之前的issue comment + pr comment + commit comment
|
|
self.cur.execute("select count(*) as history_issue_comment_num from github_issue_comment where created_at<%s and login=%s", (start_at, login))
|
|
history_issue_comment_num = self.cur.fetchone()['history_issue_comment_num']
|
|
self.cur.execute("select count(*) as history_pr_comment_num from github_pr_comment where created_at<%s and login=%s", (start_at, login))
|
|
history_pr_comment_num = self.cur.fetchone()['history_pr_comment_num']
|
|
self.cur.execute("select count(*) as history_commit_comment_num from github_commit_comment where created_at<%s and login=%s", (start_at, login))
|
|
history_commit_comment_num = self.cur.fetchone()['history_commit_comment_num']
|
|
history_discussion_num = history_issue_comment_num + history_pr_comment_num + history_commit_comment_num
|
|
result['history_discussion_num'] = history_discussion_num
|
|
|
|
# history pr review num
|
|
self.cur.execute("select count(*) as history_pr_review_num from github_user_pr_review where created_at<%s and login=%s", (start_at, login))
|
|
history_pr_review_num = self.cur.fetchone()['history_pr_review_num']
|
|
result['history_pr_review_num'] = history_pr_review_num
|
|
|
|
# time
|
|
time = month_index + interval_n
|
|
result['time'] = time
|
|
|
|
# intervention
|
|
if month_index > 0:
|
|
intervention = 1
|
|
elif month_index < 0:
|
|
intervention = 0
|
|
else:
|
|
intervention = None
|
|
result['intervention'] = intervention
|
|
|
|
# time after intervention
|
|
if month_index > 0:
|
|
time_after_intervention = time - interval_n
|
|
elif month_index < 0:
|
|
time_after_intervention = 0
|
|
else:
|
|
time_after_intervention = None
|
|
result['time_after_intervention'] = time_after_intervention
|
|
|
|
# commit num: 区间内数量
|
|
self.cur.execute("select sum(contribution_count) as commit_num from github_user_commits_per_day where date>%s and date<=%s and login=%s", (start_at, end_at, login))
|
|
commit_num = self.cur.fetchone()['commit_num']
|
|
if commit_num is None:
|
|
commit_num = 0
|
|
else:
|
|
commit_num = int(commit_num)
|
|
result['commit_num'] = commit_num
|
|
|
|
# pr num
|
|
self.cur.execute("select count(*) as pr_num from github_user_pr where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
|
|
pr_num = self.cur.fetchone()['pr_num']
|
|
result['pr_num'] = pr_num
|
|
|
|
# issue num
|
|
self.cur.execute("select count(*) as issue_num from github_user_issue where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
|
|
issue_num = self.cur.fetchone()['issue_num']
|
|
result['issue_num'] = issue_num
|
|
|
|
# discussion num
|
|
self.cur.execute("select count(*) as issue_comment_num from github_issue_comment where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
|
|
issue_comment_num = self.cur.fetchone()['issue_comment_num']
|
|
self.cur.execute("select count(*) as pr_comment_num from github_pr_comment where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
|
|
pr_comment_num = self.cur.fetchone()['pr_comment_num']
|
|
self.cur.execute("select count(*) as commit_comment_num from github_commit_comment where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
|
|
commit_comment_num = self.cur.fetchone()['commit_comment_num']
|
|
discussion_num = issue_comment_num + pr_comment_num + commit_comment_num
|
|
result['discussion_num'] = discussion_num
|
|
|
|
# pr review num
|
|
self.cur.execute("select count(*) as pr_review_num from github_user_pr_review where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
|
|
pr_review_num = self.cur.fetchone()['pr_review_num']
|
|
result['pr_review_num'] = pr_review_num
|
|
|
|
# history sponsored num: 区间开始时间之前被赞助的数量
|
|
self.cur.execute("select count(*) as num from github_sponsorships_as_maintainer where login=%s and created_at<%s", (login, start_at))
|
|
history_sponsored_num = self.cur.fetchone()['num']
|
|
result['history_sponsored_num'] = history_sponsored_num
|
|
|
|
# min tier, max tier, avg tier: collection data
|
|
self.cur.execute("select avg(monthly_price_in_dollars) as avg_tier, min(monthly_price_in_dollars) as min_tier, max(monthly_price_in_dollars) as max_tier from github_sponsor_listing_tiers where login=%s", (login,))
|
|
tier = self.cur.fetchone()
|
|
avg_tier = tier['avg_tier']
|
|
min_tier = tier['min_tier']
|
|
max_tier = tier['max_tier']
|
|
|
|
if avg_tier is not None:
|
|
avg_tier = int(avg_tier)
|
|
else:
|
|
avg_tier = None
|
|
if min_tier is not None:
|
|
min_tier = int(min_tier)
|
|
else:
|
|
min_tier = None
|
|
if max_tier is not None:
|
|
max_tier = int(max_tier)
|
|
else:
|
|
max_tier = None
|
|
result['min_tier'] = min_tier
|
|
result['max_tier'] = max_tier
|
|
result['avg_tier'] = avg_tier
|
|
|
|
# has goal: collection data
|
|
self.cur.execute("select has_goal from github_sponsor_listing_desc_goal where login=%s", (login,))
|
|
has_goal = self.cur.fetchone()['has_goal']
|
|
result['has_goal'] = has_goal
|
|
|
|
# other way: collection data
|
|
self.cur.execute("select has_other_ways from github_sponsor_listing_other_ways where login=%s", (login,))
|
|
other_way = self.cur.fetchone()
|
|
if other_way is None:
|
|
pass
|
|
else:
|
|
other_way = other_way['has_other_ways']
|
|
result['other_way'] = other_way
|
|
|
|
# company null: collection data
|
|
self.cur.execute("select bio, company, email, isEmployee, isBountyHunter, isCampusExpert, isDeveloperProgramMember, isGitHubStar, isHireable, isSiteAdmin, location, twitterUsername from github_user_follow where login=%s", (login,))
|
|
user = self.cur.fetchone()
|
|
if user is None:
|
|
result['bio_length'] = None
|
|
result['company_null'] = None
|
|
result['email_null'] = None
|
|
result['isHireable'] = None
|
|
result['location_null'] = None
|
|
else:
|
|
bio = user['bio']
|
|
if bio is None:
|
|
bio_length = 0
|
|
else:
|
|
bio_length = len(bio.split())
|
|
result['bio_length'] = bio_length
|
|
|
|
company = user['company']
|
|
if company is None or len(company.strip()) == 0:
|
|
company_null = True
|
|
else:
|
|
company_null = False
|
|
result['company_null'] = company_null
|
|
|
|
email = user['email']
|
|
if email is None or len(email.strip()) == 0:
|
|
email_null = True
|
|
else:
|
|
email_null = False
|
|
result['email_null'] = email_null
|
|
|
|
isHireable = user['isHireable']
|
|
result['isHireable'] = isHireable
|
|
|
|
location = user['location']
|
|
if location is None or len(location.strip()) == 0:
|
|
location_null = True
|
|
else:
|
|
location_null = False
|
|
result['location_null'] = location_null
|
|
|
|
|
|
# history star num: 区间开始时间之前的历史总star数量
|
|
self.cur.execute("select count(*) as star_num from github_repository_stars where login=%s and created_at<%s and created_at is not null", (login, start_at))
|
|
history_star_num = self.cur.fetchone()
|
|
if history_star_num is None:
|
|
history_star_num = 0
|
|
else:
|
|
history_star_num = history_star_num['star_num']
|
|
result['history_star_num'] = history_star_num
|
|
|
|
# star num: 区间中的star num
|
|
self.cur.execute("select count(*) as star_num from github_repository_stars where login=%s and created_at>=%s and created_at<%s and created_at is not null", (login, start_at, end_at))
|
|
star_num = self.cur.fetchone()
|
|
if star_num is None:
|
|
star_num = 0
|
|
else:
|
|
star_num = star_num['star_num']
|
|
result['star_num'] = star_num
|
|
|
|
# num repo dependents, num package dependents
|
|
self.cur.execute("select top_watch_repo_dependents from github_sponsor_top_watch_repo_dependents where login=%s", (login,))
|
|
item = self.cur.fetchone()
|
|
if item is None:
|
|
num_repo_dependents = 0
|
|
num_package_dependents = 0
|
|
elif item['top_watch_repo_dependents'] == "0":
|
|
num_repo_dependents = 0
|
|
num_package_dependents = 0
|
|
else:
|
|
num_repo_dependents = 0
|
|
num_package_dependents = 0
|
|
wheres = item['top_watch_repo_dependents'].split(",")
|
|
for w in wheres:
|
|
if ("repositor" in w.lower()):
|
|
num_repo_dependents = int("".join(w.strip().split(" ")[0].split(",")))
|
|
elif ("package" in w.lower()):
|
|
num_package_dependents = int("".join(w.strip().split(" ")[0].split(",")))
|
|
result['num_repo_dependents'] = num_repo_dependents
|
|
result['num_package_dependents'] = num_package_dependents
|
|
|
|
return result
|
|
|
|
def insert_dict(self, mydict, insert_table):
|
|
placeholders = ', '.join(['%s']* len(mydict))
|
|
columns = ', '.join(mydict.keys())
|
|
insert_sql = "INSERT INTO %s ( %s ) VALUES ( %s )" % (insert_table, columns, placeholders)
|
|
self.cur.execute(insert_sql, list(mydict.values()))
|
|
|
|
|
|
def run(self):
|
|
while(True):
|
|
try:
|
|
login = self.q.get(timeout=0)
|
|
print("loop how many threads left: %d" % (self.q.qsize()))
|
|
|
|
# 0.1 the data collection time
|
|
collection_time = datetime.datetime.strptime("2021-01-23 11:24:27", "%Y-%m-%d %H:%M:%S")
|
|
# 0.2 sponsor account creation time
|
|
self.cur.execute("select created_at from github_sponsor_listing where login=%s", (login,))
|
|
sponsor_account_createtime = self.cur.fetchone()['created_at']
|
|
# 0.3 user account creation time
|
|
self.cur.execute("select created_at from github_user where login=%s", (login,))
|
|
user_account_createtime = self.cur.fetchone()['created_at']
|
|
# 0.4 first sponsor time
|
|
self.cur.execute("select created_at from github_sponsorships_as_maintainer where login=%s order by created_at asc",(login,))
|
|
first_sponsored_createtime = self.cur.fetchone()
|
|
if first_sponsored_createtime is not None:
|
|
first_sponsored_createtime = first_sponsored_createtime['created_at']
|
|
|
|
# whether user have enough time
|
|
# 1. enough sponsor_create time
|
|
if sponsor_account_createtime is not None and ((sponsor_account_createtime - user_account_createtime).days >= unstable_n + interval_n * step) and ((collection_time - sponsor_account_createtime).days >= unstable_n + interval_n * step):
|
|
for month_index in range(int(-1 * interval_n), interval_n+1, 1):
|
|
result = self.collect_factors(params={
|
|
"login": login,
|
|
"mid_time_point": sponsor_account_createtime,
|
|
"month_index": month_index,
|
|
"user_account_createtime": user_account_createtime,
|
|
"mid_time_point_type": "sponsor_account_create_time"
|
|
})
|
|
|
|
# insert db
|
|
self.insert_dict(result, "middle_data_trend")
|
|
|
|
# 2. enough first_create time
|
|
if first_sponsored_createtime is not None and ((first_sponsored_createtime - user_account_createtime).days >= unstable_n + interval_n * step) and ((collection_time - first_sponsored_createtime).days >= unstable_n + interval_n * step):
|
|
for month_index in range(int(-1 * interval_n), interval_n+1, 1):
|
|
result = self.collect_factors(params={
|
|
"login": login,
|
|
"mid_time_point": first_sponsored_createtime,
|
|
"month_index": month_index,
|
|
"user_account_createtime": user_account_createtime,
|
|
"mid_time_point_type": "first_sponsored_create_time"
|
|
})
|
|
|
|
# insert db
|
|
self.insert_dict(result, "middle_data_trend")
|
|
|
|
self.conn.commit()
|
|
|
|
except queue.Empty:
|
|
return
|
|
self.q.task_done()
|
|
|
|
# gather all the infos for each user
|
|
THREADNUM = 50
|
|
tasks = queue.Queue()
|
|
for user in users:
|
|
tasks.put(user)
|
|
for _ in range(THREADNUM):
|
|
t = myThread(tasks)
|
|
t.start()
|
|
tasks.join()
|
|
|
|
print("finish")
|