cscw_2021_sponsor/collect_data_trend_analysis.py

371 lines
17 KiB
Python
Raw Permalink Normal View History

2021-05-31 16:29:36 +08:00
# collect data for the trend analysis
# aim: find whether sponsor create or first sponse will influence the commit or discussion number
# date: 2021-05-31
# author: zxh
# collect before and after time point (6 months)
import pymysql, yaml, math, datetime, json
from utils import *
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import threading, queue # 多线程搜集数据
f = open('config.yaml', 'r')
config = yaml.load(f.read(), Loader=yaml.BaseLoader)
conn = connectMysqlDB(config, autocommit = True)
cur = conn.cursor(pymysql.cursors.DictCursor)
# read all the sponsors from database (0 sponsor, >=1 sponsor all considered)
cur.execute("select login from github_sponsor_listing where deleted=0")
users = cur.fetchall()
users = [u["login"] for u in users]
cur.execute("select login from middle_data_trend")
handled_users = cur.fetchall()
handled_users = [u["login"] for u in handled_users]
users = list(set(users) - set(handled_users))
unstable_n = 0 # 前后多少天是不稳定时间,趋势分析中没有不稳定时间
step = 30 # 一个gap多少天
interval_n = 6 # 前后区间数量
class myThread(threading.Thread):
def __init__(self, q):
threading.Thread.__init__(self)
self.q = q
self.conn = connectMysqlDB(config, autocommit = False)
self.cur = self.conn.cursor(pymysql.cursors.DictCursor)
'''
params: {
login,
mid_time_point,
month_index,
user_account_createtime,
mid_time_point_type
}
'''
def collect_factors(self, params):
login = params['login']
mid_time_point = params['mid_time_point']
month_index = params['month_index']
user_account_createtime = params['user_account_createtime']
mid_time_point_type = params['mid_time_point_type']
result = {}
result['login'] = login
result['month_index'] = month_index
result['mid_time_point_type'] = mid_time_point_type
# 计算起止时间
if month_index > 0:
start_at = mid_time_point + datetime.timedelta(days=unstable_n) + datetime.timedelta(days=(month_index-1)*step)
end_at = start_at + datetime.timedelta(days=step)
elif month_index < 0:
start_at = mid_time_point - datetime.timedelta(days=unstable_n) + datetime.timedelta(days=month_index*step)
end_at = start_at + datetime.timedelta(days=step)
else:
start_at = mid_time_point - datetime.timedelta(days=unstable_n)
end_at = mid_time_point + datetime.timedelta(days=unstable_n)
result['start_at'] = start_at
result['end_at'] = end_at
# user age: 表示当前这个区间的user age
result['user_age'] = int((start_at - user_account_createtime).days / 30)
# history commit num: 表示这个区间起始时间之前的commit总数量
self.cur.execute("select sum(contribution_count) as history_commit_num from github_user_commits_per_day where date<%s and login=%s", (start_at,login))
history_commit_num = self.cur.fetchone()['history_commit_num']
if history_commit_num is None:
history_commit_num = 0
else:
history_commit_num = int(history_commit_num)
result['history_commit_num'] = history_commit_num
# history pr num: 表示这个区间之前的提出pr的总量
self.cur.execute("select count(*) as history_pr_num from github_user_pr where created_at<%s and login=%s", (start_at, login))
history_pr_num = self.cur.fetchone()['history_pr_num']
result['history_pr_num'] = history_pr_num
# history issue num: 区间之前的issue总量
self.cur.execute("select count(*) as history_issue_num from github_user_issue where created_at<%s and login=%s", (start_at, login))
history_issue_num = self.cur.fetchone()['history_issue_num']
result['history_issue_num'] = history_issue_num
# history discussion num: 区间之前的issue comment + pr comment + commit comment
self.cur.execute("select count(*) as history_issue_comment_num from github_issue_comment where created_at<%s and login=%s", (start_at, login))
history_issue_comment_num = self.cur.fetchone()['history_issue_comment_num']
self.cur.execute("select count(*) as history_pr_comment_num from github_pr_comment where created_at<%s and login=%s", (start_at, login))
history_pr_comment_num = self.cur.fetchone()['history_pr_comment_num']
self.cur.execute("select count(*) as history_commit_comment_num from github_commit_comment where created_at<%s and login=%s", (start_at, login))
history_commit_comment_num = self.cur.fetchone()['history_commit_comment_num']
history_discussion_num = history_issue_comment_num + history_pr_comment_num + history_commit_comment_num
result['history_discussion_num'] = history_discussion_num
# history pr review num
self.cur.execute("select count(*) as history_pr_review_num from github_user_pr_review where created_at<%s and login=%s", (start_at, login))
history_pr_review_num = self.cur.fetchone()['history_pr_review_num']
result['history_pr_review_num'] = history_pr_review_num
# time
time = month_index + interval_n
result['time'] = time
# intervention
if month_index > 0:
intervention = 1
elif month_index < 0:
intervention = 0
else:
intervention = None
result['intervention'] = intervention
# time after intervention
if month_index > 0:
time_after_intervention = time - interval_n
elif month_index < 0:
time_after_intervention = 0
else:
time_after_intervention = None
result['time_after_intervention'] = time_after_intervention
# commit num: 区间内数量
self.cur.execute("select sum(contribution_count) as commit_num from github_user_commits_per_day where date>%s and date<=%s and login=%s", (start_at, end_at, login))
commit_num = self.cur.fetchone()['commit_num']
if commit_num is None:
commit_num = 0
else:
commit_num = int(commit_num)
result['commit_num'] = commit_num
# pr num
self.cur.execute("select count(*) as pr_num from github_user_pr where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
pr_num = self.cur.fetchone()['pr_num']
result['pr_num'] = pr_num
# issue num
self.cur.execute("select count(*) as issue_num from github_user_issue where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
issue_num = self.cur.fetchone()['issue_num']
result['issue_num'] = issue_num
# discussion num
self.cur.execute("select count(*) as issue_comment_num from github_issue_comment where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
issue_comment_num = self.cur.fetchone()['issue_comment_num']
self.cur.execute("select count(*) as pr_comment_num from github_pr_comment where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
pr_comment_num = self.cur.fetchone()['pr_comment_num']
self.cur.execute("select count(*) as commit_comment_num from github_commit_comment where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
commit_comment_num = self.cur.fetchone()['commit_comment_num']
discussion_num = issue_comment_num + pr_comment_num + commit_comment_num
result['discussion_num'] = discussion_num
# pr review num
self.cur.execute("select count(*) as pr_review_num from github_user_pr_review where created_at>%s and created_at<=%s and login=%s", (start_at, end_at, login))
pr_review_num = self.cur.fetchone()['pr_review_num']
result['pr_review_num'] = pr_review_num
# history sponsored num: 区间开始时间之前被赞助的数量
self.cur.execute("select count(*) as num from github_sponsorships_as_maintainer where login=%s and created_at<%s", (login, start_at))
history_sponsored_num = self.cur.fetchone()['num']
result['history_sponsored_num'] = history_sponsored_num
# min tier, max tier, avg tier: collection data
self.cur.execute("select avg(monthly_price_in_dollars) as avg_tier, min(monthly_price_in_dollars) as min_tier, max(monthly_price_in_dollars) as max_tier from github_sponsor_listing_tiers where login=%s", (login,))
tier = self.cur.fetchone()
avg_tier = tier['avg_tier']
min_tier = tier['min_tier']
max_tier = tier['max_tier']
if avg_tier is not None:
avg_tier = int(avg_tier)
else:
avg_tier = None
if min_tier is not None:
min_tier = int(min_tier)
else:
min_tier = None
if max_tier is not None:
max_tier = int(max_tier)
else:
max_tier = None
result['min_tier'] = min_tier
result['max_tier'] = max_tier
result['avg_tier'] = avg_tier
# has goal: collection data
self.cur.execute("select has_goal from github_sponsor_listing_desc_goal where login=%s", (login,))
has_goal = self.cur.fetchone()['has_goal']
result['has_goal'] = has_goal
# other way: collection data
self.cur.execute("select has_other_ways from github_sponsor_listing_other_ways where login=%s", (login,))
other_way = self.cur.fetchone()
if other_way is None:
pass
else:
other_way = other_way['has_other_ways']
result['other_way'] = other_way
# company null: collection data
self.cur.execute("select bio, company, email, isEmployee, isBountyHunter, isCampusExpert, isDeveloperProgramMember, isGitHubStar, isHireable, isSiteAdmin, location, twitterUsername from github_user_follow where login=%s", (login,))
user = self.cur.fetchone()
if user is None:
result['bio_length'] = None
result['company_null'] = None
result['email_null'] = None
result['isHireable'] = None
result['location_null'] = None
else:
bio = user['bio']
if bio is None:
bio_length = 0
else:
bio_length = len(bio.split())
result['bio_length'] = bio_length
company = user['company']
if company is None or len(company.strip()) == 0:
company_null = True
else:
company_null = False
result['company_null'] = company_null
email = user['email']
if email is None or len(email.strip()) == 0:
email_null = True
else:
email_null = False
result['email_null'] = email_null
isHireable = user['isHireable']
result['isHireable'] = isHireable
location = user['location']
if location is None or len(location.strip()) == 0:
location_null = True
else:
location_null = False
result['location_null'] = location_null
# history star num: 区间开始时间之前的历史总star数量
self.cur.execute("select count(*) as star_num from github_repository_stars where login=%s and created_at<%s and created_at is not null", (login, start_at))
history_star_num = self.cur.fetchone()
if history_star_num is None:
history_star_num = 0
else:
history_star_num = history_star_num['star_num']
result['history_star_num'] = history_star_num
# star num: 区间中的star num
self.cur.execute("select count(*) as star_num from github_repository_stars where login=%s and created_at>=%s and created_at<%s and created_at is not null", (login, start_at, end_at))
star_num = self.cur.fetchone()
if star_num is None:
star_num = 0
else:
star_num = star_num['star_num']
result['star_num'] = star_num
# num repo dependents, num package dependents
self.cur.execute("select top_watch_repo_dependents from github_sponsor_top_watch_repo_dependents where login=%s", (login,))
item = self.cur.fetchone()
if item is None:
num_repo_dependents = 0
num_package_dependents = 0
elif item['top_watch_repo_dependents'] == "0":
num_repo_dependents = 0
num_package_dependents = 0
else:
num_repo_dependents = 0
num_package_dependents = 0
wheres = item['top_watch_repo_dependents'].split(",")
for w in wheres:
if ("repositor" in w.lower()):
num_repo_dependents = int("".join(w.strip().split(" ")[0].split(",")))
elif ("package" in w.lower()):
num_package_dependents = int("".join(w.strip().split(" ")[0].split(",")))
result['num_repo_dependents'] = num_repo_dependents
result['num_package_dependents'] = num_package_dependents
return result
def insert_dict(self, mydict, insert_table):
placeholders = ', '.join(['%s']* len(mydict))
columns = ', '.join(mydict.keys())
insert_sql = "INSERT INTO %s ( %s ) VALUES ( %s )" % (insert_table, columns, placeholders)
self.cur.execute(insert_sql, list(mydict.values()))
def run(self):
while(True):
try:
login = self.q.get(timeout=0)
print("loop how many threads left: %d" % (self.q.qsize()))
# 0.1 the data collection time
collection_time = datetime.datetime.strptime("2021-01-23 11:24:27", "%Y-%m-%d %H:%M:%S")
# 0.2 sponsor account creation time
self.cur.execute("select created_at from github_sponsor_listing where login=%s", (login,))
sponsor_account_createtime = self.cur.fetchone()['created_at']
# 0.3 user account creation time
self.cur.execute("select created_at from github_user where login=%s", (login,))
user_account_createtime = self.cur.fetchone()['created_at']
# 0.4 first sponsor time
self.cur.execute("select created_at from github_sponsorships_as_maintainer where login=%s order by created_at asc",(login,))
first_sponsored_createtime = self.cur.fetchone()
if first_sponsored_createtime is not None:
first_sponsored_createtime = first_sponsored_createtime['created_at']
# whether user have enough time
# 1. enough sponsor_create time
if sponsor_account_createtime is not None and ((sponsor_account_createtime - user_account_createtime).days >= unstable_n + interval_n * step) and ((collection_time - sponsor_account_createtime).days >= unstable_n + interval_n * step):
for month_index in range(int(-1 * interval_n), interval_n+1, 1):
result = self.collect_factors(params={
"login": login,
"mid_time_point": sponsor_account_createtime,
"month_index": month_index,
"user_account_createtime": user_account_createtime,
"mid_time_point_type": "sponsor_account_create_time"
})
# insert db
self.insert_dict(result, "middle_data_trend")
# 2. enough first_create time
if first_sponsored_createtime is not None and ((first_sponsored_createtime - user_account_createtime).days >= unstable_n + interval_n * step) and ((collection_time - first_sponsored_createtime).days >= unstable_n + interval_n * step):
for month_index in range(int(-1 * interval_n), interval_n+1, 1):
result = self.collect_factors(params={
"login": login,
"mid_time_point": first_sponsored_createtime,
"month_index": month_index,
"user_account_createtime": user_account_createtime,
"mid_time_point_type": "first_sponsored_create_time"
})
# insert db
self.insert_dict(result, "middle_data_trend")
self.conn.commit()
except queue.Empty:
return
self.q.task_done()
# gather all the infos for each user
THREADNUM = 50
tasks = queue.Queue()
for user in users:
tasks.put(user)
for _ in range(THREADNUM):
t = myThread(tasks)
t.start()
tasks.join()
print("finish")