cscw_2021_sponsor/collect_data_questionnaire_...

130 lines
6.4 KiB
Python

# aim: find users(maintainer, sponsor, others)
# author: zxh
# date: 2021-06-02
import pymysql, yaml, math, datetime, json
from utils import *
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import threading, queue # 多线程搜集数据
from ghapi.all import GhApi
from pymongo import MongoClient
f = open('config.yaml', 'r')
config = yaml.load(f.read(), Loader=yaml.BaseLoader)
conn = connectMysqlDB(config, autocommit = False)
cur = conn.cursor(pymysql.cursors.DictCursor)
import random
random.seed( 10 )
# find maintainers
cur.execute("select gsl.login, mdre.name, mdre.email from github_sponsor_listing gsl, middle_data_recent_emails mdre where gsl.deleted=0 and gsl.login=mdre.login and mdre.deleted=0 and mdre.name is not null and mdre.name != '' and mdre.email not like '%noreply%'")
items = cur.fetchall()
query_dict = {}
all_sponsor_account_users = []
for item in items:
query_dict[item['login']] = {
"name": item['name'],
"email": item['email']
}
all_sponsor_account_users.append(item['login'])
all_sponsor_account_users = list(set(all_sponsor_account_users))
# read first round user logins
cur.execute("select login from middle_data_questionnaire_users_firstround where type='maintainer' and which_round=1")
first_round = cur.fetchall()
first_round = [u['login'] for u in first_round]
# 2. the rest 80% for second round
second_round = list(set(all_sponsor_account_users) - set(first_round))
for user in first_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "maintainer", 1, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "maintainer", 1))
for user in second_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "maintainer", 2, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "maintainer", 2))
conn.commit()
# find sponsors
cur.execute("select gsam.sponsor_login, mdre.name, mdre.email from github_sponsorships_as_maintainer gsam, middle_data_recent_emails mdre where gsam.flag=0 and gsam.sponsor_login=mdre.login and mdre.deleted=0 and mdre.name is not null and mdre.name != '' and mdre.email not like '%noreply%'")
items = cur.fetchall()
query_dict = {}
all_sponsors = []
for item in items:
query_dict[item['sponsor_login']] = {
"name": item['name'],
"email": item['email']
}
all_sponsors.append(item['sponsor_login'])
all_sponsors = list(set(all_sponsors) - set(all_sponsor_account_users))
# read first round user logins
cur.execute("select login from middle_data_questionnaire_users_firstround where type='sponsors' and which_round=1")
first_round = cur.fetchall()
first_round = [u['login'] for u in first_round]
# 2. the rest
second_round = list(set(all_sponsors) - set(first_round))
for user in first_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "sponsors", 1, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "sponsors", 1))
for user in second_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "sponsors", 2, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "sponsors", 2))
conn.commit()
# find others
cur.execute("select md.login, mdre.name, mdre.email from middle_data_no_sponsor_account_users md, middle_data_recent_emails mdre, middle_data_recent_stars mdrs where md.login=mdre.login and mdre.login=mdrs.login and mdre.deleted=0 and mdre.name is not null and mdre.name != '' and mdre.email not like '%noreply%' and mdrs.total_own_repo_stars>=10 and mdrs.deleted=0")
items = cur.fetchall()
query_dict = {}
all_others = []
for item in items:
query_dict[item['login']] = {
"name": item['name'],
"email": item['email']
}
all_others.append(item['login'])
all_others = list(set(all_others) - set(all_sponsor_account_users) - set(all_sponsors))
# read first round user logins
cur.execute("select login from middle_data_questionnaire_users_firstround where type='others' and which_round=1")
first_round = cur.fetchall()
first_round = [u['login'] for u in first_round]
# 2. random 7500
second_round = random.sample(list(set(all_others) - set(first_round)), 7500)
# 3. random 7500 for the third time
third_round = random.sample(list((set(all_others) - set(first_round)) - set(second_round)), 7500)
for user in first_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "others", 1, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "others", 1))
for user in second_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "others", 2, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "others", 2))
for user in third_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "others", 3, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "others", 3))
conn.commit()