cscw_2021_sponsor/collect_data_questionnaire_...

120 lines
5.6 KiB
Python
Raw Permalink Normal View History

2021-06-03 07:55:52 +08:00
# aim: find users(maintainer, sponsor, others)
# author: zxh
# date: 2021-06-02
import pymysql, yaml, math, datetime, json
from utils import *
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import threading, queue # 多线程搜集数据
from ghapi.all import GhApi
from pymongo import MongoClient
f = open('config.yaml', 'r')
config = yaml.load(f.read(), Loader=yaml.BaseLoader)
conn = connectMysqlDB(config, autocommit = False)
cur = conn.cursor(pymysql.cursors.DictCursor)
import random
random.seed( 10 )
# find maintainers
2021-06-07 22:27:36 +08:00
cur.execute("select gsl.login, mdre.name, mdre.email from github_sponsor_listing gsl, middle_data_recent_emails mdre where gsl.deleted=0 and gsl.login=mdre.login and mdre.deleted=0 and mdre.name is not null and mdre.name != '' and mdre.email not like '%noreply%'")
2021-06-03 07:55:52 +08:00
items = cur.fetchall()
query_dict = {}
all_sponsor_account_users = []
for item in items:
query_dict[item['login']] = {
"name": item['name'],
"email": item['email']
}
all_sponsor_account_users.append(item['login'])
2021-06-07 22:27:36 +08:00
all_sponsor_account_users = list(set(all_sponsor_account_users))
2021-06-03 07:55:52 +08:00
2021-06-07 22:27:36 +08:00
all_sponsor_account_users = random.sample(all_sponsor_account_users, 2000) # firstly random 2000
2021-06-03 07:55:52 +08:00
# 1. random 20% for first round
first_round = random.sample(all_sponsor_account_users, int(len(all_sponsor_account_users) * 0.2))
# 2. the rest 80% for second round
second_round = list(set(all_sponsor_account_users) - set(first_round))
for user in first_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "maintainer", 1, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "maintainer", 1))
for user in second_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "maintainer", 2, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "maintainer", 2))
conn.commit()
# find sponsors
2021-06-07 22:27:36 +08:00
cur.execute("select gsam.sponsor_login, mdre.name, mdre.email from github_sponsorships_as_maintainer gsam, middle_data_recent_emails mdre where gsam.flag=0 and gsam.sponsor_login=mdre.login and mdre.deleted=0 and mdre.name is not null and mdre.name != '' and mdre.email not like '%noreply%'")
2021-06-03 07:55:52 +08:00
items = cur.fetchall()
query_dict = {}
all_sponsors = []
for item in items:
query_dict[item['sponsor_login']] = {
"name": item['name'],
"email": item['email']
}
all_sponsors.append(item['sponsor_login'])
all_sponsors = list(set(all_sponsors) - set(all_sponsor_account_users))
# 1. random 20% for first round
2021-06-07 22:27:36 +08:00
all_sponsors = random.sample(all_sponsors, 2000)
2021-06-03 07:55:52 +08:00
first_round = random.sample(all_sponsors, int(len(all_sponsors) * 0.2))
# 2. the rest 80% for second round
second_round = list(set(all_sponsors) - set(first_round))
for user in first_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "sponsors", 1, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "sponsors", 1))
for user in second_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "sponsors", 2, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "sponsors", 2))
conn.commit()
# find others
2021-06-07 22:27:36 +08:00
cur.execute("select md.login, mdre.name, mdre.email from middle_data_no_sponsor_account_users md, middle_data_recent_emails mdre where md.login=mdre.login and mdre.deleted=0 and mdre.name is not null and mdre.name != '' and mdre.email not like '%noreply%'")
2021-06-03 07:55:52 +08:00
items = cur.fetchall()
query_dict = {}
all_others = []
for item in items:
query_dict[item['login']] = {
"name": item['name'],
"email": item['email']
}
all_others.append(item['login'])
all_others = list(set(all_others) - set(all_sponsor_account_users) - set(all_sponsors))
2021-06-07 22:27:36 +08:00
# first random 2000 users
all_others = random.sample(all_others, 2000)
2021-06-03 07:55:52 +08:00
# 1. random 20% for first round
first_round = random.sample(all_others, int(len(all_others) * 0.2))
# 2. the rest 80% for second round
second_round = list(set(all_others) - set(first_round))
for user in first_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "others", 1, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "others", 1))
for user in second_round:
if user in query_dict:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "others", 2, query_dict[user]['name'], query_dict[user]['email']))
else:
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "others", 2))
conn.commit()