120 lines
5.6 KiB
Python
120 lines
5.6 KiB
Python
# aim: find users(maintainer, sponsor, others)
|
|
# author: zxh
|
|
# date: 2021-06-02
|
|
|
|
import pymysql, yaml, math, datetime, json
|
|
from utils import *
|
|
import seaborn as sns
|
|
import pandas as pd
|
|
from matplotlib import pyplot as plt
|
|
import numpy as np
|
|
import threading, queue # 多线程搜集数据
|
|
from ghapi.all import GhApi
|
|
from pymongo import MongoClient
|
|
|
|
f = open('config.yaml', 'r')
|
|
config = yaml.load(f.read(), Loader=yaml.BaseLoader)
|
|
conn = connectMysqlDB(config, autocommit = False)
|
|
cur = conn.cursor(pymysql.cursors.DictCursor)
|
|
|
|
import random
|
|
random.seed( 10 )
|
|
|
|
# find maintainers
|
|
cur.execute("select gsl.login, mdre.name, mdre.email from github_sponsor_listing gsl, middle_data_recent_emails mdre where gsl.deleted=0 and gsl.login=mdre.login and mdre.deleted=0 and mdre.name is not null and mdre.name != '' and mdre.email not like '%noreply%'")
|
|
items = cur.fetchall()
|
|
query_dict = {}
|
|
all_sponsor_account_users = []
|
|
for item in items:
|
|
query_dict[item['login']] = {
|
|
"name": item['name'],
|
|
"email": item['email']
|
|
}
|
|
all_sponsor_account_users.append(item['login'])
|
|
all_sponsor_account_users = list(set(all_sponsor_account_users))
|
|
|
|
all_sponsor_account_users = random.sample(all_sponsor_account_users, 2000) # firstly random 2000
|
|
# 1. random 20% for first round
|
|
first_round = random.sample(all_sponsor_account_users, int(len(all_sponsor_account_users) * 0.2))
|
|
# 2. the rest 80% for second round
|
|
second_round = list(set(all_sponsor_account_users) - set(first_round))
|
|
|
|
for user in first_round:
|
|
if user in query_dict:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "maintainer", 1, query_dict[user]['name'], query_dict[user]['email']))
|
|
else:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "maintainer", 1))
|
|
|
|
for user in second_round:
|
|
if user in query_dict:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "maintainer", 2, query_dict[user]['name'], query_dict[user]['email']))
|
|
else:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "maintainer", 2))
|
|
conn.commit()
|
|
|
|
# find sponsors
|
|
cur.execute("select gsam.sponsor_login, mdre.name, mdre.email from github_sponsorships_as_maintainer gsam, middle_data_recent_emails mdre where gsam.flag=0 and gsam.sponsor_login=mdre.login and mdre.deleted=0 and mdre.name is not null and mdre.name != '' and mdre.email not like '%noreply%'")
|
|
items = cur.fetchall()
|
|
query_dict = {}
|
|
all_sponsors = []
|
|
for item in items:
|
|
query_dict[item['sponsor_login']] = {
|
|
"name": item['name'],
|
|
"email": item['email']
|
|
}
|
|
all_sponsors.append(item['sponsor_login'])
|
|
all_sponsors = list(set(all_sponsors) - set(all_sponsor_account_users))
|
|
|
|
# 1. random 20% for first round
|
|
all_sponsors = random.sample(all_sponsors, 2000)
|
|
first_round = random.sample(all_sponsors, int(len(all_sponsors) * 0.2))
|
|
# 2. the rest 80% for second round
|
|
second_round = list(set(all_sponsors) - set(first_round))
|
|
|
|
for user in first_round:
|
|
if user in query_dict:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "sponsors", 1, query_dict[user]['name'], query_dict[user]['email']))
|
|
else:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "sponsors", 1))
|
|
|
|
for user in second_round:
|
|
if user in query_dict:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "sponsors", 2, query_dict[user]['name'], query_dict[user]['email']))
|
|
else:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "sponsors", 2))
|
|
conn.commit()
|
|
|
|
|
|
# find others
|
|
cur.execute("select md.login, mdre.name, mdre.email from middle_data_no_sponsor_account_users md, middle_data_recent_emails mdre where md.login=mdre.login and mdre.deleted=0 and mdre.name is not null and mdre.name != '' and mdre.email not like '%noreply%'")
|
|
items = cur.fetchall()
|
|
query_dict = {}
|
|
all_others = []
|
|
for item in items:
|
|
query_dict[item['login']] = {
|
|
"name": item['name'],
|
|
"email": item['email']
|
|
}
|
|
all_others.append(item['login'])
|
|
all_others = list(set(all_others) - set(all_sponsor_account_users) - set(all_sponsors))
|
|
|
|
# first random 2000 users
|
|
all_others = random.sample(all_others, 2000)
|
|
|
|
# 1. random 20% for first round
|
|
first_round = random.sample(all_others, int(len(all_others) * 0.2))
|
|
# 2. the rest 80% for second round
|
|
second_round = list(set(all_others) - set(first_round))
|
|
|
|
for user in first_round:
|
|
if user in query_dict:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "others", 1, query_dict[user]['name'], query_dict[user]['email']))
|
|
else:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "others", 1))
|
|
|
|
for user in second_round:
|
|
if user in query_dict:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round, name, email) values (%s, %s, %s, %s, %s)", (user, "others", 2, query_dict[user]['name'], query_dict[user]['email']))
|
|
else:
|
|
cur.execute("insert into middle_data_questionnaire_users (login, type, which_round) values (%s, %s, %s)", (user, "others", 2))
|
|
conn.commit() |