234 lines
7.1 KiB
Plaintext
234 lines
7.1 KiB
Plaintext
|
{
|
|||
|
"metadata": {
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.7.9"
|
|||
|
},
|
|||
|
"orig_nbformat": 2,
|
|||
|
"kernelspec": {
|
|||
|
"name": "python3",
|
|||
|
"display_name": "Python 3.7.9 64-bit ('cscw_2021_sponsor': conda)",
|
|||
|
"metadata": {
|
|||
|
"interpreter": {
|
|||
|
"hash": "ffeddcc79d80f7c9c5ec51de4979e749e22c27f4b977e5c0d8b1b28de2095a12"
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2,
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 41,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# 用来找到特殊的用户案例\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"import math"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 60,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Goal: using a degree-1 (linear) fit over commit counts, find users whose commit\n",
"# activity was rising historically but dropped after they were first sponsored.\n",
"# df      : per-user rows; later cells read its login / index / commit_num columns.\n",
"# df_time : later cells read its login / first_sponsored_at columns.\n",
"df = pd.read_csv('data_4_filtered_users_first_sponsored.csv')\n",
"df_time = pd.read_csv('selected_sponsor_firstSponsored_logins.csv')\n",
"logins = df['login'].unique()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 43,
|
|||
|
"metadata": {
|
|||
|
"tags": []
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stdout",
|
|||
|
"text": [
|
|||
|
"149\n258\n591\n155\n240\n279\n4\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Classify every user by how the commit trend changes around the first sponsorship.\n",
"# A degree-1 polynomial is fitted to six samples 'before' and six samples 'after';\n",
"# the two fitted slopes are then compared.\n",
"#   1. Sponsorship looks beneficial:\n",
"#      1.1 falling before and after, but the decline slows down   -> first_1\n",
"#      1.2 rising before and after, and the growth speeds up      -> first_2\n",
"#      1.3 falling before, rising after                           -> first_3\n",
"#   2. Sponsorship looks harmful:\n",
"#      2.1 falling before and after, and the decline speeds up    -> second_1\n",
"#      2.2 rising before and after, but the growth slows down     -> second_2\n",
"#      2.3 rising before, falling after                           -> second_3\n",
"#   3. No effect: the slope is unchanged                          -> third\n",
"# Every dict maps login -> absolute difference between the two slopes.\n",
"first_1, first_2, first_3 = {}, {}, {}\n",
"second_1, second_2, second_3 = {}, {}, {}\n",
"third = {}\n",
"\n",
"# Row positions sampled for each window; loop-invariant, so built only once.\n",
"before_positions = [-6, -5, -4, -3, -2, -1]\n",
"after_positions = [1, 2, 3, 4, 5, 6]\n",
"# Shift the 'before' window onto 1..6 so both fits share the same x values.\n",
"before_x = [pos + 7 for pos in before_positions]\n",
"after_x = after_positions\n",
"\n",
"for login in logins:\n",
"    rows = df.loc[df.login == login, ['index', 'commit_num']].set_index('index')\n",
"    # NOTE(review): .iloc selects by row position, not by the 'index' labels set\n",
"    # above, so negative entries count from the end of this user's rows. If a\n",
"    # label-based lookup was intended this should be .loc -- confirm against the CSV.\n",
"    commits = rows['commit_num']\n",
"    y_before = [commits.iloc[pos] for pos in before_positions]\n",
"    y_after = [commits.iloc[pos] for pos in after_positions]\n",
"\n",
"    # np.polyfit returns coefficients highest power first, so [0] is the slope.\n",
"    slope_before = np.polyfit(before_x, y_before, 1)[0]\n",
"    slope_after = np.polyfit(after_x, y_after, 1)[0]\n",
"    change = abs(slope_after - slope_before)\n",
"\n",
"    if slope_after > slope_before:\n",
"        if slope_before <= 0 and slope_after <= 0:\n",
"            first_1[login] = change\n",
"        elif slope_before >= 0 and slope_after >= 0:\n",
"            first_2[login] = change\n",
"        else:  # only slope_before < 0 < slope_after is left\n",
"            first_3[login] = change\n",
"    elif slope_after < slope_before:\n",
"        if slope_before <= 0 and slope_after <= 0:\n",
"            second_1[login] = change\n",
"        elif slope_before >= 0 and slope_after >= 0:\n",
"            second_2[login] = change\n",
"        else:  # only slope_after < 0 < slope_before is left\n",
"            second_3[login] = change\n",
"    elif slope_after == slope_before:\n",
"        third[login] = change\n",
"    # A NaN slope makes all three comparisons False, so that user lands in no bucket.\n",
"\n",
"for bucket in (first_1, first_2, first_3, second_1, second_2, second_3, third):\n",
"    print(len(bucket))\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Rank the positive/negative buckets by slope change, largest first (ties: login ascending).\n",
"def _rank_by_slope_change(bucket):\n",
"    \"\"\"Return (login, slope_change) pairs sorted by descending change, then by login.\n",
"\n",
"    Accepts either the original dict or an already-ranked list of pairs, so\n",
"    re-running this cell no longer fails on a list that has no .items().\n",
"    \"\"\"\n",
"    return sorted(dict(bucket).items(), key=lambda pair: (-pair[1], pair[0]))\n",
"\n",
"\n",
"first_1 = _rank_by_slope_change(first_1)\n",
"first_2 = _rank_by_slope_change(first_2)\n",
"first_3 = _rank_by_slope_change(first_3)\n",
"second_1 = _rank_by_slope_change(second_1)\n",
"second_2 = _rank_by_slope_change(second_2)\n",
"second_3 = _rank_by_slope_change(second_3)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 76,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Keep the top-n users of each bucket, paired with their first sponsorship date.\n",
"n = 10\n",
"\n",
"\n",
"def _top_with_sponsor_date(ranked, limit):\n",
"    \"\"\"Return [(login, first_sponsored_at), ...] for the first `limit` entries of `ranked`.\n",
"\n",
"    `ranked` is a list of (login, slope_change) pairs. The slice is taken before\n",
"    the df_time lookup, so only the logins that are kept get looked up; a login\n",
"    further down the list that is absent from df_time can no longer raise.\n",
"    \"\"\"\n",
"    picked = []\n",
"    for login, _change in ranked[:limit]:\n",
"        dates = df_time.loc[df_time.login == login, 'first_sponsored_at'].values\n",
"        picked.append((login, dates[0]))  # first matching row, as before\n",
"    return picked\n",
"\n",
"\n",
"first_1_logins = _top_with_sponsor_date(first_1, n)\n",
"first_2_logins = _top_with_sponsor_date(first_2, n)\n",
"first_3_logins = _top_with_sponsor_date(first_3, n)\n",
"second_1_logins = _top_with_sponsor_date(second_1, n)\n",
"second_2_logins = _top_with_sponsor_date(second_2, n)\n",
"second_3_logins = _top_with_sponsor_date(second_3, n)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 63,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"[('b4b4r07',\n",
|
|||
|
" 144 2019-10-25\n",
|
|||
|
" Name: first_sponsored_at, dtype: object),\n",
|
|||
|
" ('Harmon758',\n",
|
|||
|
" 551 2019-11-01\n",
|
|||
|
" Name: first_sponsored_at, dtype: object),\n",
|
|||
|
" ('swannodette',\n",
|
|||
|
" 1429 2020-01-31\n",
|
|||
|
" Name: first_sponsored_at, dtype: object),\n",
|
|||
|
" ('nikitavoloboev',\n",
|
|||
|
" 1064 2020-03-21\n",
|
|||
|
" Name: first_sponsored_at, dtype: object),\n",
|
|||
|
" ('astrojuanlu',\n",
|
|||
|
" 120 2019-11-20\n",
|
|||
|
" Name: first_sponsored_at, dtype: object),\n",
|
|||
|
" ('wopian',\n",
|
|||
|
" 1612 2020-06-20\n",
|
|||
|
" Name: first_sponsored_at, dtype: object),\n",
|
|||
|
" ('JonnyHaystack',\n",
|
|||
|
" 693 2020-02-09\n",
|
|||
|
" Name: first_sponsored_at, dtype: object),\n",
|
|||
|
" ('octref',\n",
|
|||
|
" 1101 2020-06-12\n",
|
|||
|
" Name: first_sponsored_at, dtype: object),\n",
|
|||
|
" ('groue',\n",
|
|||
|
" 533 2020-01-27\n",
|
|||
|
" Name: first_sponsored_at, dtype: object),\n",
|
|||
|
" ('r9y9',\n",
|
|||
|
" 1212 2020-06-28\n",
|
|||
|
" Name: first_sponsored_at, dtype: object)]"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 63
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Display the (login, first_sponsored_at) pairs kept for the first_1 bucket.\n",
"# NOTE(review): the stored output of this cell shows pandas Series reprs, which the\n",
"# current top-n cell (it takes .values[0]) would not produce -- re-run to refresh.\n",
"first_1_logins\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 74,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'2019-10-25'"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 74
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Spot check: first_sponsored_at value of the first df_time row for one login.\n",
"df_time.loc[df_time.login=='b4b4r07', 'first_sponsored_at'].values[0]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
}
|
|||
|
]
|
|||
|
}
|