cscw_2021_sponsor/find_special_cases.ipynb

234 lines
7.1 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.7.9 64-bit ('cscw_2021_sponsor': conda)",
"metadata": {
"interpreter": {
"hash": "ffeddcc79d80f7c9c5ec51de4979e749e22c27f4b977e5c0d8b1b28de2095a12"
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# 用来找到特殊的用户案例\n",
"import numpy as np\n",
"import pandas as pd\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Load the per-login commit counts and the first-sponsorship dates.\n",
"# The cells below fit a straight line to each login's commit counts to find\n",
"# users whose commits had been increasing but decreased after being sponsored.\n",
"df = pd.read_csv('data_4_filtered_users_first_sponsored.csv')\n",
"logins = df['login'].unique()\n",
"df_time = pd.read_csv('selected_sponsor_firstSponsored_logins.csv')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"149\n258\n591\n155\n240\n279\n4\n"
]
}
],
"source": [
"# Classify every login by how the slope of a degree-1 fit to commit_num changes\n",
"# between the first window (y1) and the second window (y2).\n",
"# 1. Sponsorship had a positive effect:\n",
"#    1.1 decreasing in both windows, but more slowly in the second\n",
"#    1.2 increasing in both windows, and faster in the second\n",
"#    1.3 decreasing first, then increasing\n",
"# 2. Sponsorship had a negative effect:\n",
"#    2.1 decreasing in both windows, and faster in the second\n",
"#    2.2 increasing in both windows, but more slowly in the second\n",
"#    2.3 increasing first, then decreasing\n",
"# 3. Sponsorship had no effect: the slope did not change\n",
"first_1 = {}  # every dict maps login -> absolute difference of the two slopes\n",
"first_2 = {}\n",
"first_3 = {}\n",
"second_1 = {}\n",
"second_2 = {}\n",
"second_3 = {}\n",
"third = {}\n",
"win1_pos = [-6, -5, -4, -3, -2, -1]  # row positions read for the first window\n",
"win2_pos = [1, 2, 3, 4, 5, 6]  # row positions read for the second window\n",
"# Shift the first window to x = 1..6 so both fits share the same x axis.\n",
"x1 = [pos + 7 for pos in win1_pos]\n",
"x2 = win2_pos\n",
"for login in logins:\n",
"    rows = df.loc[df.login == login, ['index', 'commit_num']]\n",
"    rows = rows.set_index('index')\n",
"    # NOTE(review): .iloc is positional and ignores the 'index' labels set on the\n",
"    # line above: win1_pos picks the last six rows and win2_pos picks rows 2-7.\n",
"    # If the 'index' column holds relative offsets such as -6..-1 and 1..6, .loc\n",
"    # was probably intended - confirm against the CSV before trusting the counts.\n",
"    y1 = [rows.iloc[pos]['commit_num'] for pos in win1_pos]\n",
"    y2 = [rows.iloc[pos]['commit_num'] for pos in win2_pos]\n",
"    k1 = np.polyfit(x1, y1, 1)[0]  # slope of the first window\n",
"    k2 = np.polyfit(x2, y2, 1)[0]  # slope of the second window\n",
"    v = abs(k2 - k1)\n",
"\n",
"    if k2 > k1:\n",
"        # The slope went up: one of the positive categories.\n",
"        if k1 <= 0 and k2 <= 0:\n",
"            first_1[login] = v\n",
"        elif k1 >= 0 and k2 >= 0:\n",
"            first_2[login] = v\n",
"        else:\n",
"            # Only k1 < 0 < k2 is left here.\n",
"            first_3[login] = v\n",
"    elif k2 < k1:\n",
"        # The slope went down: one of the negative categories.\n",
"        if k1 <= 0 and k2 <= 0:\n",
"            second_1[login] = v\n",
"        elif k1 >= 0 and k2 >= 0:\n",
"            second_2[login] = v\n",
"        else:\n",
"            # Only k1 > 0 > k2 is left here.\n",
"            second_3[login] = v\n",
"    elif k2 == k1:\n",
"        # Explicit equality test so that NaN slopes land in no category.\n",
"        third[login] = v\n",
"\n",
"for bucket in (first_1, first_2, first_3, second_1, second_2, second_3, third):\n",
"    print(len(bucket))\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Sort the positive (first_*) and negative (second_*) categories by slope\n",
"# difference, largest first; ties are ordered by login.\n",
"# The ranking goes through dict(...), which accepts both the original dict and\n",
"# the list of (login, diff) pairs this cell rebinds each name to, so the cell\n",
"# can be re-run without AttributeError: 'list' object has no attribute 'items'.\n",
"def _rank(bucket):\n",
"    return sorted(dict(bucket).items(), key=lambda pair: (-pair[1], pair[0]))\n",
"\n",
"first_1 = _rank(first_1)\n",
"first_2 = _rank(first_2)\n",
"first_3 = _rank(first_3)\n",
"second_1 = _rank(second_1)\n",
"second_2 = _rank(second_2)\n",
"second_3 = _rank(second_3)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# Take the top n entries of each category, paired with the login's\n",
"# first_sponsored_at value from df_time.\n",
"n = 10\n",
"\n",
"# Return [(login, first_sponsored_at), ...] for the first `limit` entries of\n",
"# `ranked`. The slice is taken before the df_time lookup, so only the entries\n",
"# that are kept are looked up (previously every login was looked up and all but\n",
"# n of the results were thrown away).\n",
"# .values[0] raises IndexError when a kept login has no row in df_time.\n",
"def _top_with_date(ranked, limit):\n",
"    return [\n",
"        (item[0], df_time.loc[df_time.login == item[0], 'first_sponsored_at'].values[0])\n",
"        for item in ranked[:limit]\n",
"    ]\n",
"\n",
"first_1_logins = _top_with_date(first_1, n)\n",
"first_2_logins = _top_with_date(first_2, n)\n",
"first_3_logins = _top_with_date(first_3, n)\n",
"second_1_logins = _top_with_date(second_1, n)\n",
"second_2_logins = _top_with_date(second_2, n)\n",
"second_3_logins = _top_with_date(second_3, n)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('b4b4r07',\n",
" 144 2019-10-25\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('Harmon758',\n",
" 551 2019-11-01\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('swannodette',\n",
" 1429 2020-01-31\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('nikitavoloboev',\n",
" 1064 2020-03-21\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('astrojuanlu',\n",
" 120 2019-11-20\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('wopian',\n",
" 1612 2020-06-20\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('JonnyHaystack',\n",
" 693 2020-02-09\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('octref',\n",
" 1101 2020-06-12\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('groue',\n",
" 533 2020-01-27\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('r9y9',\n",
" 1212 2020-06-28\n",
" Name: first_sponsored_at, dtype: object)]"
]
},
"metadata": {},
"execution_count": 63
}
],
"source": [
"# Show the (login, first_sponsored_at) pairs collected for category 1.1.\n",
"# NOTE(review): the stored output holds whole Series objects, not the scalar\n",
"# that .values[0] returns - it looks stale; re-run this cell to refresh it.\n",
"first_1_logins"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'2019-10-25'"
]
},
"metadata": {},
"execution_count": 74
}
],
"source": [
"# Spot check: the first_sponsored_at value stored for a single login.\n",
"df_time.loc[df_time['login'] == 'b4b4r07', 'first_sponsored_at'].values[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
]
}