cscw_2021_sponsor/find_special_cases.ipynb

234 lines
7.1 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.7.9 64-bit ('cscw_2021_sponsor': conda)",
"metadata": {
"interpreter": {
"hash": "ffeddcc79d80f7c9c5ec51de4979e749e22c27f4b977e5c0d8b1b28de2095a12"
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Find special (noteworthy) user cases among sponsored developers.\n",
"import numpy as np\n",
"import pandas as pd\n",
"import math  # NOTE(review): unused in the visible cells — candidate for removal"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Using each user's commit trend, find examples whose commits rose historically but\n",
"# fell after first being sponsored (and related patterns), via degree-1 (linear) fits.\n",
"df = pd.read_csv('data_4_filtered_users_first_sponsored.csv')\n",
"logins = df.login.unique()\n",
"df_time = pd.read_csv('selected_sponsor_firstSponsored_logins.csv')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"149\n258\n591\n155\n240\n279\n4\n"
]
}
],
"source": [
"# Classify each user by how the linear-fit slope of commit counts changes around\n",
"# the first sponsorship (6 observations before vs. 6 observations after):\n",
"# 1. positive sponsor effect: 1.1 decreasing but decline slows; 1.2 increasing and growth accelerates; 1.3 decreasing then increasing\n",
"# 2. negative sponsor effect: 2.1 decreasing and decline steepens; 2.2 increasing but growth slows; 2.3 increasing then decreasing\n",
"# 3. no sponsor effect: slope unchanged\n",
"first_1 = {}  # value = absolute difference between the two fitted slopes\n",
"first_2 = {}\n",
"first_3 = {}\n",
"second_1 = {}\n",
"second_2 = {}\n",
"second_3 = {}\n",
"third = {}\n",
"for login in logins:\n",
"    rows = df.loc[df.login==login, ['index', 'commit_num']]\n",
"    rows = rows.set_index('index')\n",
"    x1 = [-6, -5, -4, -3, -2, -1]  # row positions before first sponsorship\n",
"    x2 = [1, 2, 3, 4, 5, 6]  # row positions after first sponsorship\n",
"    y1 = []\n",
"    y2 = []\n",
"    for x in x1:\n",
"        y1.append(rows.iloc[x]['commit_num'])\n",
"    x1 = [x+7 for x in x1]  # shift to 1..6; an x-shift does not change the fitted slope\n",
"    for x in x2:\n",
"        y2.append(rows.iloc[x]['commit_num'])\n",
"    z1 = np.polyfit(x1, y1, 1)  # z1[0] = pre-sponsorship slope\n",
"    z2 = np.polyfit(x2, y2, 1)  # z2[0] = post-sponsorship slope\n",
"\n",
"    v = abs(z2[0] - z1[0])  # magnitude of the slope change (hoisted out of the branches)\n",
"    if z1[0] <= 0 and z2[0] <= 0 and z2[0] > z1[0]:\n",
"        first_1[login] = v\n",
"    elif z1[0] >= 0 and z2[0] >= 0 and z2[0] > z1[0]:\n",
"        first_2[login] = v\n",
"    elif z1[0] <= 0 and z2[0] >= 0 and z2[0] > z1[0]:\n",
"        first_3[login] = v\n",
"    elif z1[0] <= 0 and z2[0] <= 0 and z2[0] < z1[0]:\n",
"        second_1[login] = v\n",
"    elif z1[0] >= 0 and z2[0] >= 0 and z2[0] < z1[0]:\n",
"        second_2[login] = v\n",
"    elif z1[0] >= 0 and z2[0] <= 0 and z2[0] < z1[0]:\n",
"        second_3[login] = v\n",
"    elif z2[0] == z1[0]:\n",
"        # NOTE(review): exact float equality — only slopes that are bit-identical land\n",
"        # here; near-equal slopes fall through unclassified. Presumably intentional.\n",
"        third[login] = v\n",
"print(len(first_1))\n",
"print(len(first_2))\n",
"print(len(first_3))\n",
"print(len(second_1))\n",
"print(len(second_2))\n",
"print(len(second_3))\n",
"print(len(third))\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Sort each first/second group by slope difference, descending (ties broken by login).\n",
"# NOTE(review): this rebinds the dicts to lists of (login, diff) tuples, so the cell\n",
"# is not idempotent — re-running it would raise AttributeError on list.items().\n",
"first_1 = sorted(first_1.items(), key=lambda x: (-x[1], x[0]))\n",
"first_2 = sorted(first_2.items(), key=lambda x: (-x[1], x[0]))\n",
"first_3 = sorted(first_3.items(), key=lambda x: (-x[1], x[0]))\n",
"second_1 = sorted(second_1.items(), key=lambda x: (-x[1], x[0]))\n",
"second_2 = sorted(second_2.items(), key=lambda x: (-x[1], x[0]))\n",
"second_3 = sorted(second_3.items(), key=lambda x: (-x[1], x[0]))"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# Take the top-n cases from each group, attaching each login's first-sponsored date.\n",
"n = 10\n",
"\n",
"def top_n_with_dates(ranked, n):\n",
"    \"\"\"Return [(login, first_sponsored_at)] for the first n (login, diff) pairs of `ranked`.\n",
"\n",
"    Slices to n *before* the df_time lookup, so at most n lookups are performed\n",
"    (the original comprehensions looked up every entry and then discarded all but n).\n",
"    \"\"\"\n",
"    return [(login, df_time.loc[df_time.login==login, 'first_sponsored_at'].values[0])\n",
"            for login, _ in ranked[:n]]\n",
"\n",
"first_1_logins = top_n_with_dates(first_1, n)\n",
"first_2_logins = top_n_with_dates(first_2, n)\n",
"first_3_logins = top_n_with_dates(first_3, n)\n",
"second_1_logins = top_n_with_dates(second_1, n)\n",
"second_2_logins = top_n_with_dates(second_2, n)\n",
"second_3_logins = top_n_with_dates(second_3, n)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('b4b4r07',\n",
" 144 2019-10-25\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('Harmon758',\n",
" 551 2019-11-01\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('swannodette',\n",
" 1429 2020-01-31\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('nikitavoloboev',\n",
" 1064 2020-03-21\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('astrojuanlu',\n",
" 120 2019-11-20\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('wopian',\n",
" 1612 2020-06-20\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('JonnyHaystack',\n",
" 693 2020-02-09\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('octref',\n",
" 1101 2020-06-12\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('groue',\n",
" 533 2020-01-27\n",
" Name: first_sponsored_at, dtype: object),\n",
" ('r9y9',\n",
" 1212 2020-06-28\n",
" Name: first_sponsored_at, dtype: object)]"
]
},
"metadata": {},
"execution_count": 63
}
],
"source": [
"# Inspect the top positive-effect (group 1.1) cases.\n",
"# NOTE(review): the stored output below (execution_count 63) predates the latest edit\n",
"# of the cell above (execution_count 76) — re-run to refresh the display.\n",
"first_1_logins\n"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'2019-10-25'"
]
},
"metadata": {},
"execution_count": 74
}
],
"source": [
"# Spot-check: first sponsorship date for a single login.\n",
"df_time.loc[df_time.login=='b4b4r07', 'first_sponsored_at'].values[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
]
}