stdb
This commit is contained in:
parent
ac1a65d88f
commit
5c8f7ddcf8
|
@ -2,7 +2,7 @@
|
|||
.project
|
||||
.pydevproject
|
||||
*.png
|
||||
model/*
|
||||
other/model/*
|
||||
*.log
|
||||
files/*
|
||||
*.csv
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,102 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年10月7日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
import MySQLdb
|
||||
import logging
|
||||
|
||||
|
||||
logger = logging.getLogger()
|
||||
hdlr = logging.FileHandler("num_answer_stastics.log")
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
|
||||
hdlr.setFormatter(formatter)
|
||||
logger.addHandler(hdlr)
|
||||
logger.setLevel(logging.NOTSET)
|
||||
|
||||
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
|
||||
def get_answers():
    """Return the owner ids of the answers in the next batch of posts.

    Uses the "num_answer_stastics" row of the pointers table as a resume
    cursor: each call reads the next 1000 posts after the stored id,
    advances the pointer, and returns None once no posts remain.
    """
    cursor.execute('select value from pointers where name = "num_answer_stastics"')
    pointer = cursor.fetchone()[0]
    logger.info("last pointer: %d" % pointer)

    cursor.execute('select id,posttypeid,owneruserid from posts where id>%s limit %s', (pointer, 1000))
    batch = cursor.fetchall()

    if not batch:
        return None
    # Persist how far we got before filtering the batch.
    cursor.execute('update pointers set value=%s where name = "num_answer_stastics"', (batch[-1][0],))
    conn.commit()

    owners = []
    for post_id, post_type, owner_id in batch:
        # posttypeid == 2 marks an answer; skip answers without a known owner.
        if post_type == 2 and owner_id is not None:
            owners.append(owner_id)
    return owners
|
||||
|
||||
|
||||
|
||||
|
||||
user_answer_count = dict() #already processed tag-pair: <tag-&-tag,count>
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
####count
|
||||
# answers = get_answers()
|
||||
#
|
||||
# while answers is not None :
|
||||
# for answer in answers:
|
||||
# if answer not in user_answer_count:
|
||||
# user_answer_count[answer] = 0
|
||||
# user_answer_count[answer] += 1
|
||||
# answers = get_answers()
|
||||
#
|
||||
# for key,value in user_answer_count.items():
|
||||
# cursor.execute('insert into answer_stastics(user_id,count_answer) values(%s,%s)',(key,value))
|
||||
# conn.commit()
|
||||
############################3
|
||||
|
||||
|
||||
########stastics
|
||||
# with open('answer_stastics.txt','w+') as fp:
|
||||
# sql_sel_count = 'SELECT count_answer FROM `answer_stastics` ORDER BY `count_answer` DESC'
|
||||
# cursor.execute(sql_sel_count)
|
||||
# counts = [item[0] for item in cursor.fetchall()]
|
||||
# last_count = counts[-1]
|
||||
# print last_count
|
||||
# fp.write("%d,%d\n"%(counts[-1],len(counts)))
|
||||
# for i in range(len(counts)-2,-1,-1):
|
||||
# for c in range(last_count + 1,counts[i]):
|
||||
# fp.write("%d,%d\n"%(c,i+1))
|
||||
# if counts[i]!=counts[i+1]:
|
||||
# fp.write("%d,%d\n"%(counts[i],i+1))
|
||||
# last_count = counts[i]
|
||||
# print last_count
|
||||
# ##########################
|
||||
|
||||
|
||||
########plot
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from mpl_toolkits.axes_grid1.axes_divider import make_axes_area_auto_adjustable
|
||||
|
||||
plt.figure(figsize=(16,8),num="ap")
|
||||
plt.subplots_adjust(wspace=0.2)
|
||||
plt.subplot(111)
|
||||
|
||||
num_count = dict()
|
||||
with open('answer_stastics.txt','r+') as fp:
|
||||
for line in fp.readlines():
|
||||
items = line.split(',')
|
||||
num_count[int(items[0])] = int(items[1])
|
||||
|
||||
start,end = 0,100
|
||||
num_count[0] = num_count[1]
|
||||
|
||||
for i in range(0,100):
|
||||
print "between [%d,%d) has: %d"%(start + i*100,end + i*100,num_count[start + i*100] - num_count[end + i*100])
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年9月10日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
import urllib2
|
||||
import json
|
||||
import gzip
|
||||
from StringIO import StringIO
|
||||
|
||||
def api_http(url):
    """GET *url*, gunzip the response body, and return it parsed as JSON.

    The StackExchange API gzip-compresses its responses, so the raw bytes
    are wrapped in a StringIO and decompressed before JSON parsing.  A
    long timeout (600 s) is used because batch crawls can be slow.
    """
    response = urllib2.urlopen(url, timeout=600)
    # Decode the gzip payload in memory.
    compressed = StringIO(response.read())
    plain_data = gzip.GzipFile(fileobj=compressed).read()
    return json.loads(plain_data)
|
|
@ -0,0 +1,59 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年9月10日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
from so_api import api_http
|
||||
import logging
|
||||
import time
|
||||
import MySQLdb
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
logger = logging.getLogger()
|
||||
hdlr = logging.FileHandler("tag_wiki.log")
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
|
||||
hdlr.setFormatter(formatter)
|
||||
logger.addHandler(hdlr)
|
||||
logger.setLevel(logging.NOTSET)
|
||||
|
||||
####################
|
||||
#为节省网络访问时间,可以一次访问多个tag,如/2.2/tags/redis;java/wikis?site=stackoverflow。每个tag之间用;隔开。最后的filter是把wiki的body也爬取下来
|
||||
api_tag_wiki_so = "http://api.stackexchange.com:80/2.2/tags/%s/wikis?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow&filter=!9YdnSD8kT"
|
||||
####################
|
||||
|
||||
sql_tag_name = "select name from tag where id>=%s and id<%s"
|
||||
sql_insert_wiki = "update tag set tag_wiki_excerpt=%s,tag_wiki_body=%s where name=%s"
|
||||
index = 4520
|
||||
|
||||
logger.info("start>>>>>")
|
||||
step = 5
|
||||
while True:
|
||||
try:
|
||||
cursor.execute(sql_tag_name,(index,index+step))
|
||||
logger.info( "%s-%s",(index,index+step))
|
||||
index += step
|
||||
result = cursor.fetchall()
|
||||
|
||||
if len(result) == 0:
|
||||
break;
|
||||
tags = ";".join([item[0] for item in result])
|
||||
print api_tag_wiki_so%tags
|
||||
wikis = api_http(api_tag_wiki_so%tags)
|
||||
|
||||
for item in wikis["items"]:
|
||||
try:
|
||||
cursor.execute(sql_insert_wiki,(item["excerpt"],item["body"],item["tag_name"]))
|
||||
conn.commit()
|
||||
except Exception,e:
|
||||
continue
|
||||
except Exception,e:
|
||||
continue
|
||||
|
||||
|
||||
|
||||
logger.info("done<<<<<<<<")
|
||||
cursor.close()
|
||||
conn.close()
|
||||
print '--------------------------------'
|
|
@ -0,0 +1,128 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年9月10日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
import MySQLdb
|
||||
from so_api import api_http
|
||||
import json
|
||||
import logging
|
||||
import urllib2
|
||||
from time import sleep
|
||||
from lxml import etree
|
||||
|
||||
logger = logging.getLogger()
|
||||
hdlr = logging.FileHandler("top_tag.log")
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
|
||||
hdlr.setFormatter(formatter)
|
||||
logger.addHandler(hdlr)
|
||||
logger.setLevel(logging.NOTSET)
|
||||
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
sql_sel_all_user = 'select distinct top_user_id from top_user'
|
||||
sql_sel_done_user = 'select distinct user_id from top_tag'
|
||||
|
||||
|
||||
sql_ins_tt = 'insert into top_tag(user_id,tag_name,count,score) values(%s,%s,%s,%s)'
|
||||
sql_ins_tt_rj = 'insert into top_tag_raw_json(user_id,result_json) values(%s,%s)'
|
||||
|
||||
api_top_user = "http://api.stackexchange.com:80/2.2/users/%s/top-answer-tags?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow&page=%d&pagesize=%d"
|
||||
top_tags_url = 'http://stackoverflow.com/users/%s?tab=tags&sort=votes'
|
||||
|
||||
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/2010010 1 Firefox/34.0"}
|
||||
|
||||
def do_job():
|
||||
|
||||
print 'start'
|
||||
cursor.execute(sql_sel_all_user)
|
||||
all_users = [item[0] for item in cursor.fetchall()]
|
||||
|
||||
cursor.execute(sql_sel_done_user)
|
||||
done_users = [item[0] for item in cursor.fetchall()]
|
||||
|
||||
left_users = set(all_users) - set(done_users)
|
||||
print 'all-done-left:%d-%d-%d'%(len(all_users),len(done_users),len(left_users))
|
||||
|
||||
|
||||
left_users = sorted(list(left_users))
|
||||
for user in left_users:
|
||||
logger.info(user)
|
||||
print user
|
||||
try:
|
||||
# page = 1
|
||||
# data= api_http(api_top_user%(user,page,100))
|
||||
# page += 1
|
||||
# cursor.execute(sql_ins_tt_rj, (user,json.dumps(data)))
|
||||
#
|
||||
# tags = data['items']
|
||||
# for tag in tags:
|
||||
# cursor.execute(sql_ins_tt,(user,tag['tag_name'],tag['answer_count'],tag['answer_score']))
|
||||
# conn.commit()
|
||||
#
|
||||
# logger.info(data['has_more'])
|
||||
|
||||
|
||||
#利用 html自己抽
|
||||
#注意事项,有的好几个page,不过api返回的都是score>0的,因此 在存的时候也要判断是否大于0
|
||||
top_tags = list()
|
||||
req = urllib2.Request(top_tags_url%user,headers=headers)
|
||||
ini_html = urllib2.urlopen(req).read().lower().decode('utf-8');
|
||||
page = etree.HTML(ini_html)
|
||||
#先获取第一页的tags
|
||||
tags = page.xpath('//*[@id="user-tab-tags"]/div[2]/table/tbody/tr/td')
|
||||
for tag in tags:
|
||||
if tag.find('div').text == '0':
|
||||
zero_score = True
|
||||
continue
|
||||
else:
|
||||
name = tag.find('a').text
|
||||
value_text = tag.find('div').get('title')
|
||||
count_pos = value_text.find("gave")
|
||||
count = int(value_text[count_pos+5:value_text.find('non-wiki answer')-1])
|
||||
score = int(value_text[value_text.find("score of",count_pos+5)+9:-1])
|
||||
top_tags.append((name,count,score))
|
||||
#通过header看有多少个tag判断多少页
|
||||
num_tags = page.xpath('//*[@id="user-tab-tags"]/div[1]/h1/span')[0]
|
||||
num_page = int(num_tags.text) / 52 + 1
|
||||
#获取其他页的tag
|
||||
zero_score = False
|
||||
for i in range(2,num_page + 1):
|
||||
req = urllib2.Request("%s&page=%s"%(top_tags_url%user,i),headers=headers)
|
||||
ini_html = urllib2.urlopen(req).read().lower().decode('utf-8');
|
||||
page = etree.HTML(ini_html)
|
||||
#判断最后一个是否score为0,如果是就不往后面看了
|
||||
tags = page.xpath('//*[@id="user-tab-tags"]/div[2]/table/tbody/tr/td')
|
||||
for tag in tags:
|
||||
if tag.find('div').text == '0':
|
||||
zero_score = True
|
||||
continue
|
||||
else:
|
||||
name = tag.find('a').text
|
||||
value_text = tag.find('div').get('title')
|
||||
count_pos = value_text.find("gave")
|
||||
count = int(value_text[count_pos+5:value_text.find('non-wiki answer')-1])
|
||||
score = int(value_text[value_text.find("score of",count_pos+5)+9:-1])
|
||||
top_tags.append((name,count,score))
|
||||
if zero_score:
|
||||
break
|
||||
for top_tag in top_tags:
|
||||
cursor.execute(sql_ins_tt,(user,top_tag[0],top_tag[1],top_tag[2]))
|
||||
conn.commit()
|
||||
except Exception,e:
|
||||
sleep(1)
|
||||
print e
|
||||
pass
|
||||
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
do_job()
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年10月7日
|
||||
|
||||
compute top tag from posts
|
||||
@author: StarLee
|
||||
'''
|
||||
import MySQLdb
|
||||
import logging
|
||||
import re
|
||||
|
||||
logger = logging.getLogger()
|
||||
hdlr = logging.FileHandler("top_tag_late.log")
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
|
||||
hdlr.setFormatter(formatter)
|
||||
logger.addHandler(hdlr)
|
||||
logger.setLevel(logging.NOTSET)
|
||||
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
def get_answers():
    """Return (question_id, answerer_id) pairs from the next batch of posts.

    Walks the posts table 1000 rows at a time, using the
    "top_tag_late_answer" row of the pointers table as a resume cursor;
    returns None once the table is exhausted.
    """
    cursor.execute('select value from pointers where name = "top_tag_late_answer"')
    pointer = cursor.fetchone()[0]
    logger.info("last pointer: %d" % pointer)

    cursor.execute('select id,posttypeid,parentid,owneruserid from posts where id>%s limit %s', (pointer, 1000))
    batch = cursor.fetchall()

    if not batch:
        return None
    # Advance the pointer past this batch before filtering.
    cursor.execute('update pointers set value=%s where name = "top_tag_late_answer"', (batch[-1][0],))
    conn.commit()

    pairs = []
    for post_id, post_type, parent_id, owner_id in batch:
        # posttypeid == 2 marks an answer; skip answers without an owner.
        if post_type == 2 and owner_id is not None:
            pairs.append((parent_id, owner_id))
    return pairs
|
||||
|
||||
|
||||
def get_questions():
    """Return (question_id, tags_string) pairs from the next batch of posts.

    Uses the "top_tag_late_question" row of the pointers table as a resume
    cursor over the posts table, 1000 rows per call; returns None when the
    table is exhausted.
    """
    cursor.execute('select value from pointers where name = "top_tag_late_question"')
    pointer = cursor.fetchone()[0]
    logger.info("last pointer: %d" % pointer)

    cursor.execute('select id,posttypeid,tags from posts where id>%s limit %s', (pointer, 1000))
    batch = cursor.fetchall()

    if not batch:
        return None
    cursor.execute('update pointers set value=%s where name = "top_tag_late_question"', (batch[-1][0],))
    conn.commit()

    questions = []
    for post_id, post_type, tags in batch:
        # posttypeid == 1 marks a question; skip rows without a tags string.
        if post_type == 1 and tags is not None:
            questions.append((post_id, tags))
    return questions
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# #step 1: select users whose answers count > 100
|
||||
# sql_sel_top_user = 'select user_id from answer_stastics where count_answer>=100'
|
||||
# cursor.execute(sql_sel_top_user)
|
||||
# users = cursor.fetchall()
|
||||
#
|
||||
# #step 2: fetch answers for each user
|
||||
# for user in users:
|
||||
# sql_sel_answers = 'select '
|
||||
|
||||
#step 1: get post-[user,user,,,.....]
|
||||
q_u = dict()# 每一个quesion对应的回答者列表(questionid:[userid,userid......])
|
||||
answers = get_answers()
|
||||
while answers is not None :
|
||||
for answer in answers:
|
||||
if answer[0] not in q_u:
|
||||
q_u[answer[0]] = list()
|
||||
q_u[answer[0]].append(answer[1])
|
||||
answers = get_answers()
|
||||
|
||||
#step 1: get user-tag
|
||||
u_t = dict()# 每一个user 对应的tag列表(userid:[(tag:count),(tag:count)......])
|
||||
print 'answer is done'
|
||||
questions = get_questions()
|
||||
while questions is not None :
|
||||
for question in questions:
|
||||
if question[0] in q_u:
|
||||
tags = re.findall('<([\s\S]*?)>',question[1]) #获取该问题的tag
|
||||
users = q_u[question[0]] #该问题的回答者
|
||||
for user in users:
|
||||
if user not in u_t:
|
||||
u_t[user] = dict()
|
||||
for tag in tags:
|
||||
if tag not in u_t[user]:
|
||||
u_t[user][tag] = 0
|
||||
u_t[user][tag] += 1
|
||||
questions= get_questions()
|
||||
print 'question is done'
|
||||
for user,value in u_t.items():
|
||||
for tag,count in value.items():
|
||||
cursor.execute('insert into user_tag_count(user_id,tag,count) values(%s,%s,%s)',(user,tag,count))
|
||||
conn.commit()
|
||||
print "%d is done"%user
|
|
@ -0,0 +1,141 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年10月7日
|
||||
|
||||
compute top tag from posts
|
||||
@author: StarLee
|
||||
'''
|
||||
import MySQLdb
|
||||
import logging
|
||||
import re
|
||||
|
||||
logger = logging.getLogger()
|
||||
hdlr = logging.FileHandler("top_tag_late.log")
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
|
||||
hdlr.setFormatter(formatter)
|
||||
logger.addHandler(hdlr)
|
||||
logger.setLevel(logging.NOTSET)
|
||||
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
def get_answers():
    """Return (question_id, answerer_id, answer_id) triples from the next
    batch of posts.

    Uses the "top_tag_score_answer" row of the pointers table as a resume
    cursor over the posts table, 1000 rows per call; returns None when the
    table is exhausted.
    """
    cursor.execute('select value from pointers where name = "top_tag_score_answer"')
    pointer = cursor.fetchone()[0]
    logger.info("last pointer: %d" % pointer)

    cursor.execute('select id,posttypeid,parentid,owneruserid from posts where id>%s limit %s', (pointer, 1000))
    batch = cursor.fetchall()

    if not batch:
        return None
    cursor.execute('update pointers set value=%s where name = "top_tag_score_answer"', (batch[-1][0],))
    conn.commit()

    triples = []
    for post_id, post_type, parent_id, owner_id in batch:
        # posttypeid == 2 marks an answer; skip answers without an owner.
        if post_type == 2 and owner_id is not None:
            triples.append((parent_id, owner_id, post_id))
    return triples
|
||||
|
||||
|
||||
def get_questions():
    """Return (question_id, tags_string) pairs from the next batch of posts.

    Uses the "top_tag_score_question" row of the pointers table as a
    resume cursor over the posts table, 1000 rows per call; returns None
    when the table is exhausted.
    """
    cursor.execute('select value from pointers where name = "top_tag_score_question"')
    pointer = cursor.fetchone()[0]
    logger.info("last pointer: %d" % pointer)

    cursor.execute('select id,posttypeid,tags from posts where id>%s limit %s', (pointer, 1000))
    batch = cursor.fetchall()

    if not batch:
        return None
    cursor.execute('update pointers set value=%s where name = "top_tag_score_question"', (batch[-1][0],))
    conn.commit()

    questions = []
    for post_id, post_type, tags in batch:
        # posttypeid == 1 marks a question; skip rows without a tags string.
        if post_type == 1 and tags is not None:
            questions.append((post_id, tags))
    return questions
|
||||
|
||||
def get_votes():
    """Return (post_id, vote_type_id) pairs from the next batch of votes.

    Uses the "top_tag_score_vote" row of the pointers table as a resume
    cursor over the votes table, 1000 rows per call; returns None when the
    table is exhausted.
    """
    cursor.execute('select value from pointers where name = "top_tag_score_vote"')
    pointer = cursor.fetchone()[0]
    logger.info("last pointer: %d" % pointer)

    cursor.execute('select id,postid,votetypeid from votes where id>%s limit %s', (pointer, 1000))
    batch = cursor.fetchall()

    if not batch:
        return None
    cursor.execute('update pointers set value=%s where name = "top_tag_score_vote"', (batch[-1][0],))
    conn.commit()

    pairs = []
    for vote_id, post_id, vote_type in batch:
        # Skip rows with no vote type recorded.
        if vote_type is not None:
            pairs.append((post_id, vote_type))
    return pairs
|
||||
|
||||
if __name__ == '__main__':
|
||||
# #step 1: select users whose answers count > 100
|
||||
# sql_sel_top_user = 'select user_id from answer_stastics where count_answer>=100'
|
||||
# cursor.execute(sql_sel_top_user)
|
||||
# users = cursor.fetchall()
|
||||
#
|
||||
# #step 2: fetch answers for each user
|
||||
# for user in users:
|
||||
# sql_sel_answers = 'select '
|
||||
|
||||
#step 0: get answer - score
|
||||
a_s = dict()
|
||||
votes = get_votes()
|
||||
while votes is not None :
|
||||
for vote in votes:
|
||||
if vote[0] not in a_s:
|
||||
a_s[vote[0]] = 0
|
||||
if vote[1] == 2:
|
||||
a_s[vote[0]] += 1
|
||||
elif vote[1] == 3:
|
||||
a_s[vote[0]] += -1
|
||||
else:
|
||||
pass
|
||||
votes = get_votes()
|
||||
|
||||
|
||||
|
||||
|
||||
#step 1: get post-[user,user,,,.....]
|
||||
q_u = dict()# 每一个quesion对应的回答者列表(questionid:[userid,userid......])
|
||||
answers = get_answers()
|
||||
while answers is not None :
|
||||
for answer in answers:
|
||||
if answer[0] not in q_u:
|
||||
q_u[answer[0]] = list()
|
||||
if answer[2] in a_s:
|
||||
q_u[answer[0]].append((answer[1],a_s[answer[2]]))
|
||||
answers = get_answers()
|
||||
del a_s
|
||||
|
||||
#step 1: get user-tag
|
||||
u_t = dict()# 每一个user 对应的tag列表(userid:[(tag:score),(tag:score)......])
|
||||
print 'answer is done'
|
||||
questions = get_questions()
|
||||
while questions is not None :
|
||||
for question in questions:
|
||||
if question[0] in q_u:
|
||||
tags = re.findall('<([\s\S]*?)>',question[1]) #获取该问题的tag
|
||||
users = q_u[question[0]] #该问题的回答者
|
||||
for user in users:
|
||||
if user[0] not in u_t:
|
||||
u_t[user[0]] = dict()
|
||||
for tag in tags:
|
||||
if tag not in u_t[user[0]]:
|
||||
u_t[user[0]][tag] = 0
|
||||
u_t[user][tag] += user[1]
|
||||
questions= get_questions()
|
||||
print 'question is done'
|
||||
del q_u
|
||||
|
||||
for user,value in u_t.iteritems():
|
||||
for tag,count in value.iteritems():
|
||||
cursor.execute('insert into user_tag_score(user_id,tag,count) values(%s,%s,%s)',(user,tag,count))
|
||||
conn.commit()
|
||||
print "%d is done"%user
|
|
@ -0,0 +1,114 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年9月10日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
import MySQLdb
|
||||
from so_api import api_http
|
||||
import json
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger()
|
||||
hdlr = logging.FileHandler("%s.log"%__name__)
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
|
||||
hdlr.setFormatter(formatter)
|
||||
logger.addHandler(hdlr)
|
||||
logger.setLevel(logging.NOTSET)
|
||||
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
sql_sel_tag = 'select id,name from tag where id>=43786'
|
||||
sql_ins_tu = 'insert into top_user(tag_id,top_user_id) values(%s,%s)'
|
||||
sql_ins_tu_rj = 'insert into top_user_raw_json(tag_id,result_json) values(%s,%s)'
|
||||
|
||||
sql_sel_all_tag = 'select id,name from tag'
|
||||
sql_sel_done_tag = 'select distinct tag_id from top_user'
|
||||
|
||||
api_top_user = "http://api.stackexchange.com:80/2.2/tags/%s/top-answerers/all_time?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow&page=%d&pagesize=%d"
|
||||
|
||||
|
||||
# (6140L, u'clash')
|
||||
# [Errno 10053]
|
||||
# (6141L, u'class')
|
||||
# <urlopen error [Errno 11001] getaddrinfo failed>
|
||||
# (6142L, u'class-attribute')
|
||||
# <urlopen error [Errno 11001] getaddrinfo failed>
|
||||
#
|
||||
def do_job():
|
||||
|
||||
print 'start'
|
||||
cursor.execute(sql_sel_tag)
|
||||
tags = cursor.fetchall()
|
||||
|
||||
print 'start'
|
||||
count = 0
|
||||
for tag in tags:
|
||||
logger.info(tag)
|
||||
print tag
|
||||
try:
|
||||
page = 1
|
||||
has_more = True
|
||||
|
||||
while has_more:
|
||||
data= api_http(api_top_user%(tag[1],page,100))
|
||||
page += 1
|
||||
cursor.execute(sql_ins_tu_rj, (tag[0],json.dumps(data)))
|
||||
|
||||
answerers = data['items']
|
||||
for answerer in answerers:
|
||||
cursor.execute(sql_ins_tu,(tag[0],answerer['user']['user_id']))
|
||||
conn.commit()
|
||||
|
||||
logger.info(data['has_more'])
|
||||
has_more = data["has_more"]
|
||||
|
||||
|
||||
except Exception,e:
|
||||
print e
|
||||
pass
|
||||
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
|
||||
def get_top_user(tag_id,tag_name):
|
||||
logger.info(tag)
|
||||
print tag
|
||||
try:
|
||||
page = 1
|
||||
has_more = True
|
||||
|
||||
while has_more:
|
||||
data= api_http(api_top_user%(tag_name,page,100))
|
||||
page += 1
|
||||
cursor.execute(sql_ins_tu_rj, (tag_id,json.dumps(data)))
|
||||
|
||||
answerers = data['items']
|
||||
for answerer in answerers:
|
||||
cursor.execute(sql_ins_tu,(tag_id,answerer['user']['user_id']))
|
||||
conn.commit()
|
||||
|
||||
logger.info(data['has_more'])
|
||||
has_more = data["has_more"]
|
||||
except Exception,e:
|
||||
print e
|
||||
pass
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
cursor.execute(sql_sel_all_tag)
|
||||
tags = cursor.fetchall()
|
||||
tags = {tag[0]:tag[1] for tag in tags}
|
||||
|
||||
cursor.execute(sql_sel_done_tag)
|
||||
done_tags = cursor.fetchall()
|
||||
for tag in done_tags:
|
||||
del tags[tag[0]]
|
||||
|
||||
for tag in tags.iteritems():
|
||||
get_top_user(tag[0], tag[1])
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年9月29日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
|
||||
import MySQLdb
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
sql_sewm = 'select term_2,similarity from word_similarity where term_1=%s order by similarity DESC limit 10'
|
||||
|
||||
|
||||
queries = ['python','java','c','c++','ruby','scala',
|
||||
'windows','linux','mac','android','ios','win8','x86',
|
||||
'html','json','css','xml','http',
|
||||
'redis','eclipse', 'mysql', 'scikit-learn', 'numpy', 'panda', 'django','spring', 'git',
|
||||
'machine','learning', 'ide','directory','class', 'apache', 'maven'
|
||||
]
|
||||
|
||||
|
||||
import csv
|
||||
import sys
|
||||
reload(sys)
|
||||
sys.setdefaultencoding('utf-8')
|
||||
|
||||
with open('sews.csv','w+') as fp:
|
||||
cfp = csv.writer(fp,dialect='excel')
|
||||
for query in queries:
|
||||
row_data = list()
|
||||
row_data.append(query)
|
||||
cursor.execute(sql_sewm,(query,))
|
||||
try:
|
||||
row_data.append('\n'.join([item[0] for item in cursor.fetchall()]))
|
||||
except Exception,e:
|
||||
row_data.append("null")
|
||||
|
||||
cfp.writerow(row_data)
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,151 @@
|
|||
#coding: utf-8
|
||||
'''
|
||||
Created on 2016年9月15日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
import nltk
|
||||
# #
|
||||
# # # nltk.app.chunkparser()
|
||||
from gensim.models import word2vec
|
||||
from gensim.models import Word2Vec
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
stpw = stopwords.words('english')
|
||||
from string import punctuation
|
||||
stpw.extend(punctuation)
|
||||
import numpy as np
|
||||
import MySQLdb
|
||||
import scipy.cluster.hierarchy as hcluster
|
||||
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
# model_2 = Word2Vec.load('model/so_excep_model')
|
||||
|
||||
import re
|
||||
def extract_so_wiki_body(text):
    """Reduce a StackOverflow tag-wiki body to its plain paragraph text.

    Keeps only the contents of <p>...</p> elements, then strips anchor
    tags and pre/code blocks, and joins the paragraphs together.
    """
    # step 1: keep the text between <p> and </p>
    paragraphs = re.findall('<p>([\s\S]*?)</p>', text)

    # step 2: drop links and code fragments from each paragraph
    cleaner = '<a href[\s\S]*?>|</a>|<pre>>[\s\S]*?</pre>|<code>[\s\S]*?</code>'
    cleaned = []
    for paragraph in paragraphs:
        cleaned.append(re.sub(cleaner, '', paragraph))

    return ''.join(cleaned)
|
||||
|
||||
def get_tag_tokens(text, text_filter):
    """Tokenize *text* into a flat list of lowercased word tokens.

    *text_filter* is applied to the raw text first (e.g. HTML stripping);
    tokens found in the module-level stopword/punctuation list `stpw` are
    dropped.
    """
    collected = list()
    for sentence in nltk.sent_tokenize(text_filter(text)):
        words = nltk.word_tokenize(sentence)
        collected.extend([w.lower() for w in words if w.lower() not in stpw])
    return collected
|
||||
|
||||
|
||||
def get_text_vec(tag_name):
|
||||
sql_sel_so_excep = "select tag_wiki_excerpt from tag_old where name=%s"
|
||||
# sql_sel_so_body = "select tag_wiki_body from tag_old where name=%s "
|
||||
|
||||
cursor.execute(sql_sel_so_excep,(tag_name,))
|
||||
excep = cursor.fetchone()[0]
|
||||
|
||||
# cursor.execute(sql_sel_so_body,(tag_name,))
|
||||
# body = cursor.fetchone()[0]
|
||||
|
||||
all_token = get_tag_tokens(excep, lambda x:x)
|
||||
# all_token.extend(get_tag_tokens(body, extract_so_wiki_body))
|
||||
|
||||
all_vec = list()
|
||||
for token in all_token:
|
||||
if token in model_2:
|
||||
all_vec.append(model_2[token])
|
||||
else:
|
||||
print token
|
||||
|
||||
final_vec = np.array([0.0] * len(all_vec[0]))
|
||||
for vec in all_vec:
|
||||
final_vec += vec
|
||||
return final_vec / len(all_vec)
|
||||
|
||||
|
||||
tags = ['java', 'ruby', 'python', 'javascript',
|
||||
'html', 'xml', 'json', 'ajax',
|
||||
'windows', 'android', 'linux',
|
||||
'redis', 'maven', 'sql',
|
||||
'eclipse', 'netbeans', 'ide']
|
||||
|
||||
|
||||
|
||||
import numpy
|
||||
def coss(m1, m2):
    """Cosine similarity between two vectors (dot product over norms)."""
    vec_a = numpy.matrix(m1)
    vec_b = numpy.matrix(m2)
    dot = float(vec_a * vec_b.T)
    norm_product = numpy.linalg.norm(vec_a) * numpy.linalg.norm(vec_b)
    return dot / norm_product
|
||||
|
||||
# tag_vec = dict()
|
||||
# for tag in tags[0:]:
|
||||
# if tag in model_2:
|
||||
# tag_vec[tag] = get_text_vec(tag)
|
||||
#
|
||||
#
|
||||
# for k1,v1 in tag_vec.items():
|
||||
# for k2,v2 in tag_vec.items():
|
||||
# print "%s-%s:%f"%(k1,k2,coss(v1,v2))
|
||||
# print '--------------------'
|
||||
# X = []
|
||||
# valid_tag = list()
|
||||
# for tag in tags[0:]:
|
||||
# if tag in model_2:
|
||||
# valid_tag.append(tag)
|
||||
# X.append(list(get_text_vec(tag)))
|
||||
#
|
||||
# cp = hcluster.fclusterdata(np.array(X),t=1)
|
||||
# ct = dict()
|
||||
# for pos in range(0,len(cp)):
|
||||
# if cp[pos] not in ct:
|
||||
# ct[cp[pos]] = list()
|
||||
# ct[cp[pos]].append((pos,valid_tag[pos]))
|
||||
#
|
||||
# for key,value in ct.iteritems():
|
||||
# print value
|
||||
#
|
||||
|
||||
|
||||
# X =[[1,1],
|
||||
# [1,2],
|
||||
# [2,1],
|
||||
# [2,2],
|
||||
#
|
||||
# [3,7],
|
||||
# [3,8],
|
||||
# [4,7],
|
||||
# [4,8],
|
||||
#
|
||||
# [7,2],
|
||||
# [7,3],
|
||||
# [8,2],
|
||||
# [8,3]
|
||||
#
|
||||
# ]
|
||||
#
|
||||
# print hcluster.fclusterdata(np.array(X),t=1)
|
||||
|
||||
|
||||
# import matplotlib.pylab as plt
|
||||
#
|
||||
# d = hcluster.distance.pdist(X)
|
||||
#
|
||||
# Z= hcluster.linkage(d,method='complete')
|
||||
#
|
||||
# P =hcluster.dendrogram(Z)
|
||||
#
|
||||
# plt.savefig('plot_dendrogram.png')
|
||||
# print 'done'
|
||||
|
||||
|
||||
|
||||
|
Binary file not shown.
|
@ -0,0 +1,133 @@
|
|||
#coding: utf-8
|
||||
'''
|
||||
Created on 2016年9月15日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
import nltk
|
||||
# #
|
||||
# # # nltk.app.chunkparser()
|
||||
from gensim.models import word2vec
|
||||
from gensim.models import Word2Vec
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
stpw = stopwords.words('english')
|
||||
from string import punctuation
|
||||
stpw.extend(punctuation)
|
||||
import numpy as np
|
||||
import MySQLdb
|
||||
import scipy.cluster.hierarchy as hcluster
|
||||
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
model_2 = Word2Vec.load('../model/so_excep_model')
|
||||
#
|
||||
# import re
|
||||
# def extract_so_wiki_body(text):
|
||||
# #step 1: extract text between <p> and </p>
|
||||
# texts = re.findall('<p>([\s\S]*?)</p>', text)
|
||||
#
|
||||
# #step 2: remove href and code
|
||||
# texts = [ re.sub('<a href[\s\S]*?>|</a>|<pre>>[\s\S]*?</pre>|<code>[\s\S]*?</code>','',text) for text in texts]
|
||||
#
|
||||
# return ''.join(texts)
|
||||
#
|
||||
# def get_tag_tokens(text,text_filter):
|
||||
# token_list = list()
|
||||
# sens = nltk.sent_tokenize(text_filter(text))
|
||||
# for i in range(0,len(sens)):
|
||||
# tokens = nltk.word_tokenize(sens[i])
|
||||
# tokens =[token.lower() for token in tokens if token.lower() not in stpw]
|
||||
# token_list.extend(tokens)
|
||||
# return token_list
|
||||
#
|
||||
#
|
||||
# def get_text_vec(tag_name):
|
||||
# sql_sel_so_excep = "select tag_wiki_excerpt from tag_old where name=%s"
|
||||
# # sql_sel_so_body = "select tag_wiki_body from tag_old where name=%s "
|
||||
#
|
||||
# cursor.execute(sql_sel_so_excep,(tag_name,))
|
||||
# excep = cursor.fetchone()[0]
|
||||
#
|
||||
# # cursor.execute(sql_sel_so_body,(tag_name,))
|
||||
# # body = cursor.fetchone()[0]
|
||||
#
|
||||
# all_token = get_tag_tokens(excep, lambda x:x)
|
||||
# # all_token.extend(get_tag_tokens(body, extract_so_wiki_body))
|
||||
#
|
||||
# all_vec = list()
|
||||
# for token in all_token:
|
||||
# if token in model_2:
|
||||
# all_vec.append(model_2[token])
|
||||
# else:
|
||||
# print token
|
||||
#
|
||||
# final_vec = np.array([0.0] * len(all_vec[0]))
|
||||
# for vec in all_vec:
|
||||
# final_vec += vec
|
||||
# return final_vec / len(all_vec)
|
||||
#
|
||||
#
|
||||
# tags = ['java', 'ruby', 'python', 'javascript',
|
||||
# 'html', 'xml', 'json', 'ajax',
|
||||
# 'windows', 'android', 'linux',
|
||||
# 'redis', 'maven', 'sql',
|
||||
# 'eclipse', 'netbeans', 'ide']
|
||||
#
|
||||
#
|
||||
# X = []
|
||||
# valid_tag = list()
|
||||
# for tag in tags[0:]:
|
||||
# if tag in model_2:
|
||||
# valid_tag.append(tag)
|
||||
# X.append(list(get_text_vec(tag)))
|
||||
#
|
||||
# cp = hcluster.fclusterdata(np.array(X),t=1)
|
||||
# ct = dict()
|
||||
# for pos in range(0,len(cp)):
|
||||
# if cp[pos] not in ct:
|
||||
# ct[cp[pos]] = list()
|
||||
# ct[cp[pos]].append((pos,valid_tag[pos]))
|
||||
#
|
||||
# for key,value in ct.iteritems():
|
||||
# print value
|
||||
#
|
||||
#
|
||||
#
|
||||
# # X =[[1,1],
|
||||
# # [1,2],
|
||||
# # [2,1],
|
||||
# # [2,2],
|
||||
# #
|
||||
# # [3,7],
|
||||
# # [3,8],
|
||||
# # [4,7],
|
||||
# # [4,8],
|
||||
# #
|
||||
# # [7,2],
|
||||
# # [7,3],
|
||||
# # [8,2],
|
||||
# # [8,3]
|
||||
# #
|
||||
# # ]
|
||||
# #
|
||||
# # print hcluster.fclusterdata(np.array(X),t=1)
|
||||
#
|
||||
#
|
||||
# import matplotlib.pylab as plt
|
||||
#
|
||||
# d = hcluster.distance.pdist(X)
|
||||
#
|
||||
# Z= hcluster.linkage(d,method='complete')
|
||||
#
|
||||
# P =hcluster.dendrogram(Z)
|
||||
#
|
||||
# plt.savefig('plot_dendrogram.png')
|
||||
# print 'done'
|
||||
|
||||
import numpy as np
|
||||
|
||||
aa = np.array([2,4,6,8])
|
||||
print aa/2
|
||||
|
||||
|
Binary file not shown.
|
@ -0,0 +1,102 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年9月10日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
from so_api import api_http
|
||||
from boto.cloudfront.origin import get_oai_value
|
||||
|
||||
class Answer:
    """A top StackOverflow answerer together with his/her top tags.

    top_tags maps a tag name to an (answer_count, answer_score) pair.
    """

    def __init__(self):
        # -1 means "no user id assigned yet"
        self.id = -1
        self.top_tags = dict()

    def order_tag_by_count(self):
        """Return (tag, (count, score)) pairs sorted by answer count, descending."""
        by_count = lambda entry: entry[1][0]
        return sorted(self.top_tags.iteritems(), key=by_count, reverse=True)

    def order_tag_by_score(self):
        """Return (tag, (count, score)) pairs sorted by answer score, descending."""
        by_score = lambda entry: entry[1][1]
        return sorted(self.top_tags.iteritems(), key=by_score, reverse=True)

    def tag_count(self, tag_name):
        """Answer count recorded for tag_name (KeyError if unknown)."""
        count, _score = self.top_tags[tag_name]
        return count

    def tag_score(self, tag_name):
        """Answer score recorded for tag_name (KeyError if unknown)."""
        _count, score = self.top_tags[tag_name]
        return score
|
||||
|
||||
api_top_user = "http://api.stackexchange.com:80/2.2/tags/%s/top-answerers/all_time?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow"
|
||||
api_top_tag = "http://api.stackexchange.com:80/2.2/users/%s/top-tags?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow"
|
||||
|
||||
|
||||
def get_top_tags(tag_name,sort_by_score = 1):
    """Fetch StackOverflow's all-time top answerers for *tag_name*.

    For each top answerer, also fetch that user's own top tags and record
    (answer_count, answer_score) per tag on an Answer object.

    tag_name -- StackOverflow tag to query
    sort_by_score -- 1 sorts users by their score on tag_name,
                     any other value sorts by their answer count
    Returns a list of Answer objects, best user first.
    NOTE(review): hits the live StackExchange API once per user; no rate-limit
    or has_more paging handling is visible here.
    """
    top_users = list()
    print 'for tag: %s'%tag_name

    # top answerers of the tag (api_http is a project helper returning parsed JSON)
    data= api_http(api_top_user%tag_name)
    answerers = data['items']

    for ans in answerers:
        user_id = ans['user']['user_id']
#         print user_id

        top_user = Answer()
        top_user.id = user_id
        # default entry in case tag_name is absent from the user's top-tag list
        # (has_more is not checked, so the list may be truncated)
        top_user.top_tags[tag_name] = (0,0)

        # this user's personal top tags
        data = api_http(api_top_tag%user_id)
        tags = data['items']

        for tag in tags:
#             print tag['tag_name'],'%d-%d'%(tag['answer_count'],tag['answer_score']),'%d-%d'%(tag['question_count'],tag['question_score'])
            top_user.top_tags[tag['tag_name']] = (tag['answer_count'],tag['answer_score'])

        top_users.append(top_user)

    sorted_tu = None
    if sort_by_score == 1:
        sorted_tu = sorted(top_users, key=lambda d:d.tag_score(tag_name), reverse = True)
    else:
        sorted_tu = sorted(top_users, key=lambda d:d.tag_count(tag_name), reverse = True)

#     for u in sorted_tu:
#         print u.id,u.tag_count(tag_name)
#         print [item[0] for item in u.order_tag_by_count()]
#         print '--------------------------'

    print '********************************'
    return sorted_tu
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Write one CSV row per query tag: the tag itself followed by one column
    # per top answerer, each column listing that user's top tags by score.
    import csv
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')  # Python 2: let csv handle non-ASCII tag names

    queries = ['python','java',
               'linux','android',
               'html','json','css',
               'redis','eclipse', 'mysql', 'scikit-learn', 'django','spring', 'git',
               'ide',
               ]

    with open('top_tag.csv','w+') as fp:
        cfp = csv.writer(fp,dialect='excel')
        for query in queries:
            row_data = list()
            row_data.append(query)
            sorted_tu = get_top_tags(query)
            for u in sorted_tu:
                # newline-joined tag names inside a single CSV cell
                row_data.append('\n'.join([item[0] for item in u.order_tag_by_score()]))
            cfp.writerow(row_data)
||||
#
|
||||
# an1 = answerers[0]
|
||||
# user_id = an1['user']['user_id']
|
||||
# print "top user: %d>>>>>>>"%user_id
|
||||
#
|
||||
# data = api_http(api_top_tag%user_id)
|
||||
# tags = data['items']
|
||||
#
|
||||
# for tag in tags:
|
||||
# print tag['tag_name'],tag['answer_count'],tag['question_count']
|
||||
# print '--------------------------------'
|
Binary file not shown.
|
@ -0,0 +1,390 @@
|
|||
#coding:utf-8
|
||||
import nltk
|
||||
from nltk.text import Text
|
||||
from nltk.corpus import stopwords
|
||||
import string
|
||||
from string import punctuation
|
||||
from gensim.models import Word2Vec
|
||||
import os
|
||||
from gensim.models import Phrases
|
||||
import re
|
||||
|
||||
|
||||
stpw = stopwords.words('english')
|
||||
stpw.extend(punctuation)
|
||||
|
||||
# nltk.download()
|
||||
# text_in_gh = 'Redis is an in-memory database that persists on disk. The data model is key-value, but many different kind of values are supported: Strings, Lists, Sets, Sorted Sets, Hashes, HyperLogLogs, Bitmaps. http://redis.io'
|
||||
# text_in_so = 'An open source BSD-licensed in-memory data structure store used as database, cache and message broker. Supports data structures such as strings, hashes, lists, sets, sorted sets with range queries, bitmaps, hyperloglogs and geospatial indexes with radius queries. Has built-in replication, Lua scripting, LRU eviction, transactions and different levels of on-disk persistence, high availability via Redis Sentinel and automatic partitioning with Redis Cluster.'
|
||||
|
||||
import MySQLdb
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
sql_sel_so_pos = 'select id,body from shit_lee where id>%s limit %s'
|
||||
sql_sel_text = "select text from text where id>%s id< %s"
|
||||
|
||||
|
||||
class Token4DB(object):
    """Stream tokenized sentences out of a MySQL table.

    sql must be a SELECT of (id, text) taking two parameters: a lower id
    bound and a LIMIT.  Iteration pages through the table keyset-style,
    re-using the last seen id as the next lower bound.
    text_filter is applied to the raw text before tokenizing (identity by default).
    Relies on the module-level `cursor`, `nltk` and stopword list `stpw`.
    """
    def __init__(self, sql,text_filter=lambda a:a):
        self.sql = sql
        self.flag = True  # flips to False once the table (or the cap) is exhausted
        self.text_filter = text_filter

    def __iter__(self):
        """Yield one lowercased, stopword-free token list per sentence."""
        start = 0
        limit = 382  # rows fetched per page
        count = 0;   # texts processed so far
        while self.flag:
#             print start,start + limit
            cursor.execute(self.sql,(start,limit))
            texts = cursor.fetchall()

#             texts = [self.text_filter(item[0]) for item in cursor.fetchall() if item[0] is not None]
            for text in texts:
                sens = nltk.sent_tokenize(self.text_filter(text[1]))
                for i in range(0,len(sens)):
                    tokens = nltk.word_tokenize(sens[i])
                    tokens =[token.lower() for token in tokens if token.lower() not in stpw ]
                    yield tokens
                # keyset pagination: continue after the last processed id
                start = text[0]
                count += 1
                ##############
                # hard cap of 50000 texts per full iteration
                if count >= 50000 :
                    self.flag = False
                    break
                ##############

            if len(texts) == 0:
                self.flag = False

    def get(self,text_count= 100000):
        """Like __iter__, but materialize up to text_count texts into a list of token lists."""
        print 'get fun'
        start = 0
        limit = 1000
        count = 0
        token_list = list()
        while self.flag:
            print start,count
            cursor.execute(self.sql,(start,limit))
            texts = cursor.fetchall()
            for text in texts:
                sens = nltk.sent_tokenize(self.text_filter(text[1]))
                for i in range(0,len(sens)):
                    tokens = nltk.word_tokenize(sens[i])
                    tokens =[token.lower() for token in tokens if token.lower() not in stpw]
                    token_list.append(tokens)

                start = text[0]
                count += 1
                ##############
                if count >= text_count :
                    self.flag = False
                    break
                ##############

            if len(texts) == 0:
                self.flag = False
        return token_list
|
||||
|
||||
class Token4File(object):
    """Stream punctuation-free token lists, one per sentence, over every file
    in a directory.

    dirname -- directory whose files are read in os.listdir order.
    """
    def __init__(self, dirname):
        self.dirname = dirname
        self.flag = True  # unused here; kept for interface parity with Token4DB

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # Read the whole file up front; the 'with' block closes the handle
            # (the original leaked one open file descriptor per file).
            with open(os.path.join(self.dirname, fname)) as fp:
                content = ''.join(fp)
            sens = nltk.sent_tokenize(content)
            for i in range(0,len(sens)):
                tokens = nltk.word_tokenize(sens[i])
                tokens =[token for token in tokens if token not in punctuation]
                yield tokens
|
||||
|
||||
def get_so_excep_model():
    """Word2vec model trained on tag wiki excerpts, cached under model/so_excep_model."""
    model_path = 'model/so_excep_model'
    if not os.path.exists(model_path):
        excerpt_sql = "select id,tag_wiki_excerpt from tag where id>%s and tag_wiki_excerpt is not null limit %s"
        # train from scratch, then persist for later runs
        model = Word2Vec(Token4DB(excerpt_sql).get(),min_count=1)
        model.save(model_path)
        return model
    return Word2Vec.load(model_path)
|
||||
|
||||
def get_so_tag_model():
    """Word2vec model over both tag wiki excerpts and tag wiki bodies,
    cached under model/so_tag_model."""
    model_path = 'model/so_tag_model'
    if not os.path.exists(model_path):
        excerpt_sql = "select id,tag_wiki_excerpt from tag where id>%s and tag_wiki_excerpt is not null limit %s"
        body_sql = "select id,tag_wiki_body from tag where id>%s and tag_wiki_body is not null limit %s"

        # bodies are raw HTML, so they go through extract_so_wiki_body first
        sentences = Token4DB(excerpt_sql).get()
        sentences.extend(Token4DB(body_sql,extract_so_wiki_body).get())

        model = Word2Vec(sentences,min_count=1)
        model.save(model_path)
        return model
    return Word2Vec.load(model_path)
|
||||
|
||||
def get_so_post_model():
    """Word2vec model over 200k StackOverflow post bodies, cached under model/so_post_model."""
    model_path = 'model/so_post_model'
    if not os.path.exists(model_path):
        # post bodies are HTML, stripped by extract_so before tokenizing
        model = Word2Vec(Token4DB(sql_sel_so_pos,extract_so).get(200000),min_count=1)
        model.save(model_path)
        return model
    return Word2Vec.load(model_path)
|
||||
|
||||
def extract_so_wiki_body(text):
    """Extract plain paragraph text from a tag-wiki HTML body.

    Keeps the text inside every <p>...</p>, strips hyperlink tags (but keeps
    the link text), and removes <pre>/<code> blocks together with their content.
    Returns the concatenated paragraph texts.
    """
    #step 1: extract text between <p> and </p>
    texts = re.findall('<p>([\s\S]*?)</p>', text)

    #step 2: remove href markup and drop code/pre blocks entirely
    # fixed: the original pattern read '<pre>>[\s\S]*?</pre>' (stray '>'),
    # so <pre> blocks were never matched and survived into the output
    texts = [ re.sub('<a href[\s\S]*?>|</a>|<pre>[\s\S]*?</pre>|<code>[\s\S]*?</code>','',t) for t in texts]

    return ''.join(texts)
|
||||
|
||||
|
||||
def extract_so(text):
    """Strip a StackOverflow post body down to paragraph text.

    Keeps the content of every <p>...</p> (including text inside code/pre,
    whose tags are removed but whose content is kept) and drops hyperlink
    markup while keeping the link text.
    """
    #step 1: collect the inside of every <p>...</p>
    paragraphs = re.findall('<p>([\s\S]*?)</p>', text)

    #step 2: delete the markup tokens themselves, not their contents
    markup = '<a href[\s\S]*?>|</a>|<code>|</code>|<pre>|</pre>'
    cleaned = []
    for para in paragraphs:
        cleaned.append(re.sub(markup, '', para))

    return ''.join(cleaned)
|
||||
|
||||
if __name__ == "__main__":

    ################### so: model trained on tag wiki excerpts AND bodies ####
#     stemmer = nltk.PorterStemmer()
#     model = get_so_tag_model()
#     print model
    ##########################################################################

    ################### so: model trained on tag wiki excerpts only ##########
#     model = get_so_excep_model()
#     print model
# ##########################################################################

#     ################### so: model trained on 200k posts ######################
#     model = get_so_post_model()
#     print model
# # ##########################################################################

#     model = Word2Vec.load('model/so_post_model-1w')
#     model = model = get_so_excep_model()

#     ################### write most_similar comparison to csv #################
    # Compare four pre-trained models: for each query term, write one CSV row
    # with the most similar words reported by every model.
    import csv
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')  # Python 2: allow non-ASCII in csv output
    model_so_tag_excerpt = Word2Vec.load('model/so_excep_model')
    model_so_tag = Word2Vec.load('model/so_tag_model')
    model_so_post_1w = Word2Vec.load('model/so_post_model-1w')
    model_so_post_20w = Word2Vec.load('model/so_post_model_20w')
    print 'model is loaded'
    queries = ['python','java','c','c++','ruby','scala',
               'windows','linux','mac','android','ios','win8','x86',
               'html','json','css','xml','http',
               'redis','eclipse', 'mysql', 'scikit-learn', 'numpy', 'panda', 'django','spring', 'git',
               'machine','learning', 'ide','directory','class', 'apache', 'maven'
               ]
    titles = ['query','so_tag_excerpt','so_tag','so_post_1w','so_post_20w']
    models = [model_so_tag_excerpt,model_so_tag,model_so_post_1w,model_so_post_20w]
    with open('result.csv','w+') as fp:
        cfp = csv.writer(fp,dialect='excel')
        cfp.writerow(titles)
        for query in queries:
            row_data = list()
            row_data.append(query)
            for model in models:
                try:
                    # one cell: the nearest words, newline separated
                    row_data.append('\n'.join([item[0] for item in model.most_similar(query)]))
                except Exception,e:
                    # query word missing from this model's vocabulary
                    row_data.append("null")

            cfp.writerow(row_data)
#     # ##########################################################################
|
||||
|
||||
|
||||
|
||||
|
||||
# ################## 语言 ##################
|
||||
# print 'python','>>>>>>>>>', model.most_similar(positive=['python'])
|
||||
# print 'java','>>>>>>>>>', model.most_similar('java')
|
||||
# print 'c','>>>>>>>>>', model.most_similar('c')
|
||||
# print 'c++','>>>>>>>>>', model.most_similar('c++')
|
||||
# print 'ruby','>>>>>>>>>', model.most_similar('ruby')
|
||||
# # print 'c#','>>>>>>>>>', model.most_similar('c#')
|
||||
# print 'scala','>>>>>>>>>', model.most_similar('scala')
|
||||
#
|
||||
#
|
||||
# print '**********************'
|
||||
# ################## 平台 ##################
|
||||
# print 'windows','>>>>>>>>>', model.most_similar(positive=['windows'])
|
||||
# print 'linux','>>>>>>>>>', model.most_similar('linux')
|
||||
# print 'mac','>>>>>>>>>', model.most_similar('mac')
|
||||
# print 'android','>>>>>>>>>', model.most_similar('android')
|
||||
# print 'ios','>>>>>>>>>', model.most_similar('ios')
|
||||
# # print 'win8','>>>>>>>>>', model.most_similar('win8')
|
||||
# print 'x86','>>>>>>>>>', model.most_similar('x86')
|
||||
#
|
||||
#
|
||||
# print '**********************'
|
||||
# ################## 标准 ##################
|
||||
# print 'html','>>>>>>>>>', model.most_similar(positive=['html'])
|
||||
# print 'json','>>>>>>>>>', model.most_similar('json')
|
||||
# print 'css','>>>>>>>>>', model.most_similar('css')
|
||||
# print 'xml','>>>>>>>>>', model.most_similar('xml')
|
||||
# print 'http','>>>>>>>>>', model.most_similar('http')
|
||||
#
|
||||
#
|
||||
#
|
||||
# print '**********************'
|
||||
# ################## 软件 ##################
|
||||
# # print 'redis','>>>>>>>>>',model.most_similar(positive=['redis'])
|
||||
# print 'eclipse', '>>>>>>>>>',model.most_similar(positive=['eclipse'])
|
||||
# print 'mysql', '>>>>>>>>>',model.most_similar(positive=['mysql'])
|
||||
# # print 'scikit-learn', '>>>>>>>>>',model.most_similar(positive=['scikit-learn'])
|
||||
# print 'numpy', '>>>>>>>>>',model.most_similar(positive=['numpy'])
|
||||
# # print 'panda', '>>>>>>>>>',model.most_similar(positive=['panda'])
|
||||
# print 'django', '>>>>>>>>>',model.most_similar(positive=['django'])
|
||||
# print 'spring', '>>>>>>>>>',model.most_similar(positive=['spring'])
|
||||
# print 'git', '>>>>>>>>>',model.most_similar(positive=['git'])
|
||||
#
|
||||
# print '**********************'
|
||||
# ################## 术语 ##################
|
||||
# print 'machine','>>>>>>>>>', model.most_similar('machine')
|
||||
# print 'learning', '>>>>>>>>>',model.most_similar('learning')
|
||||
# print 'ide','>>>>>>>>>', model.most_similar('ide')
|
||||
# print 'directory', '>>>>>>>>>',model.most_similar(positive=['directory'])
|
||||
# print 'class', '>>>>>>>>>',model.most_similar(positive=['class'])
|
||||
# print 'apache', '>>>>>>>>>',model.most_similar(positive=['apache'])
|
||||
# print 'maven', '>>>>>>>>>',model.most_similar(positive=['maven'])
|
||||
#
|
||||
#
|
||||
# print '**********************'
|
||||
# ################## 组合 ##################
|
||||
# print 'machine learning', '>>>>>>>>>',model.most_similar(positive=['machine','learning'])
|
||||
# print 'python machine learning', '>>>>>>>>>',model.most_similar(positive=['python','machine','learning'])
|
||||
# print 'java ide', '>>>>>>>>>',model.most_similar(positive=['java','ide'])
|
||||
# print 'java log', '>>>>>>>>>',model.most_similar(positive=['java','log'])
|
||||
# print 'java logging', '>>>>>>>>>',model.most_similar(positive=['java','logging'])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
###########################################################
|
||||
# so 标签的wiki excerpt和body
|
||||
# print extract_so_wiki_body("<p>12345</p><p>12345</p>")
|
||||
# sql_sel_so_body = "select tag_wiki_body from tag where id>=0 and tag_wiki_body is not null limit 1"
|
||||
# cursor.execute(sql_sel_so_body)
|
||||
# bodys = [item[0] for item in cursor.fetchall()]
|
||||
# for body in bodys:
|
||||
# try:
|
||||
# print body
|
||||
# print '-------'
|
||||
# print extract_so_wiki_body(body)
|
||||
# print '**********************'
|
||||
# except Exception,e:
|
||||
# pass
|
||||
|
||||
##########################################################
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
###########################################################
|
||||
# so帖子的model,
|
||||
# tokens = Token4DB(sql_sel_so_pos,extract_so)
|
||||
#
|
||||
# for text in tokens:
|
||||
# print text
|
||||
#
|
||||
#
|
||||
# model = Word2Vec(tokens,min_count=1)
|
||||
# print model
|
||||
# model.save('model/yield')
|
||||
# print model.similarity('question', 'java')
|
||||
# print model.most_similar('java')
|
||||
# print '--------------------------------'
|
||||
|
||||
|
||||
# tokens = Token4DB(sql_sel_so_pos,extract_so).get()
|
||||
# # for text in tokens:
|
||||
# # print text
|
||||
# #
|
||||
# model = Word2Vec(tokens,min_count=1)
|
||||
# print model
|
||||
# model.save('model/list')
|
||||
|
||||
# model = Word2Vec.load('model/list')
|
||||
# print 'machine',model.most_similar('machine')
|
||||
# print 'learning',model.most_similar('learning')
|
||||
# print 'machine learning',model.most_similar(positive = ['machine','learning'])
|
||||
#
|
||||
# print 'question-java',model.similarity('question', 'java')
|
||||
# print 'java-eclipse',model.similarity('java','eclipse')
|
||||
#
|
||||
#
|
||||
#
|
||||
# print 'java + ide',model.most_similar(positive = ['java','ide'])
|
||||
#
|
||||
# print 'java',model.most_similar('java')
|
||||
#
|
||||
# print 'ide',model.most_similar('ide')
|
||||
# print 'eclipse',model.most_similar('eclipse')
|
||||
# print 'maven',model.most_similar('maven')
|
||||
# print '--------------------------------'
|
||||
######################################################################
|
||||
|
||||
|
||||
|
||||
# model = get_so_excep_model()
|
||||
# print model
|
||||
#
|
||||
# print model.similarity('python', 'java')
|
||||
# print model.similarity('python', 'c++')
|
||||
# print model.similarity('python', 'ruby')
|
||||
|
||||
# model = get_so_post_model()
|
||||
# print model
|
||||
#
|
||||
|
||||
# print model.similarity('question', 'Object')
|
||||
# print model.similarity('python', 'c++')
|
||||
# print model.similarity('python', 'ruby')
|
||||
#
|
||||
#
|
||||
#
|
||||
# print model.most_similar(positive=['python'])
|
||||
# print model.most_similar(positive=['Object'])
|
||||
# print model.most_similar(positive=['redis'])
|
||||
# print model.most_similar(positive=['windows','java','ide'])
|
||||
# print model.most_similar(positive=['eclipse'])
|
||||
# print model.most_similar(positive=['delete'])
|
||||
# print model.most_similar(positive=['java','eclipse'],negative = ['c++'])
|
||||
|
||||
# print extract_so("<p>12345</p><p>12345</p>")
|
||||
# sql_sel_so_pos = 'select body from shit_lee limit 10'
|
||||
# cursor.execute(sql_sel_so_pos)
|
||||
# bodys = [item[0] for item in cursor.fetchall()]
|
||||
# for body in bodys:
|
||||
# try:
|
||||
# print body
|
||||
# print '-------'
|
||||
# print extract_so(body)
|
||||
# print '**********************'
|
||||
# except Exception,e:
|
||||
# pass
|
||||
|
||||
# model = get_so_post_model()
|
||||
# print model
|
||||
|
||||
|
Binary file not shown.
|
@ -0,0 +1,221 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年10月6日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
import MySQLdb
|
||||
from _elementtree import Comment
|
||||
|
||||
|
||||
###################################################################
|
||||
conn2 = MySQLdb.connect("localhost","starlee","1234","query_expansion" )
|
||||
cursor2 = conn2.cursor()
|
||||
|
||||
#加载最后要被过滤掉的项目名
|
||||
sql_tag_is_prj = 'select name,is_prj_name from tag'
|
||||
cursor2.execute(sql_tag_is_prj)
|
||||
tag_is_prj_name = {row[0]:row[1] for row in cursor2.fetchall()}
|
||||
|
||||
#加载过滤掉的tag
|
||||
with open("stop_word_prj_name") as file:
|
||||
stop_words_prj_name = set([line.strip() for line in file.readlines() if(( not line.startswith("#")) and line.strip()) ])
|
||||
|
||||
#tag同义词
|
||||
sql_get_syns = "select from_tag,to_tag from synonym"
|
||||
cursor2.execute(sql_get_syns)
|
||||
syns = {row[0]:row[1] for row in cursor2.fetchall()}
|
||||
|
||||
def get_syns(term):
    """Map *term* to its canonical tag via the synonym table; unknown terms pass through unchanged."""
    return syns.get(term, term)
|
||||
|
||||
|
||||
##################################################################
|
||||
|
||||
|
||||
|
||||
##################################################################
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
sql_sel_linked_tags = 'select t2,count from tag_postlinks where t1=%s union select t1,count from tag_postlinks where t2=%s'
|
||||
# sql_sel_cf_tags = 'select t2,rv from `stdb_cf` where t1=%s union select t1,rv from `stdb_cf` where t2=%s'
|
||||
sql_sel_cf_tags = 'select t2,rv from `stdb_cf` where id1=%s union select t1,rv from `stdb_cf` where id2=%s'
|
||||
# sql_sel_co_tags = 'select t2,count from stdb_co where t1=%s union select t1,count from stdb_co where t2=%s'
|
||||
sql_sel_co_tags = 'select t2,count from stdb_co where id1=%s union select t1,count from stdb_co where id2=%s'
|
||||
sql_sel_dpl_tags = 'select t2,count from stdb_dpl where t1=%s union select t1,count from stdb_dpl where t2=%s'
|
||||
|
||||
#tag的count
|
||||
sql_sel_tags = 'select tagname,count,id from tags'
|
||||
cursor.execute(sql_sel_tags)
|
||||
result = cursor.fetchall()
|
||||
tags = {item[0]:item[1] for item in result}
|
||||
tags_id = {item[0]:item[2] for item in result}
|
||||
|
||||
tags_name_set = set(tags.keys())
|
||||
##################################################################
|
||||
|
||||
|
||||
|
||||
##################################################################
|
||||
#related tags of one tag via co-occurrence, returned as {tag: rank_value}
def get_co_tags(item):
    """Related tags of *item* from the stdb_co co-occurrence table.

    A co-occurring tag t scores count^2 / global_count(t): frequent
    co-occurrence is rewarded while globally popular tags are damped.
    Tag names are canonicalized through get_syns.
    """
    tag_id = tags_id[item]
    cursor.execute(sql_sel_co_tags,(tag_id,tag_id))
    result = cursor.fetchall()

#     total_count = sum([item[1] for item in result])
#     return sorted([(item[0],1.0*item[1]/total_count) for item in result],lambda x, y: cmp(x[1], y[1]),reverse = True)
    # membership guard: KeyError was frequent here because some tags are
    # missing from the tags table (translated from the original comment)
    return {get_syns(t_c[0]):1.0*t_c[1] * t_c[1]/(tags[get_syns(t_c[0])]) for t_c in result if get_syns(t_c[0]) in tags_name_set}
|
||||
|
||||
|
||||
#related tags of one tag via duplicate post links, returned as {tag: rank_value}
def get_duplink_tags(item):
    """Related tags of *item* from the stdb_dpl duplicate-link table,
    scored as each pair's link count over the total link count of *item*.
    """
    # NOTE (translated): a skewed distribution like (4,1,1,1,1,1,1,1,1)
    # dilutes the score of the strongest pair
    cursor.execute(sql_sel_dpl_tags,(item,item))
    result = cursor.fetchall()

    total_count = sum([item[1] for item in result])
#     return sorted([(item[0],1.0*item[1]/total_count) for item in result],lambda x, y: cmp(x[1], y[1]),reverse = True)
    return {get_syns(item[0]):1.0*item[1]/total_count for item in result}
|
||||
|
||||
|
||||
|
||||
#related tags of one tag via post links, returned as {tag: rank_value}
def get_linked_tags(item):
    """Related tags of *item* from the tag_postlinks table,
    scored as each pair's link count over the total link count of *item*.
    """
    # NOTE (translated): a skewed distribution like (4,1,1,1,1,1,1,1,1)
    # dilutes the score of the strongest pair
    cursor.execute(sql_sel_linked_tags,(item,item))
    result = cursor.fetchall()

    total_count = sum([item[1] for item in result])
#     return sorted([(item[0],1.0*item[1]/total_count) for item in result],lambda x, y: cmp(x[1], y[1]),reverse = True)
    return {get_syns(item[0]):1.0*item[1]/total_count for item in result}
|
||||
|
||||
|
||||
#related tags of one tag via collaborative filtering, returned as {tag: rank_value}
def get_cf_tags(item):
    """Related tags of *item* from the stdb_cf collaborative-filtering table.

    Returns {canonical_tag_name: rv} with rv being the stored relevance value.
    """
    #(tag,rv)
    tag_id = tags_id[item]
    # fixed: this used to execute sql_sel_co_tags (co-occurrence) by mistake,
    # so the "cf" ranking silently returned co-occurrence counts instead of rv
    cursor.execute(sql_sel_cf_tags,(tag_id,tag_id))
    result = cursor.fetchall()

    return {get_syns(item[0]):item[1] for item in result}
|
||||
##################################################################
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':

    ########### single search
    query = 'java ide'
    items = query.split(' ')
    items = [get_syns(term) for term in items]  # canonicalize each term via tag synonyms

    total_related_tags = list()# one {tag: rank_value} dict per query term

    get_related_tags = get_co_tags# relatedness source used below
#     get_related_tags = get_duplink_tags# alternative: duplicate post links
#     get_related_tags = get_cf_tags# alternative: collaborative filtering

    #### rank values of the tags related to each term ########################
    related_tags = get_related_tags(items[0])

    total_related_tags.append(related_tags)
    commen_tags = set(related_tags.keys())# tags related to every term of the query

    for item in items[1:]:
        related_tags = get_related_tags(item)
        total_related_tags.append(related_tags)
        commen_tags.intersection_update(set(related_tags.keys()))
#         print sorted(related_tags.items(),lambda x, y: cmp(x[1], y[1]),reverse = True)

    # score each shared tag as the product of its per-term rank values,
    # dropping tags flagged as non-project names or listed as stop words
    commen_tags_score = list()
    for item in commen_tags:
        if (item in tag_is_prj_name and tag_is_prj_name[item] == 0) or item in stop_words_prj_name :
            continue
        score = 1
        for tlt in total_related_tags:
            score *= tlt[item]
        commen_tags_score.append((item,score))

    # keep the top 10 by score (Python 2 cmp-style sort)
    final_result = list()
    for item in sorted(commen_tags_score,lambda x, y: cmp(x[1], y[1]),reverse = True)[:10]:
        final_result.append(item[0])
        print item[0]
    #################################
|
||||
|
||||
|
||||
    # ### similarity in wordembedding model
    # Pairwise cosine similarity between the result tags, using the word2vec
    # model trained on tag wiki excerpts (loaded into the other.tag_doc module).
    import other.tag_doc as td
    td.model_2 = td.Word2Vec.load('other/model/so_excep_model')

    tag_vec = dict()  # tag -> embedding vector (only for tags present in the model)
    for tag in final_result:
        if tag in td.model_2:
            tag_vec[tag] = td.get_text_vec(tag)

    # rm[i] holds formatted similarity strings of valid_tag[i] with every later tag
    rm = list()
    valid_tag = [item for item in final_result if item in tag_vec]
    for i in range(0,len(valid_tag)):
#         if valid_tag[i] in tag_vec:
        tmp_l = list()
        for j in range(i+1,len(valid_tag)):
#             if final_result[j] in tag_vec:
            tmp_l.append("(%s,%s,%f)"%(valid_tag[i],valid_tag[j],
                                       td.coss(tag_vec[valid_tag[i]],tag_vec[valid_tag[j]])) )
        if len(tmp_l) > 0:
            rm.append(tmp_l)

    # print the pair matrix one anti-diagonal per line
    for i in range(0,len(rm)):
        line = ''
        for j in range(0,i+1):
            line = "%s\t%s"%(line,rm[j][i-j])
        print line
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# ### relevance value in stdb_dpl and stdb_cf
|
||||
# for i1 in final_result:
|
||||
# for i2 in final_result:
|
||||
# if i1 == i2:
|
||||
# continue
|
||||
# print "for %s-%s:"%(i1,i2)
|
||||
#
|
||||
#
|
||||
#
|
||||
#
|
||||
# cursor.execute("SELECT count FROM `stdb_dpl` WHERE (`t1` = %s and `t2` = %s) or (`t2` = %s and `t1` = %s)",(i1,i2,i2,i1))
|
||||
# count_dpl = cursor.fetchone()
|
||||
# if count_dpl != None:
|
||||
# print " >dpl:%d"%count_dpl[0]
|
||||
# else:
|
||||
# print " >dpl:none"
|
||||
#
|
||||
#
|
||||
# cursor.execute("SELECT rv FROM `stdb_cf_0&1` WHERE (`t1` = %s and `t2` = %s) or (`t2` = %s and `t1` = %s)",(i1,i2,i2,i1))
|
||||
# count_cf = cursor.fetchone()
|
||||
# if count_cf != None:
|
||||
# print " >cf:%f"%count_cf[0]
|
||||
# else:
|
||||
# print " >cf:none"
|
||||
# print '----------------------------------'
|
||||
### relevance value in stdb_dpl and stdb_cf
|
||||
|
||||
########### single search
|
||||
|
||||
|
||||
    # release the sldb connection opened at module import
    cursor.close()
    conn.close()
|
|
@ -0,0 +1,39 @@
|
|||
#coding:utf-8
'''
Created on 2016-10-19

Manual spot check: print a tag's top-20 neighbours by duplicate-link count
and by collaborative-filtering relevance, filtered to known project names.

@author: StarLee
'''
import MySQLdb


conn2 = MySQLdb.connect("localhost","starlee","1234","query_expansion" )
cursor2 = conn2.cursor()

#load the tag -> is_project_name flags used for filtering below
sql_tag_is_prj = 'select name,is_prj_name from tag'
cursor2.execute(sql_tag_is_prj)
tag_is_prj_name = {row[0]:row[1] for row in cursor2.fetchall()}



conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()

# top-20 neighbours by duplicate-link count / by collaborative-filtering rv
sql_dpl = "(SELECT t2,count FROM `stdb_dpl` WHERE `t1` = %s order by count desc limit 20) union (SELECT t1,count FROM `stdb_dpl` WHERE `t2` = %s order by count desc limit 20)"
sql_cf = "(SELECT t2,rv FROM `stdb_cf_0&1` WHERE `t1` = %s order by rv desc limit 20) union (SELECT t1,rv FROM `stdb_cf_0&1` WHERE `t2` = %s order by rv desc limit 20)"

tag = 'eclipse'
# duplicate-link neighbours, keeping only tags present in the project-name table
cursor.execute(sql_dpl,(tag,tag))
result = [item[0] for item in cursor.fetchall() if item[0] in tag_is_prj_name]

for r in result[:20]:
    print r

print '\n---------------------\n'

# collaborative-filtering neighbours, same filter
cursor.execute(sql_cf,(tag,tag))
result = [item[0] for item in cursor.fetchall() if item[0] in tag_is_prj_name]

for r in result[:20]:
    print r
|
|
@ -0,0 +1,124 @@
|
|||
#coding:utf-8
|
||||
'''
|
||||
Created on 2016年10月7日
|
||||
|
||||
@author: StarLee
|
||||
'''
|
||||
import MySQLdb
|
||||
import logging
|
||||
import numpy
|
||||
import math
|
||||
logger = logging.getLogger()
|
||||
hdlr = logging.FileHandler("top_cf_2.log")
|
||||
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
|
||||
hdlr.setFormatter(formatter)
|
||||
logger.addHandler(hdlr)
|
||||
logger.setLevel(logging.NOTSET)
|
||||
|
||||
|
||||
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
|
||||
cursor = conn.cursor()
|
||||
|
||||
|
||||
sql_sel_tag = 'select id,tagname from tags'
|
||||
sql_ins_tag_cf = "insert into stdb_cf(id1,t1,id2,t2,rv) values(%s,%s,%s,%s,%s)"
|
||||
|
||||
|
||||
cursor.execute(sql_sel_tag)
|
||||
total_tags = {item[1]:item[0] for item in cursor.fetchall()}#所有的tag<name,id>
|
||||
tags = total_tags.keys()
|
||||
|
||||
#
|
||||
# top_users = dict()# 所有tag对应的top user
|
||||
#
|
||||
# for tag in tags:
|
||||
# cursor.execute(sql_sel_top_user,(tag,))
|
||||
# result = cursor.fetchall()
|
||||
# top_users[tag] = set([item[0] for item in result])
|
||||
# logger.info("%s is done"%tag)
|
||||
#
|
||||
# logger.info('top_users is done')
|
||||
|
||||
|
||||
|
||||
#####################################
# Build top_users: for every tag, the set of its 50 most active answerers,
# paged out of user_tag_count keyset-style.
top_users = dict()# tag -> set of user ids
sql_sel_utc = 'select id,tag,user_id,count from user_tag_count where id>%s limit 1000'
start = 0
while True:
    cursor.execute(sql_sel_utc,(start,))
    result = cursor.fetchall()

    lens = len(result)
    logger.info(">=%d:%d"%(start,lens))
    if lens==0:
        logger.info("done top_users")
        print "done top_users"
        break

    for item in result:
        start = item[0]  # keyset pagination: continue after the last seen id
        if item[1] not in top_users:
            top_users[item[1]] = list()
        top_users[item[1]].append((item[2],item[3]))


# keep only the 50 users with the highest answer count per tag
for tag,u_c in top_users.items():
    top_users[tag] = set([s_uc[0] for s_uc in sorted(u_c,key=lambda x:x[1],reverse=True)[:50]])

logger.info('done sorted top_users')
print 'done sorted top_users'
#####################################
|
||||
|
||||
|
||||
# For every tag pair, score similarity as the overlap of their top-answerer
# sets, and store pairs with a non-negligible score in stdb_cf.
for i in range(0,len(tags)):
    if tags[i] not in top_users:
        continue
    top_user_i = top_users[tags[i]]
#     logger.info(total_tags[tags[i]])
#     print top_user_i

    for j in range(i+1,len(tags)):
        if tags[j] not in top_users:
            continue
        top_user_j = top_users[tags[j]]
#         print top_user_j

#         # union of the two user sets
#         commen_users = top_user_i | top_user_j
#         # build binary membership vectors over the union
#         vec_i,vec_j = list(),list()
#         for commen_user in commen_users:
#             if commen_user in top_user_i:
#                 vec_i.append(1)
#             else:
#                 vec_i.append(0)
#
#             if commen_user in top_user_j:
#                 vec_j.append(1)
#             else:
#                 vec_j.append(0)
#
#         n_vec_i = numpy.matrix(vec_i)
#         n_vec_j = numpy.matrix(vec_j)
#         num = float(n_vec_i * n_vec_j.T)
#
#         denom = numpy.linalg.norm(n_vec_i ) * numpy.linalg.norm( n_vec_j )
#
#         cos = num / denom # cosine value

        # cosine over binary membership vectors reduces to this closed form:
        # shared users / geometric mean of the two set sizes
        commen_users = top_user_i & top_user_j
        cos = len(commen_users) / math.sqrt(len(top_user_i) * len(top_user_j))

        if cos > 0.00001:
            cursor.execute(sql_ins_tag_cf,(total_tags[tags[i]],tags[i],total_tags[tags[j]],tags[j],cos))
#             print num,denom,cos
    # commit once per outer tag (NOTE(review): original indentation was lost
    # in extraction — confirm commit placement against history)
    conn.commit()

cursor.close()
conn.close()
|
||||
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
#coding:utf-8
'''
Created on 2016-10-04

@author: StarLee
'''
import logging
import re

import MySQLdb
from _mysql import NULL  # kept: imported by the original module

# Log to a dedicated file with timestamps and level names.
logger = logging.getLogger()
file_handler = logging.FileHandler("tag_coocur.log")
file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
logger.addHandler(file_handler)
logger.setLevel(logging.NOTSET)

# Shared MySQL connection/cursor used by every helper below.
conn = MySQLdb.connect(host="localhost", user="starlee", passwd="1234", db="sldb", charset='utf8')
cursor = conn.cursor()

# tag name -> tag id, loaded once so saving can resolve ids without queries.
cursor.execute('select id,tagname from tags')
raw_tags = {row[1]: row[0] for row in cursor.fetchall()}
|
||||
|
||||
def get_tags():
    """Fetch the next batch of tagged posts and return their tag lists.

    Reads the persisted "tag_cooccur_posts" pointer, pulls up to 10000
    posts past it, advances the pointer, and returns one list of tag
    names per post.  Returns None when no posts remain.
    """
    cursor.execute('select value from pointers where name = "tag_cooccur_posts"')
    pointer = cursor.fetchone()[0]
    logger.info("last pointer: %d" % pointer)

    cursor.execute('select id,tags from posts where id>%s and tags is not null limit %s',
                   (pointer, 10000))
    posts = cursor.fetchall()
    if not posts:
        return None

    # Persist the new pointer (id of the last post in this batch).
    cursor.execute('update pointers set value=%s where name = "tag_cooccur_posts"',
                   (posts[-1][0],))
    conn.commit()

    # Tags are stored as "<t1><t2>..."; extract the bare names.
    return [re.findall('<([\s\S]*?)>', row[1]) for row in posts]
|
||||
|
||||
|
||||
|
||||
def save_tag_co_occure(tag_cooccure):
    """Persist co-occurrence counts into stdb_co.

    tag_cooccure maps "t1-&-t2" (tag names sorted, joined by "-&-") to
    the number of posts in which that pair co-occurred.  Tag names not
    present in raw_tags are stored with id -1, as before.
    """
    for key, value in tag_cooccure.items():
        t1, t2 = key.split("-&-")
        # dict.get replaces the original broad try/except around a plain
        # dict lookup; the fallback id stays -1.
        id1 = raw_tags.get(t1, -1)
        id2 = raw_tags.get(t2, -1)
        cursor.execute('insert into stdb_co(id1,t1,id2,t2,count) values(%s,%s,%s,%s,%s)',
                       (id1, t1, id2, t2, value))
    # One commit for the whole batch of inserts.
    conn.commit()
|
||||
|
||||
tag_co_occure = dict()  # processed tag pair "t1-&-t2" -> co-occurrence count

if __name__ == '__main__':
    from itertools import combinations

    # Stream posts batch by batch and count every unordered tag pair.
    tagss = get_tags()
    while tagss is not None:
        for tags in tagss:
            # combinations(tags, 2) yields exactly the (i, j) pairs with
            # i < j that the original nested range loops produced.
            for tag_a, tag_b in combinations(tags, 2):
                repre = '-&-'.join(sorted([tag_a, tag_b]))
                tag_co_occure[repre] = tag_co_occure.get(repre, 0) + 1
        tagss = get_tags()

    save_tag_co_occure(tag_co_occure)
|
|
@ -0,0 +1,82 @@
|
|||
#coding:utf-8
'''
Created on 2016-10-04

@author: StarLee
'''
# Count tag relations between duplicate posts.
import logging
import re

import MySQLdb
from _mysql import NULL  # kept: imported by the original module

logger = logging.getLogger()
file_handler = logging.FileHandler("post_duplicate.log")
file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
logger.addHandler(file_handler)
logger.setLevel(logging.NOTSET)

conn = MySQLdb.connect(host="localhost", user="starlee", passwd="1234", db="sldb", charset='utf8')
cursor = conn.cursor()

# All "duplicate" post links (LinkTypeId = 3) below the crawled PostId bound.
cursor.execute("SELECT Id,PostId,RelatedPostId FROM `postlinks` WHERE `LinkTypeId` = '3' AND `PostId` < '25829714' ORDER BY `PostId`")
postlinks = cursor.fetchall()
|
||||
|
||||
tag_dpl = dict()  # tag pair "t1-&-t2" -> count over duplicate links

# Hoisted out of the loop: same pattern is applied to every post's tags.
tag_pattern = re.compile('<([\s\S]*?)>')

for postlink in postlinks:
    post_id, related_post_id = postlink[1], postlink[2]
    cursor.execute('select tags from posts where Id = %s', (post_id,))
    tags1 = cursor.fetchone()

    cursor.execute('select tags from posts where Id = %s', (related_post_id,))
    tags2 = cursor.fetchone()

    # "is None" instead of "== None"; skip links whose posts were deleted.
    if tags1 is None or tags2 is None:
        continue

    ts1 = set(tag_pattern.findall(tags1[0]))
    ts2 = set(tag_pattern.findall(tags2[0]))
    com = ts1 & ts2
    # Only pairs of tags NOT shared by both posts are interesting here.
    if len(ts1 - com) > 0 and len(ts2 - com) > 0:
        logger.info("%s-%s>> %s - %s" % (post_id, related_post_id,
                                         ','.join(list(ts1 - com)),
                                         ','.join(list(ts2 - com))))
        for item1 in ts1 - com:
            for item2 in ts2 - com:
                repre = '-&-'.join(sorted([item1, item2]))
                tag_dpl[repre] = tag_dpl.get(repre, 0) + 1
|
||||
|
||||
|
||||
#
|
||||
# cursor.execute('select id,tagname from tags')
|
||||
# raw_tags = {item[1]:item[0] for item in cursor.fetchall()}
|
||||
#
|
||||
# for key, value in tag_dpl.items():
|
||||
# t1, t2 = key.split("-&-")
|
||||
# try:
|
||||
# id1 = raw_tags[t1]
|
||||
# except Exception, e:
|
||||
# id1 = -1
|
||||
# try:
|
||||
# id2 = raw_tags[t2]
|
||||
# except Exception, e:
|
||||
# id2 = -1
|
||||
# cursor.execute('insert into stdb_dpl(id1,t1,id2,t2,count) values(%s,%s,%s,%s,%s)', (id1, t1, id2, t2, value))
|
||||
# conn.commit()
|
||||
########################### comment for log view
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# fp.write("%s-%s >> %s - %s\n"%(tags1[0],tags2[0],','.join(),','.join(list(ts2 - com))))
|
||||
# count += 1
|
||||
# print count
|
||||
|
||||
|
|
@ -0,0 +1,126 @@
|
|||
#coding:utf-8
'''
Created on 2016-10-04

@author: StarLee
'''
import logging
import re

import MySQLdb
from _mysql import NULL  # kept: imported by the original module

logger = logging.getLogger()
file_handler = logging.FileHandler("tag_postlinks.log")
file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
logger.addHandler(file_handler)
logger.setLevel(logging.NOTSET)

conn = MySQLdb.connect(host="localhost", user="starlee", passwd="1234", db="sldb", charset='utf8')
cursor = conn.cursor()

# tag name -> tag id, loaded once for id resolution while saving.
cursor.execute('select id,name from tag')
raw_tags = {row[1]: row[0] for row in cursor.fetchall()}
|
||||
|
||||
def get_post_links():
    """Return the next batch of (Id, PostId, RelatedPostId) rows, or None.

    Reads the persisted "postlinks" pointer, fetches up to 10000 rows
    past it, then advances the pointer to the last row returned.
    """
    cursor.execute('select value from pointers where name = "postlinks"')
    pointer = cursor.fetchone()[0]
    logger.info("last pointer: %d" % pointer)

    cursor.execute('select Id,PostId,RelatedPostId from postlinks where id>%s limit %s',
                   (pointer, 10000))
    batch = cursor.fetchall()
    if not batch:
        return None

    cursor.execute('update pointers set value=%s where name = "postlinks"',
                   (batch[-1][0],))
    conn.commit()

    return batch
|
||||
|
||||
|
||||
def get_tags(post_id):
    """Return the list of tag names for a post, or None.

    None is returned in two cases the caller treats identically:
    the post row is missing (deleted), or its tags column is NULL
    (the post is an answer, which carries no tags).
    """
    cursor.execute('select tags from posts where Id = %s', (post_id,))
    row = cursor.fetchone()
    if row is None:  # post not found: it was deleted
        return None
    tags = row[0]
    if tags is None:  # post is an answer: tags column is NULL
        return None
    return re.findall('<([\s\S]*?)>', tags)
|
||||
|
||||
def update_bad_post_links(bad_post_links):
    """Add bad_post_links to the persisted "bad_postlinks" counter.

    Uses a single atomic UPDATE instead of the original select-then-
    update round trip, which could lose increments if two runs ever
    overlapped and needed one query fewer either way.
    """
    cursor.execute('update pointers set value = value + %s where name = "bad_postlinks"',
                   (bad_post_links,))
    conn.commit()
|
||||
def save_tag_postlinks(tag_post_link):
    """Persist linked-tag counts into tag_postlinks.

    tag_post_link maps "t1-&-t2" (tag names sorted, joined by "-&-") to
    the number of post links connecting posts carrying those tags.
    Tag names not present in raw_tags are stored with id -1, as before.
    """
    for key, value in tag_post_link.items():
        t1, t2 = key.split("-&-")
        # dict.get replaces the original broad try/except around a plain
        # dict lookup; the fallback id stays -1.
        id1 = raw_tags.get(t1, -1)
        id2 = raw_tags.get(t2, -1)
        cursor.execute('insert into tag_postlinks(id1,t1,id2,t2,count) values(%s,%s,%s,%s,%s)',
                       (id1, t1, id2, t2, value))
    # One commit for the whole batch of inserts.
    conn.commit()
|
||||
|
||||
tag_post_link = dict()  # tag pair "t1-&-t2" -> count over post links

if __name__ == '__main__':
    # NOTE(review): the original defined an unused `headers` dict here
    # (leftover from scraping code); removed.

    # step 1: walk postlinks batch by batch
    postlinks = get_post_links()
    while postlinks is not None:
        for postlink in postlinks:
            post_id, related_post_id = postlink[1], postlink[2]

            # step 2: resolve both posts' tag lists
            tags = get_tags(post_id)
            related_tags = get_tags(related_post_id)

            # Record links whose posts are deleted/untagged, then skip.
            # (These inserts are committed by the next get_post_links call.)
            if tags is None or related_tags is None:
                cursor.execute('insert into bad_tag_postlinks(post_id,related_post_id) values(%s,%s)',
                               (post_id, related_post_id))
                continue

            # step 3: count every cross pair of distinct tags
            for tag in tags:
                for related_tag in related_tags:
                    if tag == related_tag:
                        continue  # identical tags carry no signal
                    repre = '-&-'.join(sorted([tag, related_tag]))
                    tag_post_link[repre] = tag_post_link.get(repre, 0) + 1
        postlinks = get_post_links()

    save_tag_postlinks(tag_post_link)
|
Loading…
Reference in New Issue