StarLee 2016-10-21 09:48:36 +08:00
parent ac1a65d88f
commit 5c8f7ddcf8
29 changed files with 32164 additions and 1 deletion

2
.gitignore vendored

@@ -2,7 +2,7 @@
 .project
 .pydevproject
 *.png
-model/*
+other/model/*
 *.log
 files/*
 *.csv

File diff suppressed because it is too large


@@ -0,0 +1,102 @@
#coding:utf-8
'''
Created on Oct 7, 2016
@author: StarLee
'''
import MySQLdb
import logging
logger = logging.getLogger()
hdlr = logging.FileHandler("num_answer_stastics.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
def get_answers():
cursor.execute('select value from pointers where name = "num_answer_stastics"')
pointer = cursor.fetchone()[0]
logger.info("last pointer: %d"%pointer)
cursor.execute('select id,posttypeid,owneruserid from posts where id>%s limit %s',(pointer,1000))
posts = cursor.fetchall()
if posts is None or len(posts) == 0:
return None
new_pointer = posts[-1][0]
cursor.execute('update pointers set value=%s where name = "num_answer_stastics"',(new_pointer,))
conn.commit()
return [item[2] for item in posts if item[1] == 2 and item[2] is not None]
user_answer_count = dict() #answers per user: <user_id, count_of_answers>
if __name__ == '__main__':
####count
# answers = get_answers()
#
# while answers is not None :
# for answer in answers:
# if answer not in user_answer_count:
# user_answer_count[answer] = 0
# user_answer_count[answer] += 1
# answers = get_answers()
#
# for key,value in user_answer_count.items():
# cursor.execute('insert into answer_stastics(user_id,count_answer) values(%s,%s)',(key,value))
# conn.commit()
############################
########statistics
# with open('answer_stastics.txt','w+') as fp:
# sql_sel_count = 'SELECT count_answer FROM `answer_stastics` ORDER BY `count_answer` DESC'
# cursor.execute(sql_sel_count)
# counts = [item[0] for item in cursor.fetchall()]
# last_count = counts[-1]
# print last_count
# fp.write("%d,%d\n"%(counts[-1],len(counts)))
# for i in range(len(counts)-2,-1,-1):
# for c in range(last_count + 1,counts[i]):
# fp.write("%d,%d\n"%(c,i+1))
# if counts[i]!=counts[i+1]:
# fp.write("%d,%d\n"%(counts[i],i+1))
# last_count = counts[i]
# print last_count
# ##########################
########plot
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.axes_divider import make_axes_area_auto_adjustable
plt.figure(figsize=(16,8),num="ap")
plt.subplots_adjust(wspace=0.2)
plt.subplot(111)
num_count = dict()
with open('answer_stastics.txt','r+') as fp:
for line in fp.readlines():
items = line.split(',')
num_count[int(items[0])] = int(items[1])
start,end = 0,100
num_count[0] = num_count[1] #the file has no row for 0 answers; reuse the value for 1
for i in range(0,100):
#number of users whose answer count falls in [i*100, (i+1)*100)
print "between [%d,%d) has: %d"%(start + i*100,end + i*100,num_count[start + i*100] - num_count[end + i*100])

19
data_collecting/so_api.py Normal file

@@ -0,0 +1,19 @@
#coding:utf-8
'''
Created on Sep 10, 2016
@author: StarLee
'''
import urllib2
import json
import gzip
from StringIO import StringIO
def api_http(url):
response = urllib2.urlopen(url,timeout=600)
#the SE API gzip-compresses responses; decompress before parsing the JSON
buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf)
plain_data = f.read()
# print plain_data
return json.loads(plain_data)
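A usage sketch for api_http(): the Stack Exchange API gzip-compresses every response body, which is why it is wrapped in StringIO and run through GzipFile before json.loads. The endpoint below is only an example:

# Illustrative call; any /2.2/ endpoint that returns gzipped JSON works the same way.
data = api_http("http://api.stackexchange.com/2.2/info?site=stackoverflow")
print data.keys()  # typically [u'items', u'has_more', u'quota_max', u'quota_remaining']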


@@ -0,0 +1,59 @@
#coding:utf-8
'''
Created on Sep 10, 2016
@author: StarLee
'''
from so_api import api_http
import logging
import time
import MySQLdb
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
logger = logging.getLogger()
hdlr = logging.FileHandler("tag_wiki.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
####################
#To save network round-trips, several tags can be fetched in one request, e.g. /2.2/tags/redis;java/wikis?site=stackoverflow, with tags separated by ';'. The trailing filter also pulls down the wiki body.
api_tag_wiki_so = "http://api.stackexchange.com:80/2.2/tags/%s/wikis?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow&filter=!9YdnSD8kT"
####################
sql_tag_name = "select name from tag where id>=%s and id<%s"
sql_insert_wiki = "update tag set tag_wiki_excerpt=%s,tag_wiki_body=%s where name=%s"
index = 4520
logger.info("start>>>>>")
step = 5
while True:
try:
cursor.execute(sql_tag_name,(index,index+step))
logger.info( "%s-%s",(index,index+step))
index += step
result = cursor.fetchall()
if len(result) == 0:
break;
tags = ";".join([item[0] for item in result])
print api_tag_wiki_so%tags
wikis = api_http(api_tag_wiki_so%tags)
for item in wikis["items"]:
try:
cursor.execute(sql_insert_wiki,(item["excerpt"],item["body"],item["tag_name"]))
conn.commit()
except Exception,e:
continue
except Exception,e:
continue
logger.info("done<<<<<<<<")
cursor.close()
conn.close()
print '--------------------------------'
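As the comment in this script notes, several tags go into one request separated by ';'. A quick sketch of the URL assembly from a fetched batch, with made-up tag names:

result = (('redis',), ('java',), ('css',))   # rows as returned by sql_tag_name
tags = ";".join([item[0] for item in result])
print api_tag_wiki_so % tags
# -> http://api.stackexchange.com:80/2.2/tags/redis;java;css/wikis?...&filter=!9YdnSD8kT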

128
data_collecting/top_tag.py Normal file

@@ -0,0 +1,128 @@
#coding:utf-8
'''
Created on Sep 10, 2016
@author: StarLee
'''
import MySQLdb
from so_api import api_http
import json
import logging
import urllib2
from time import sleep
from lxml import etree
logger = logging.getLogger()
hdlr = logging.FileHandler("top_tag.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
sql_sel_all_user = 'select distinct top_user_id from top_user'
sql_sel_done_user = 'select distinct user_id from top_tag'
sql_ins_tt = 'insert into top_tag(user_id,tag_name,count,score) values(%s,%s,%s,%s)'
sql_ins_tt_rj = 'insert into top_tag_raw_json(user_id,result_json) values(%s,%s)'
api_top_user = "http://api.stackexchange.com:80/2.2/users/%s/top-answer-tags?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow&page=%d&pagesize=%d"
top_tags_url = 'http://stackoverflow.com/users/%s?tab=tags&sort=votes'
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/2010010 1 Firefox/34.0"}
def do_job():
print 'start'
cursor.execute(sql_sel_all_user)
all_users = [item[0] for item in cursor.fetchall()]
cursor.execute(sql_sel_done_user)
done_users = [item[0] for item in cursor.fetchall()]
left_users = set(all_users) - set(done_users)
print 'all-done-left:%d-%d-%d'%(len(all_users),len(done_users),len(left_users))
left_users = sorted(list(left_users))
for user in left_users:
logger.info(user)
print user
try:
# page = 1
# data= api_http(api_top_user%(user,page,100))
# page += 1
# cursor.execute(sql_ins_tt_rj, (user,json.dumps(data)))
#
# tags = data['items']
# for tag in tags:
# cursor.execute(sql_ins_tt,(user,tag['tag_name'],tag['answer_count'],tag['answer_score']))
# conn.commit()
#
# logger.info(data['has_more'])
#scrape the tags tab HTML ourselves instead of using the API
#note: some users have several pages; the API only returns tags with score>0, so also check >0 when saving
top_tags = list()
req = urllib2.Request(top_tags_url%user,headers=headers)
ini_html = urllib2.urlopen(req).read().lower().decode('utf-8')
page = etree.HTML(ini_html)
#grab the tags on the first page
tags = page.xpath('//*[@id="user-tab-tags"]/div[2]/table/tbody/tr/td')
zero_score = False #initialise before the first page so the flag is always defined
for tag in tags:
if tag.find('div').text == '0':
zero_score = True
continue
else:
name = tag.find('a').text
value_text = tag.find('div').get('title')
count_pos = value_text.find("gave")
count = int(value_text[count_pos+5:value_text.find('non-wiki answer')-1])
score = int(value_text[value_text.find("score of",count_pos+5)+9:-1])
top_tags.append((name,count,score))
#read the tag total from the page header to work out the page count
num_tags = page.xpath('//*[@id="user-tab-tags"]/div[1]/h1/span')[0]
num_page = int(num_tags.text) / 52 + 1 #52 tags per page
#fetch the tags on the remaining pages
for i in range(2,num_page + 1):
req = urllib2.Request("%s&page=%s"%(top_tags_url%user,i),headers=headers)
ini_html = urllib2.urlopen(req).read().lower().decode('utf-8');
page = etree.HTML(ini_html)
#check whether a score of 0 appears; if so, skip the remaining pages (results are sorted by votes)
tags = page.xpath('//*[@id="user-tab-tags"]/div[2]/table/tbody/tr/td')
for tag in tags:
if tag.find('div').text == '0':
zero_score = True
continue
else:
name = tag.find('a').text
value_text = tag.find('div').get('title')
count_pos = value_text.find("gave")
count = int(value_text[count_pos+5:value_text.find('non-wiki answer')-1])
score = int(value_text[value_text.find("score of",count_pos+5)+9:-1])
top_tags.append((name,count,score))
if zero_score:
break
for top_tag in top_tags:
cursor.execute(sql_ins_tt,(user,top_tag[0],top_tag[1],top_tag[2]))
conn.commit()
except Exception,e:
sleep(1)
print e
pass
cursor.close()
conn.close()
if __name__ == "__main__":
do_job()
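The count/score slicing in do_job() assumes the tag cell's title attribute reads like the sample below (already lower-cased upstream). A worked check of the string arithmetic on that assumed format:

# Assumed format of the scraped title text.
value_text = "gave 12 non-wiki answers with a total score of 34."
count_pos = value_text.find("gave")
count = int(value_text[count_pos+5:value_text.find('non-wiki answer')-1])  # -> 12
score = int(value_text[value_text.find("score of",count_pos+5)+9:-1])      # -> 34
print count, score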


@@ -0,0 +1,103 @@
#coding:utf-8
'''
Created on Oct 7, 2016
compute top tag from posts
@author: StarLee
'''
import MySQLdb
import logging
import re
logger = logging.getLogger()
hdlr = logging.FileHandler("top_tag_late.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
def get_answers():
cursor.execute('select value from pointers where name = "top_tag_late_answer"')
pointer = cursor.fetchone()[0]
logger.info("last pointer: %d"%pointer)
cursor.execute('select id,posttypeid,parentid,owneruserid from posts where id>%s limit %s',(pointer,1000))
posts = cursor.fetchall()
if posts is None or len(posts) == 0:
return None
new_pointer = posts[-1][0]
cursor.execute('update pointers set value=%s where name = "top_tag_late_answer"',(new_pointer,))
conn.commit()
return [(item[2],item[3]) for item in posts if item[1] == 2 and item[3] is not None]
def get_questions():
cursor.execute('select value from pointers where name = "top_tag_late_question"')
pointer = cursor.fetchone()[0]
logger.info("last pointer: %d"%pointer)
cursor.execute('select id,posttypeid,tags from posts where id>%s limit %s',(pointer,1000))
posts = cursor.fetchall()
if posts is None or len(posts) == 0:
return None
new_pointer = posts[-1][0]
cursor.execute('update pointers set value=%s where name = "top_tag_late_question"',(new_pointer,))
conn.commit()
return [(item[0],item[2]) for item in posts if item[1] == 1 and item[2] is not None]
if __name__ == '__main__':
# #step 1: select users whose answers count > 100
# sql_sel_top_user = 'select user_id from answer_stastics where count_answer>=100'
# cursor.execute(sql_sel_top_user)
# users = cursor.fetchall()
#
# #step 2: fetch answers for each user
# for user in users:
# sql_sel_answers = 'select '
#step 1: get post-[user,user,,,.....]
q_u = dict() #answerers of each question: question_id -> [user_id, user_id, ...]
answers = get_answers()
while answers is not None :
for answer in answers:
if answer[0] not in q_u:
q_u[answer[0]] = list()
q_u[answer[0]].append(answer[1])
answers = get_answers()
#step 2: get user-tag
u_t = dict() #tag counts per user: user_id -> {tag: count}
print 'answer is done'
questions = get_questions()
while questions is not None :
for question in questions:
if question[0] in q_u:
tags = re.findall('<([\s\S]*?)>',question[1]) #tags of this question
users = q_u[question[0]] #users who answered this question
for user in users:
if user not in u_t:
u_t[user] = dict()
for tag in tags:
if tag not in u_t[user]:
u_t[user][tag] = 0
u_t[user][tag] += 1
questions= get_questions()
print 'question is done'
for user,value in u_t.items():
for tag,count in value.items():
cursor.execute('insert into user_tag_count(user_id,tag,count) values(%s,%s,%s)',(user,tag,count))
conn.commit()
print "%d is done"%user


@@ -0,0 +1,141 @@
#coding:utf-8
'''
Created on Oct 7, 2016
compute top tag from posts
@author: StarLee
'''
import MySQLdb
import logging
import re
logger = logging.getLogger()
hdlr = logging.FileHandler("top_tag_late.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
def get_answers():
cursor.execute('select value from pointers where name = "top_tag_score_answer"')
pointer = cursor.fetchone()[0]
logger.info("last pointer: %d"%pointer)
cursor.execute('select id,posttypeid,parentid,owneruserid from posts where id>%s limit %s',(pointer,1000))
posts = cursor.fetchall()
if posts is None or len(posts) == 0:
return None
new_pointer = posts[-1][0]
cursor.execute('update pointers set value=%s where name = "top_tag_score_answer"',(new_pointer,))
conn.commit()
return [(item[2],item[3],item[0]) for item in posts if item[1] == 2 and item[3] is not None]
def get_questions():
cursor.execute('select value from pointers where name = "top_tag_score_question"')
pointer = cursor.fetchone()[0]
logger.info("last pointer: %d"%pointer)
cursor.execute('select id,posttypeid,tags from posts where id>%s limit %s',(pointer,1000))
posts = cursor.fetchall()
if posts is None or len(posts) == 0:
return None
new_pointer = posts[-1][0]
cursor.execute('update pointers set value=%s where name = "top_tag_score_question"',(new_pointer,))
conn.commit()
return [(item[0],item[2]) for item in posts if item[1] == 1 and item[2] is not None]
def get_votes():
cursor.execute('select value from pointers where name = "top_tag_score_vote"')
pointer = cursor.fetchone()[0]
logger.info("last pointer: %d"%pointer)
cursor.execute('select id,postid,votetypeid from votes where id>%s limit %s',(pointer,1000))
posts = cursor.fetchall()
if posts is None or len(posts) == 0:
return None
new_pointer = posts[-1][0]
cursor.execute('update pointers set value=%s where name = "top_tag_score_vote"',(new_pointer,))
conn.commit()
return [(item[1],item[2]) for item in posts if item[2] is not None]
if __name__ == '__main__':
# #step 1: select users whose answers count > 100
# sql_sel_top_user = 'select user_id from answer_stastics where count_answer>=100'
# cursor.execute(sql_sel_top_user)
# users = cursor.fetchall()
#
# #step 2: fetch answers for each user
# for user in users:
# sql_sel_answers = 'select '
#step 0: get answer - score
a_s = dict()
votes = get_votes()
while votes is not None :
for vote in votes:
if vote[0] not in a_s:
a_s[vote[0]] = 0
if vote[1] == 2: #VoteTypeId 2 = upvote
a_s[vote[0]] += 1
elif vote[1] == 3: #VoteTypeId 3 = downvote
a_s[vote[0]] += -1
else:
pass #ignore all other vote types
votes = get_votes()
#step 1: get post-[user,user,,,.....]
q_u = dict() #answerers of each question: question_id -> [(user_id, answer_score), ...]
answers = get_answers()
while answers is not None :
for answer in answers:
if answer[0] not in q_u:
q_u[answer[0]] = list()
if answer[2] in a_s:
q_u[answer[0]].append((answer[1],a_s[answer[2]]))
answers = get_answers()
del a_s
#step 2: get user-tag
u_t = dict() #tag scores per user: user_id -> {tag: score}
print 'answer is done'
questions = get_questions()
while questions is not None :
for question in questions:
if question[0] in q_u:
tags = re.findall('<([\s\S]*?)>',question[1]) #tags of this question
users = q_u[question[0]] #users who answered this question
for user in users:
if user[0] not in u_t:
u_t[user[0]] = dict()
for tag in tags:
if tag not in u_t[user[0]]:
u_t[user[0]][tag] = 0
u_t[user[0]][tag] += user[1]
questions= get_questions()
print 'question is done'
del q_u
for user,value in u_t.iteritems():
for tag,count in value.iteritems():
cursor.execute('insert into user_tag_score(user_id,tag,count) values(%s,%s,%s)',(user,tag,count))
conn.commit()
print "%d is done"%user

114
data_collecting/top_user.py Normal file

@@ -0,0 +1,114 @@
#coding:utf-8
'''
Created on Sep 10, 2016
@author: StarLee
'''
import MySQLdb
from so_api import api_http
import json
import logging
logger = logging.getLogger()
hdlr = logging.FileHandler("%s.log"%__name__)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
sql_sel_tag = 'select id,name from tag where id>=43786'
sql_ins_tu = 'insert into top_user(tag_id,top_user_id) values(%s,%s)'
sql_ins_tu_rj = 'insert into top_user_raw_json(tag_id,result_json) values(%s,%s)'
sql_sel_all_tag = 'select id,name from tag'
sql_sel_done_tag = 'select distinct tag_id from top_user'
api_top_user = "http://api.stackexchange.com:80/2.2/tags/%s/top-answerers/all_time?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow&page=%d&pagesize=%d"
# (6140L, u'clash')
# [Errno 10053]
# (6141L, u'class')
# <urlopen error [Errno 11001] getaddrinfo failed>
# (6142L, u'class-attribute')
# <urlopen error [Errno 11001] getaddrinfo failed>
#
def do_job():
print 'start'
cursor.execute(sql_sel_tag)
tags = cursor.fetchall()
print 'start'
count = 0
for tag in tags:
logger.info(tag)
print tag
try:
page = 1
has_more = True
while has_more:
data= api_http(api_top_user%(tag[1],page,100))
page += 1
cursor.execute(sql_ins_tu_rj, (tag[0],json.dumps(data)))
answerers = data['items']
for answerer in answerers:
cursor.execute(sql_ins_tu,(tag[0],answerer['user']['user_id']))
conn.commit()
logger.info(data['has_more'])
has_more = data["has_more"]
except Exception,e:
print e
pass
cursor.close()
conn.close()
def get_top_user(tag_id,tag_name):
logger.info((tag_id,tag_name))
print tag_id,tag_name
try:
page = 1
has_more = True
while has_more:
data= api_http(api_top_user%(tag_name,page,100))
page += 1
cursor.execute(sql_ins_tu_rj, (tag_id,json.dumps(data)))
answerers = data['items']
for answerer in answerers:
cursor.execute(sql_ins_tu,(tag_id,answerer['user']['user_id']))
conn.commit()
logger.info(data['has_more'])
has_more = data["has_more"]
except Exception,e:
print e
pass
if __name__ == "__main__":
cursor.execute(sql_sel_all_tag)
tags = cursor.fetchall()
tags = {tag[0]:tag[1] for tag in tags}
cursor.execute(sql_sel_done_tag)
done_tags = cursor.fetchall()
for tag in done_tags:
del tags[tag[0]]
for tag in tags.iteritems():
get_top_user(tag[0], tag[1])

39
other/SEWordSm.py Normal file

@@ -0,0 +1,39 @@
#coding:utf-8
'''
Created on Sep 29, 2016
@author: StarLee
'''
import MySQLdb
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
sql_sewm = 'select term_2,similarity from word_similarity where term_1=%s order by similarity DESC limit 10'
queries = ['python','java','c','c++','ruby','scala',
'windows','linux','mac','android','ios','win8','x86',
'html','json','css','xml','http',
'redis','eclipse', 'mysql', 'scikit-learn', 'numpy', 'panda', 'django','spring', 'git',
'machine','learning', 'ide','directory','class', 'apache', 'maven'
]
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
with open('sews.csv','w+') as fp:
cfp = csv.writer(fp,dialect='excel')
for query in queries:
row_data = list()
row_data.append(query)
cursor.execute(sql_sewm,(query,))
try:
row_data.append('\n'.join([item[0] for item in cursor.fetchall()]))
except Exception,e:
row_data.append("null")
cfp.writerow(row_data)

BIN
other/SEWordSm.pyc Normal file

Binary file not shown.

0
other/__init__.py Normal file

BIN
other/__init__.pyc Normal file

Binary file not shown.

151
other/tag_doc.py Normal file

@@ -0,0 +1,151 @@
#coding: utf-8
'''
Created on Sep 15, 2016
@author: StarLee
'''
import nltk
# #
# # # nltk.app.chunkparser()
from gensim.models import word2vec
from gensim.models import Word2Vec
from nltk.corpus import stopwords
stpw = stopwords.words('english')
from string import punctuation
stpw.extend(punctuation)
import numpy as np
import MySQLdb
import scipy.cluster.hierarchy as hcluster
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
# model_2 = Word2Vec.load('model/so_excep_model')
import re
def extract_so_wiki_body(text):
#step 1: extract text between <p> and </p>
texts = re.findall('<p>([\s\S]*?)</p>', text)
#step 2: remove href and code
texts = [ re.sub('<a href[\s\S]*?>|</a>|<pre>>[\s\S]*?</pre>|<code>[\s\S]*?</code>','',text) for text in texts]
return ''.join(texts)
def get_tag_tokens(text,text_filter):
token_list = list()
sens = nltk.sent_tokenize(text_filter(text))
for i in range(0,len(sens)):
tokens = nltk.word_tokenize(sens[i])
tokens =[token.lower() for token in tokens if token.lower() not in stpw]
token_list.extend(tokens)
return token_list
def get_text_vec(tag_name):
sql_sel_so_excep = "select tag_wiki_excerpt from tag_old where name=%s"
# sql_sel_so_body = "select tag_wiki_body from tag_old where name=%s "
cursor.execute(sql_sel_so_excep,(tag_name,))
excep = cursor.fetchone()[0]
# cursor.execute(sql_sel_so_body,(tag_name,))
# body = cursor.fetchone()[0]
all_token = get_tag_tokens(excep, lambda x:x)
# all_token.extend(get_tag_tokens(body, extract_so_wiki_body))
all_vec = list()
for token in all_token:
if token in model_2:
all_vec.append(model_2[token])
else:
print token
final_vec = np.array([0.0] * len(all_vec[0]))
for vec in all_vec:
final_vec += vec
return final_vec / len(all_vec)
tags = ['java', 'ruby', 'python', 'javascript',
'html', 'xml', 'json', 'ajax',
'windows', 'android', 'linux',
'redis', 'maven', 'sql',
'eclipse', 'netbeans', 'ide']
import numpy
def coss(m1,m2):
n_vec_i = numpy.matrix(m1)
n_vec_j = numpy.matrix(m2)
num = float(n_vec_i * n_vec_j.T)
denom = numpy.linalg.norm(n_vec_i ) * numpy.linalg.norm( n_vec_j )
return num / denom #cosine similarity
# tag_vec = dict()
# for tag in tags[0:]:
# if tag in model_2:
# tag_vec[tag] = get_text_vec(tag)
#
#
# for k1,v1 in tag_vec.items():
# for k2,v2 in tag_vec.items():
# print "%s-%s:%f"%(k1,k2,coss(v1,v2))
# print '--------------------'
# X = []
# valid_tag = list()
# for tag in tags[0:]:
# if tag in model_2:
# valid_tag.append(tag)
# X.append(list(get_text_vec(tag)))
#
# cp = hcluster.fclusterdata(np.array(X),t=1)
# ct = dict()
# for pos in range(0,len(cp)):
# if cp[pos] not in ct:
# ct[cp[pos]] = list()
# ct[cp[pos]].append((pos,valid_tag[pos]))
#
# for key,value in ct.iteritems():
# print value
#
# X =[[1,1],
# [1,2],
# [2,1],
# [2,2],
#
# [3,7],
# [3,8],
# [4,7],
# [4,8],
#
# [7,2],
# [7,3],
# [8,2],
# [8,3]
#
# ]
#
# print hcluster.fclusterdata(np.array(X),t=1)
# import matplotlib.pylab as plt
#
# d = hcluster.distance.pdist(X)
#
# Z= hcluster.linkage(d,method='complete')
#
# P =hcluster.dendrogram(Z)
#
# plt.savefig('plot_dendrogram.png')
# print 'done'
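get_text_vec() averages the word vectors of the excerpt's tokens and coss() compares two such averages; a quick sanity check of coss() on toy vectors:

print coss([1.0, 0.0], [0.0, 1.0])  # orthogonal -> 0.0
print coss([1.0, 2.0], [2.0, 4.0])  # same direction -> 1.0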

BIN
other/tag_doc.pyc Normal file

Binary file not shown.

133
other/test.py Normal file

@@ -0,0 +1,133 @@
#coding: utf-8
'''
Created on Sep 15, 2016
@author: StarLee
'''
import nltk
# #
# # # nltk.app.chunkparser()
from gensim.models import word2vec
from gensim.models import Word2Vec
from nltk.corpus import stopwords
stpw = stopwords.words('english')
from string import punctuation
stpw.extend(punctuation)
import numpy as np
import MySQLdb
import scipy.cluster.hierarchy as hcluster
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
model_2 = Word2Vec.load('../model/so_excep_model')
#
# import re
# def extract_so_wiki_body(text):
# #step 1: extract text between <p> and </p>
# texts = re.findall('<p>([\s\S]*?)</p>', text)
#
# #step 2: remove href and code
# texts = [ re.sub('<a href[\s\S]*?>|</a>|<pre>>[\s\S]*?</pre>|<code>[\s\S]*?</code>','',text) for text in texts]
#
# return ''.join(texts)
#
# def get_tag_tokens(text,text_filter):
# token_list = list()
# sens = nltk.sent_tokenize(text_filter(text))
# for i in range(0,len(sens)):
# tokens = nltk.word_tokenize(sens[i])
# tokens =[token.lower() for token in tokens if token.lower() not in stpw]
# token_list.extend(tokens)
# return token_list
#
#
# def get_text_vec(tag_name):
# sql_sel_so_excep = "select tag_wiki_excerpt from tag_old where name=%s"
# # sql_sel_so_body = "select tag_wiki_body from tag_old where name=%s "
#
# cursor.execute(sql_sel_so_excep,(tag_name,))
# excep = cursor.fetchone()[0]
#
# # cursor.execute(sql_sel_so_body,(tag_name,))
# # body = cursor.fetchone()[0]
#
# all_token = get_tag_tokens(excep, lambda x:x)
# # all_token.extend(get_tag_tokens(body, extract_so_wiki_body))
#
# all_vec = list()
# for token in all_token:
# if token in model_2:
# all_vec.append(model_2[token])
# else:
# print token
#
# final_vec = np.array([0.0] * len(all_vec[0]))
# for vec in all_vec:
# final_vec += vec
# return final_vec / len(all_vec)
#
#
# tags = ['java', 'ruby', 'python', 'javascript',
# 'html', 'xml', 'json', 'ajax',
# 'windows', 'android', 'linux',
# 'redis', 'maven', 'sql',
# 'eclipse', 'netbeans', 'ide']
#
#
# X = []
# valid_tag = list()
# for tag in tags[0:]:
# if tag in model_2:
# valid_tag.append(tag)
# X.append(list(get_text_vec(tag)))
#
# cp = hcluster.fclusterdata(np.array(X),t=1)
# ct = dict()
# for pos in range(0,len(cp)):
# if cp[pos] not in ct:
# ct[cp[pos]] = list()
# ct[cp[pos]].append((pos,valid_tag[pos]))
#
# for key,value in ct.iteritems():
# print value
#
#
#
# # X =[[1,1],
# # [1,2],
# # [2,1],
# # [2,2],
# #
# # [3,7],
# # [3,8],
# # [4,7],
# # [4,8],
# #
# # [7,2],
# # [7,3],
# # [8,2],
# # [8,3]
# #
# # ]
# #
# # print hcluster.fclusterdata(np.array(X),t=1)
#
#
# import matplotlib.pylab as plt
#
# d = hcluster.distance.pdist(X)
#
# Z= hcluster.linkage(d,method='complete')
#
# P =hcluster.dendrogram(Z)
#
# plt.savefig('plot_dendrogram.png')
# print 'done'
import numpy as np
aa = np.array([2,4,6,8])
print aa/2

BIN
other/test.pyc Normal file

Binary file not shown.

102
other/top_top.py Normal file

@@ -0,0 +1,102 @@
#coding:utf-8
'''
Created on Sep 10, 2016
@author: StarLee
'''
from so_api import api_http
class Answer:
def __init__(self):
self.id = -1
self.top_tags = dict()
def order_tag_by_count(self):
return sorted(self.top_tags.iteritems(), key=lambda d:d[1][0], reverse = True)
def order_tag_by_score(self):
return sorted(self.top_tags.iteritems(), key=lambda d:d[1][1], reverse = True)
def tag_count(self,tag_name):
return self.top_tags[tag_name][0]
def tag_score(self,tag_name):
return self.top_tags[tag_name][1]
api_top_user = "http://api.stackexchange.com:80/2.2/tags/%s/top-answerers/all_time?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow"
api_top_tag = "http://api.stackexchange.com:80/2.2/users/%s/top-tags?client_id=6781&key=6HF6aOk)jUbSHpRXUrVCFg((&site=stackoverflow"
def get_top_tags(tag_name,sort_by_score = 1):
top_users = list()
print 'for tag: %s'%tag_name
data= api_http(api_top_user%tag_name)
answerers = data['items']
for ans in answerers:
user_id = ans['user']['user_id']
# print user_id
top_user = Answer()
top_user.id = user_id
top_user.top_tags[tag_name] = (0,0) #default in case tag_name is missing from the top-tag list (has_more is not checked)
data = api_http(api_top_tag%user_id)
tags = data['items']
for tag in tags:
# print tag['tag_name'],'%d-%d'%(tag['answer_count'],tag['answer_score']),'%d-%d'%(tag['question_count'],tag['question_score'])
top_user.top_tags[tag['tag_name']] = (tag['answer_count'],tag['answer_score'])
top_users.append(top_user)
sorted_tu = None
if sort_by_score == 1:
sorted_tu = sorted(top_users, key=lambda d:d.tag_score(tag_name), reverse = True)
else:
sorted_tu = sorted(top_users, key=lambda d:d.tag_count(tag_name), reverse = True)
# for u in sorted_tu:
# print u.id,u.tag_count(tag_name)
# print [item[0] for item in u.order_tag_by_count()]
# print '--------------------------'
print '********************************'
return sorted_tu
if __name__ == "__main__":
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
queries = ['python','java',
'linux','android',
'html','json','css',
'redis','eclipse', 'mysql', 'scikit-learn', 'django','spring', 'git',
'ide',
]
with open('top_tag.csv','w+') as fp:
cfp = csv.writer(fp,dialect='excel')
for query in queries:
row_data = list()
row_data.append(query)
sorted_tu = get_top_tags(query)
for u in sorted_tu:
row_data.append('\n'.join([item[0] for item in u.order_tag_by_score()]))
cfp.writerow(row_data)
#
# an1 = answerers[0]
# user_id = an1['user']['user_id']
# print "top user: %d>>>>>>>"%user_id
#
# data = api_http(api_top_tag%user_id)
# tags = data['items']
#
# for tag in tags:
# print tag['tag_name'],tag['answer_count'],tag['question_count']
# print '--------------------------------'

BIN
other/top_top.pyc Normal file

Binary file not shown.

390
other/word_embedding.py Normal file

@@ -0,0 +1,390 @@
#coding:utf-8
import nltk
from nltk.text import Text
from nltk.corpus import stopwords
import string
from string import punctuation
from gensim.models import Word2Vec
import os
from gensim.models import Phrases
import re
stpw = stopwords.words('english')
stpw.extend(punctuation)
# nltk.download()
# text_in_gh = 'Redis is an in-memory database that persists on disk. The data model is key-value, but many different kind of values are supported: Strings, Lists, Sets, Sorted Sets, Hashes, HyperLogLogs, Bitmaps. http://redis.io'
# text_in_so = 'An open source BSD-licensed in-memory data structure store used as database, cache and message broker. Supports data structures such as strings, hashes, lists, sets, sorted sets with range queries, bitmaps, hyperloglogs and geospatial indexes with radius queries. Has built-in replication, Lua scripting, LRU eviction, transactions and different levels of on-disk persistence, high availability via Redis Sentinel and automatic partitioning with Redis Cluster.'
import MySQLdb
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
sql_sel_so_pos = 'select id,body from shit_lee where id>%s limit %s'
sql_sel_text = "select text from text where id>%s and id<%s"
class Token4DB(object):
def __init__(self, sql,text_filter=lambda a:a):
self.sql = sql
self.flag = True
self.text_filter = text_filter
def __iter__(self):
start = 0
limit = 382
count = 0
while self.flag:
# print start,start + limit
cursor.execute(self.sql,(start,limit))
texts = cursor.fetchall()
# texts = [self.text_filter(item[0]) for item in cursor.fetchall() if item[0] is not None]
for text in texts:
sens = nltk.sent_tokenize(self.text_filter(text[1]))
for i in range(0,len(sens)):
tokens = nltk.word_tokenize(sens[i])
tokens =[token.lower() for token in tokens if token.lower() not in stpw ]
yield tokens
start = text[0]
count += 1
##############
if count >= 50000 :
self.flag = False
break
##############
if len(texts) == 0:
self.flag = False
def get(self,text_count= 100000):
print 'get fun'
start = 0
limit = 1000
count = 0
token_list = list()
while self.flag:
print start,count
cursor.execute(self.sql,(start,limit))
texts = cursor.fetchall()
for text in texts:
sens = nltk.sent_tokenize(self.text_filter(text[1]))
for i in range(0,len(sens)):
tokens = nltk.word_tokenize(sens[i])
tokens =[token.lower() for token in tokens if token.lower() not in stpw]
token_list.append(tokens)
start = text[0]
count += 1
##############
if count >= text_count :
self.flag = False
break
##############
if len(texts) == 0:
self.flag = False
return token_list
class Token4File(object):
def __init__(self, dirname):
self.dirname = dirname
self.flag = True
def __iter__(self):
for fname in os.listdir(self.dirname):
sens = nltk.sent_tokenize(''.join(open(os.path.join(self.dirname, fname))))
for i in range(0,len(sens)):
tokens = nltk.word_tokenize(sens[i])
tokens =[token for token in tokens if token not in punctuation]
yield tokens
def get_so_excep_model():
sql_sel_so_excep = "select id,tag_wiki_excerpt from tag where id>%s and tag_wiki_excerpt is not null limit %s"
if(os.path.exists('model/so_excep_model')):
return Word2Vec.load('model/so_excep_model')
else:
model = Word2Vec(Token4DB(sql_sel_so_excep).get(),min_count=1)
model.save('model/so_excep_model')
return model
def get_so_tag_model():
sql_sel_so_excep = "select id,tag_wiki_excerpt from tag where id>%s and tag_wiki_excerpt is not null limit %s"
sql_sel_so_body = "select id,tag_wiki_body from tag where id>%s and tag_wiki_body is not null limit %s"
if(os.path.exists('model/so_tag_model')):
return Word2Vec.load('model/so_tag_model')
else:
tokens = list()
tokens.extend(Token4DB(sql_sel_so_excep).get())
tokens.extend(Token4DB(sql_sel_so_body,extract_so_wiki_body).get())
model = Word2Vec(tokens,min_count=1)
model.save('model/so_tag_model')
return model
def get_so_post_model():
if(os.path.exists('model/so_post_model')):
return Word2Vec.load('model/so_post_model')
else:
model = Word2Vec(Token4DB(sql_sel_so_pos,extract_so).get(200000),min_count=1)
model.save('model/so_post_model')
return model
def extract_so_wiki_body(text):
#step 1: extract text between <p> and </p>
texts = re.findall('<p>([\s\S]*?)</p>', text)
#step 2: remove href and code
texts = [ re.sub('<a href[\s\S]*?>|</a>|<pre>>[\s\S]*?</pre>|<code>[\s\S]*?</code>','',text) for text in texts]
return ''.join(texts)
def extract_so(text):
#step 1: extract text between <p> and </p>
texts = re.findall('<p>([\s\S]*?)</p>', text)
#step 2: remove href and code
texts = [ re.sub('<a href[\s\S]*?>|</a>|<code>|</code>|<pre>|</pre>','',text) for text in texts]
return ''.join(texts)
if __name__ == "__main__":
################### SO: model trained on tag wiki excerpts and bodies ##########################
# stemmer = nltk.PorterStemmer()
# model = get_so_tag_model()
# print model
##########################################################################
################### SO: model trained on tag wiki excerpts only ##########################
# model = get_so_excep_model()
# print model
# ##########################################################################
# ################### SO: model trained on 200k posts ##########################
# model = get_so_post_model()
# print model
# # ##########################################################################
# model = Word2Vec.load('model/so_post_model-1w')
# model = model = get_so_excep_model()
# ################### write results to csv ##########################
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
model_so_tag_excerpt = Word2Vec.load('model/so_excep_model')
model_so_tag = Word2Vec.load('model/so_tag_model')
model_so_post_1w = Word2Vec.load('model/so_post_model-1w')
model_so_post_20w = Word2Vec.load('model/so_post_model_20w')
print 'model is loaded'
queries = ['python','java','c','c++','ruby','scala',
'windows','linux','mac','android','ios','win8','x86',
'html','json','css','xml','http',
'redis','eclipse', 'mysql', 'scikit-learn', 'numpy', 'panda', 'django','spring', 'git',
'machine','learning', 'ide','directory','class', 'apache', 'maven'
]
titles = ['query','so_tag_excerpt','so_tag','so_post_1w','so_post_20w']
models = [model_so_tag_excerpt,model_so_tag,model_so_post_1w,model_so_post_20w]
with open('result.csv','w+') as fp:
cfp = csv.writer(fp,dialect='excel')
cfp.writerow(titles)
for query in queries:
row_data = list()
row_data.append(query)
for model in models:
try:
row_data.append('\n'.join([item[0] for item in model.most_similar(query)]))
except Exception,e:
row_data.append("null")
cfp.writerow(row_data)
# # ##########################################################################
# ################## languages ##################
# print 'python','>>>>>>>>>', model.most_similar(positive=['python'])
# print 'java','>>>>>>>>>', model.most_similar('java')
# print 'c','>>>>>>>>>', model.most_similar('c')
# print 'c++','>>>>>>>>>', model.most_similar('c++')
# print 'ruby','>>>>>>>>>', model.most_similar('ruby')
# # print 'c#','>>>>>>>>>', model.most_similar('c#')
# print 'scala','>>>>>>>>>', model.most_similar('scala')
#
#
# print '**********************'
# ################## platforms ##################
# print 'windows','>>>>>>>>>', model.most_similar(positive=['windows'])
# print 'linux','>>>>>>>>>', model.most_similar('linux')
# print 'mac','>>>>>>>>>', model.most_similar('mac')
# print 'android','>>>>>>>>>', model.most_similar('android')
# print 'ios','>>>>>>>>>', model.most_similar('ios')
# # print 'win8','>>>>>>>>>', model.most_similar('win8')
# print 'x86','>>>>>>>>>', model.most_similar('x86')
#
#
# print '**********************'
# ################## standards ##################
# print 'html','>>>>>>>>>', model.most_similar(positive=['html'])
# print 'json','>>>>>>>>>', model.most_similar('json')
# print 'css','>>>>>>>>>', model.most_similar('css')
# print 'xml','>>>>>>>>>', model.most_similar('xml')
# print 'http','>>>>>>>>>', model.most_similar('http')
#
#
#
# print '**********************'
# ################## software ##################
# # print 'redis','>>>>>>>>>',model.most_similar(positive=['redis'])
# print 'eclipse', '>>>>>>>>>',model.most_similar(positive=['eclipse'])
# print 'mysql', '>>>>>>>>>',model.most_similar(positive=['mysql'])
# # print 'scikit-learn', '>>>>>>>>>',model.most_similar(positive=['scikit-learn'])
# print 'numpy', '>>>>>>>>>',model.most_similar(positive=['numpy'])
# # print 'panda', '>>>>>>>>>',model.most_similar(positive=['panda'])
# print 'django', '>>>>>>>>>',model.most_similar(positive=['django'])
# print 'spring', '>>>>>>>>>',model.most_similar(positive=['spring'])
# print 'git', '>>>>>>>>>',model.most_similar(positive=['git'])
#
# print '**********************'
# ################## terminology ##################
# print 'machine','>>>>>>>>>', model.most_similar('machine')
# print 'learning', '>>>>>>>>>',model.most_similar('learning')
# print 'ide','>>>>>>>>>', model.most_similar('ide')
# print 'directory', '>>>>>>>>>',model.most_similar(positive=['directory'])
# print 'class', '>>>>>>>>>',model.most_similar(positive=['class'])
# print 'apache', '>>>>>>>>>',model.most_similar(positive=['apache'])
# print 'maven', '>>>>>>>>>',model.most_similar(positive=['maven'])
#
#
# print '**********************'
# ################## combinations ##################
# print 'machine learning', '>>>>>>>>>',model.most_similar(positive=['machine','learning'])
# print 'python machine learning', '>>>>>>>>>',model.most_similar(positive=['python','machine','learning'])
# print 'java ide', '>>>>>>>>>',model.most_similar(positive=['java','ide'])
# print 'java log', '>>>>>>>>>',model.most_similar(positive=['java','log'])
# print 'java logging', '>>>>>>>>>',model.most_similar(positive=['java','logging'])
###########################################################
# SO tag wiki excerpts and bodies
# print extract_so_wiki_body("<p>12345</p><p>12345</p>")
# sql_sel_so_body = "select tag_wiki_body from tag where id>=0 and tag_wiki_body is not null limit 1"
# cursor.execute(sql_sel_so_body)
# bodys = [item[0] for item in cursor.fetchall()]
# for body in bodys:
# try:
# print body
# print '-------'
# print extract_so_wiki_body(body)
# print '**********************'
# except Exception,e:
# pass
##########################################################
###########################################################
# model trained on SO posts
# tokens = Token4DB(sql_sel_so_pos,extract_so)
#
# for text in tokens:
# print text
#
#
# model = Word2Vec(tokens,min_count=1)
# print model
# model.save('model/yield')
# print model.similarity('question', 'java')
# print model.most_similar('java')
# print '--------------------------------'
# tokens = Token4DB(sql_sel_so_pos,extract_so).get()
# # for text in tokens:
# # print text
# #
# model = Word2Vec(tokens,min_count=1)
# print model
# model.save('model/list')
# model = Word2Vec.load('model/list')
# print 'machine',model.most_similar('machine')
# print 'learning',model.most_similar('learning')
# print 'machine learning',model.most_similar(positive = ['machine','learning'])
#
# print 'question-java',model.similarity('question', 'java')
# print 'java-eclipse',model.similarity('java','eclipse')
#
#
#
# print 'java + ide',model.most_similar(positive = ['java','ide'])
#
# print 'java',model.most_similar('java')
#
# print 'ide',model.most_similar('ide')
# print 'eclipse',model.most_similar('eclipse')
# print 'maven',model.most_similar('maven')
# print '--------------------------------'
######################################################################
# model = get_so_excep_model()
# print model
#
# print model.similarity('python', 'java')
# print model.similarity('python', 'c++')
# print model.similarity('python', 'ruby')
# model = get_so_post_model()
# print model
#
# print model.similarity('question', 'Object')
# print model.similarity('python', 'c++')
# print model.similarity('python', 'ruby')
#
#
#
# print model.most_similar(positive=['python'])
# print model.most_similar(positive=['Object'])
# print model.most_similar(positive=['redis'])
# print model.most_similar(positive=['windows','java','ide'])
# print model.most_similar(positive=['eclipse'])
# print model.most_similar(positive=['delete'])
# print model.most_similar(positive=['java','eclipse'],negative = ['c++'])
# print extract_so("<p>12345</p><p>12345</p>")
# sql_sel_so_pos = 'select body from shit_lee limit 10'
# cursor.execute(sql_sel_so_pos)
# bodys = [item[0] for item in cursor.fetchall()]
# for body in bodys:
# try:
# print body
# print '-------'
# print extract_so(body)
# print '**********************'
# except Exception,e:
# pass
# model = get_so_post_model()
# print model

BIN
other/word_embedding.pyc Normal file

Binary file not shown.

221
search.py Normal file

@@ -0,0 +1,221 @@
#coding:utf-8
'''
Created on Oct 6, 2016
@author: StarLee
'''
import MySQLdb
###################################################################
conn2 = MySQLdb.connect("localhost","starlee","1234","query_expansion" )
cursor2 = conn2.cursor()
#load the project names to be filtered out of the final results
sql_tag_is_prj = 'select name,is_prj_name from tag'
cursor2.execute(sql_tag_is_prj)
tag_is_prj_name = {row[0]:row[1] for row in cursor2.fetchall()}
#load the stop-word tags to filter out
with open("stop_word_prj_name") as file:
stop_words_prj_name = set([line.strip() for line in file.readlines() if(( not line.startswith("#")) and line.strip()) ])
#tag synonyms
sql_get_syns = "select from_tag,to_tag from synonym"
cursor2.execute(sql_get_syns)
syns = {row[0]:row[1] for row in cursor2.fetchall()}
def get_syns(term):
if term not in syns:
return term
return syns[term]
##################################################################
##################################################################
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
sql_sel_linked_tags = 'select t2,count from tag_postlinks where t1=%s union select t1,count from tag_postlinks where t2=%s'
# sql_sel_cf_tags = 'select t2,rv from `stdb_cf` where t1=%s union select t1,rv from `stdb_cf` where t2=%s'
sql_sel_cf_tags = 'select t2,rv from `stdb_cf` where id1=%s union select t1,rv from `stdb_cf` where id2=%s'
# sql_sel_co_tags = 'select t2,count from stdb_co where t1=%s union select t1,count from stdb_co where t2=%s'
sql_sel_co_tags = 'select t2,count from stdb_co where id1=%s union select t1,count from stdb_co where id2=%s'
sql_sel_dpl_tags = 'select t2,count from stdb_dpl where t1=%s union select t1,count from stdb_dpl where t2=%s'
#count and id of every tag
sql_sel_tags = 'select tagname,count,id from tags'
cursor.execute(sql_sel_tags)
result = cursor.fetchall()
tags = {item[0]:item[1] for item in result}
tags_id = {item[0]:item[2] for item in result}
tags_name_set = set(tags.keys())
##################################################################
##################################################################
#related tags of a tag via co-occurrence, with rank values: (tag,rv)
def get_co_tags(item):
tag_id = tags_id[item]
cursor.execute(sql_sel_co_tags,(tag_id,tag_id))
result = cursor.fetchall()
# total_count = sum([item[1] for item in result])
# return sorted([(item[0],1.0*item[1]/total_count) for item in result],lambda x, y: cmp(x[1], y[1]),reverse = True)
return {get_syns(t_c[0]):1.0*t_c[1] * t_c[1]/(tags[get_syns(t_c[0])]) for t_c in result if get_syns(t_c[0]) in tags_name_set} #the membership test guards against frequent KeyErrors: some tags are missing from the tags table
#related tags of a tag via duplicate postlinks, with rank values: (tag,rv)
def get_duplink_tags(item):
#caution: a count pattern like 4,1,1,1,1,1,1,1,1 spreads the total and dilutes the 4's share
cursor.execute(sql_sel_dpl_tags,(item,item))
result = cursor.fetchall()
total_count = sum([item[1] for item in result])
# return sorted([(item[0],1.0*item[1]/total_count) for item in result],lambda x, y: cmp(x[1], y[1]),reverse = True)
return {get_syns(item[0]):1.0*item[1]/total_count for item in result}
#related tags of a tag via postlinks, with rank values: (tag,rv)
def get_linked_tags(item):
#caution: a count pattern like 4,1,1,1,1,1,1,1,1 spreads the total and dilutes the 4's share
cursor.execute(sql_sel_linked_tags,(item,item))
result = cursor.fetchall()
total_count = sum([item[1] for item in result])
# return sorted([(item[0],1.0*item[1]/total_count) for item in result],lambda x, y: cmp(x[1], y[1]),reverse = True)
return {get_syns(item[0]):1.0*item[1]/total_count for item in result}
#related tags of a tag via collaborative filtering: (tag,rv)
def get_cf_tags(item):
#(tag,rv)
tag_id = tags_id[item]
cursor.execute(sql_sel_cf_tags,(tag_id,tag_id))
result = cursor.fetchall()
return {get_syns(item[0]):item[1] for item in result}
##################################################################
if __name__ == '__main__':
########### single search
query = 'java ide'
items = query.split(' ')
items = [get_syns(term) for term in items]
total_related_tags = list() #one related-tags dict per query term
get_related_tags = get_co_tags #alias used for every related-tag lookup below
# get_related_tags = get_duplink_tags
# get_related_tags = get_cf_tags
#### fetch rank values of related tags via get_related_tags #############################
related_tags = get_related_tags(items[0])
total_related_tags.append(related_tags)
commen_tags = set(related_tags.keys()) #tags related to every item in the query
for item in items[1:]:
related_tags = get_related_tags(item)
total_related_tags.append(related_tags)
commen_tags.intersection_update(set(related_tags.keys()))
# print sorted(related_tags.items(),lambda x, y: cmp(x[1], y[1]),reverse = True)
commen_tags_score = list()
for item in commen_tags:
if (item in tag_is_prj_name and tag_is_prj_name[item] == 0) or item in stop_words_prj_name :
continue
score = 1
for tlt in total_related_tags:
score *= tlt[item]
commen_tags_score.append((item,score))
final_result = list()
for item in sorted(commen_tags_score,lambda x, y: cmp(x[1], y[1]),reverse = True)[:10]:
final_result.append(item[0])
print item[0]
#################################
# ### similarity in wordembedding model
import other.tag_doc as td
td.model_2 = td.Word2Vec.load('other/model/so_excep_model')
tag_vec = dict()
for tag in final_result:
if tag in td.model_2:
tag_vec[tag] = td.get_text_vec(tag)
rm = list()
valid_tag = [item for item in final_result if item in tag_vec]
for i in range(0,len(valid_tag)):
# if valid_tag[i] in tag_vec:
tmp_l = list()
for j in range(i+1,len(valid_tag)):
# if final_result[j] in tag_vec:
tmp_l.append("(%s,%s,%f)"%(valid_tag[i],valid_tag[j],
td.coss(tag_vec[valid_tag[i]],tag_vec[valid_tag[j]])) )
if len(tmp_l) > 0:
rm.append(tmp_l)
for i in range(0,len(rm)):
line = ''
for j in range(0,i+1):
line = "%s\t%s"%(line,rm[j][i-j])
print line
# ### relevance value in stdb_dpl and stdb_cf
# for i1 in final_result:
# for i2 in final_result:
# if i1 == i2:
# continue
# print "for %s-%s:"%(i1,i2)
#
#
#
#
# cursor.execute("SELECT count FROM `stdb_dpl` WHERE (`t1` = %s and `t2` = %s) or (`t2` = %s and `t1` = %s)",(i1,i2,i2,i1))
# count_dpl = cursor.fetchone()
# if count_dpl != None:
# print " >dpl:%d"%count_dpl[0]
# else:
# print " >dpl:none"
#
#
# cursor.execute("SELECT rv FROM `stdb_cf_0&1` WHERE (`t1` = %s and `t2` = %s) or (`t2` = %s and `t1` = %s)",(i1,i2,i2,i1))
# count_cf = cursor.fetchone()
# if count_cf != None:
# print " >cf:%f"%count_cf[0]
# else:
# print " >cf:none"
# print '----------------------------------'
### relevance value in stdb_dpl and stdb_cf
########### single search
cursor.close()
conn.close()
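For a multi-term query the script intersects the related-tag sets of all terms and multiplies their rank values, so a candidate must be relevant to every term at once. A toy illustration with made-up rank values for 'java ide':

rv = {'java': {'eclipse': 0.30, 'maven': 0.20},
      'ide':  {'eclipse': 0.25, 'vim':   0.10}}
common = set(rv['java']) & set(rv['ide'])               # {'eclipse'}
scores = [(t, rv['java'][t] * rv['ide'][t]) for t in common]
print sorted(scores, key=lambda x: x[1], reverse=True)  # [('eclipse', 0.075)]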

0
stdb/__init__.py Normal file

39
stdb/dpl_cf.py Normal file

@@ -0,0 +1,39 @@
#coding:utf-8
'''
Created on Oct 19, 2016
@author: StarLee
'''
import MySQLdb
conn2 = MySQLdb.connect("localhost","starlee","1234","query_expansion" )
cursor2 = conn2.cursor()
#load the project names to be filtered out of the final results
sql_tag_is_prj = 'select name,is_prj_name from tag'
cursor2.execute(sql_tag_is_prj)
tag_is_prj_name = {row[0]:row[1] for row in cursor2.fetchall()}
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
sql_dpl = "(SELECT t2,count FROM `stdb_dpl` WHERE `t1` = %s order by count desc limit 20) union (SELECT t1,count FROM `stdb_dpl` WHERE `t2` = %s order by count desc limit 20)"
sql_cf = "(SELECT t2,rv FROM `stdb_cf_0&1` WHERE `t1` = %s order by rv desc limit 20) union (SELECT t1,rv FROM `stdb_cf_0&1` WHERE `t2` = %s order by rv desc limit 20)"
tag = 'eclipse'
cursor.execute(sql_dpl,(tag,tag))
result = [item[0] for item in cursor.fetchall() if item[0] in tag_is_prj_name]
for r in result[:20]:
print r
print '\n---------------------\n'
cursor.execute(sql_cf,(tag,tag))
result = [item[0] for item in cursor.fetchall() if item[0] in tag_is_prj_name]
for r in result[:20]:
print r

124
stdb/stdb_cf.py Normal file

@@ -0,0 +1,124 @@
#coding:utf-8
'''
Created on Oct 7, 2016
@author: StarLee
'''
import MySQLdb
import logging
import numpy
import math
logger = logging.getLogger()
hdlr = logging.FileHandler("top_cf_2.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
sql_sel_tag = 'select id,tagname from tags'
sql_ins_tag_cf = "insert into stdb_cf(id1,t1,id2,t2,rv) values(%s,%s,%s,%s,%s)"
cursor.execute(sql_sel_tag)
total_tags = {item[1]:item[0] for item in cursor.fetchall()} #all tags: <name, id>
tags = total_tags.keys()
#
# top_users = dict() # top users of every tag
#
# for tag in tags:
# cursor.execute(sql_sel_top_user,(tag,))
# result = cursor.fetchall()
# top_users[tag] = set([item[0] for item in result])
# logger.info("%s is done"%tag)
#
# logger.info('top_users is done')
#####################################
top_users = dict() #top users of every tag
sql_sel_utc = 'select id,tag,user_id,count from user_tag_count where id>%s limit 1000'
start = 0
while True:
cursor.execute(sql_sel_utc,(start,))
result = cursor.fetchall()
lens = len(result)
logger.info(">=%d:%d"%(start,lens))
if lens==0:
logger.info("done top_users")
print "done top_users"
break
for item in result:
start = item[0]
if item[1] not in top_users:
top_users[item[1]] = list()
top_users[item[1]].append((item[2],item[3]))
for tag,u_c in top_users.items():
top_users[tag] = set([s_uc[0] for s_uc in sorted(u_c,key=lambda x:x[1],reverse=True)[:50]]) #keep each tag's 50 most active answerers
logger.info('done sorted top_users')
print 'done sorted top_users'
#####################################
for i in range(0,len(tags)):
if tags[i] not in top_users:
continue
top_user_i = top_users[tags[i]]
# logger.info(total_tags[tags[i]])
# print top_user_i
for j in range(i+1,len(tags)):
if tags[j] not in top_users:
continue
top_user_j = top_users[tags[j]]
# print top_user_j
# #union of the two user sets
# commen_users = top_user_i | top_user_j
# #build the binary membership vectors
# vec_i,vec_j = list(),list()
# for commen_user in commen_users:
# if commen_user in top_user_i:
# vec_i.append(1)
# else:
# vec_i.append(0)
#
# if commen_user in top_user_j:
# vec_j.append(1)
# else:
# vec_j.append(0)
#
# n_vec_i = numpy.matrix(vec_i)
# n_vec_j = numpy.matrix(vec_j)
# num = float(n_vec_i * n_vec_j.T)
#
# denom = numpy.linalg.norm(n_vec_i ) * numpy.linalg.norm( n_vec_j )
#
# cos = num / denom #cosine
commen_users = top_user_i & top_user_j
cos = len(commen_users) / math.sqrt(len(top_user_i) * len(top_user_j))
if cos > 0.00001:
cursor.execute(sql_ins_tag_cf,(total_tags[tags[i]],tags[i],total_tags[tags[j]],tags[j],cos))
# print num,denom,cos
conn.commit()
cursor.close()
conn.close()
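With binary top-answerer membership vectors, the cosine that the commented-out matrix code computed reduces to |A∩B| / sqrt(|A|·|B|), which is what the live loop uses. A quick numeric check:

import math
top_user_i = set([1, 2, 3, 4])
top_user_j = set([3, 4, 5])
print len(top_user_i & top_user_j) / math.sqrt(len(top_user_i) * len(top_user_j))
# 2 / sqrt(12) ~= 0.577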

81
stdb/stdb_co.py Normal file

@@ -0,0 +1,81 @@
#coding:utf-8
'''
Created on Oct 4, 2016
@author: StarLee
'''
import logging
logger = logging.getLogger()
hdlr = logging.FileHandler("tag_coocur.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
import MySQLdb
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
cursor.execute('select id,tagname from tags')
raw_tags = {item[1]:item[0] for item in cursor.fetchall()}
import re
def get_tags():
cursor.execute('select value from pointers where name = "tag_cooccur_posts"')
pointer = cursor.fetchone()[0]
logger.info("last pointer: %d"%pointer)
cursor.execute('select id,tags from posts where id>%s and tags is not null limit %s',(pointer,10000))
posts = cursor.fetchall()
if posts is None or len(posts) == 0:
return None
new_pointer = posts[-1][0]
cursor.execute('update pointers set value=%s where name = "tag_cooccur_posts"',(new_pointer,))
conn.commit()
return [re.findall('<([\s\S]*?)>', item[1]) for item in posts]
def save_tag_co_occure(tag_cooccure):
for key,value in tag_cooccure.items():
t1,t2 = key.split("-&-")
try:
id1 = raw_tags[t1]
except Exception,e:
id1 = -1
try:
id2 = raw_tags[t2]
except Exception,e:
id2 = -1
cursor.execute('insert into stdb_co(id1,t1,id2,t2,count) values(%s,%s,%s,%s,%s)',(id1,t1,id2,t2,value))
conn.commit()
tag_co_occure = dict() #already processed tag-pair: <tag-&-tag,count>
if __name__ == '__main__':
tagss = get_tags()
# print tagss
while tagss is not None :
for tags in tagss:
length = len(tags)
for i in range(0,length):
for j in range(i+1,length):
repre = '-&-'.join(sorted([tags[i],tags[j]]))
if repre not in tag_co_occure:
tag_co_occure[repre] = 0
tag_co_occure[repre] += 1
tagss = get_tags()
save_tag_co_occure(tag_co_occure)
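Sorting the two tag names before joining them with '-&-' makes the key order-independent, so (a,b) and (b,a) increment the same counter. A short sketch:

tag_co_occure = {}
for pair in [('java', 'ide'), ('ide', 'java')]:
    repre = '-&-'.join(sorted(pair))  # always 'ide-&-java'
    tag_co_occure[repre] = tag_co_occure.get(repre, 0) + 1
print tag_co_occure  # {'ide-&-java': 2}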

82
stdb/stdb_dpl.py Normal file
View File

@ -0,0 +1,82 @@
#coding:utf-8
'''
Created on Oct 4, 2016
@author: StarLee
'''
# count tag relations between duplicate posts
import logging
logger = logging.getLogger()
hdlr = logging.FileHandler("post_duplicate.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
import MySQLdb
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
import re
#fetch duplicate postlinks (LinkTypeId 3 = duplicate)
cursor.execute("SELECT Id,PostId,RelatedPostId FROM `postlinks` WHERE `LinkTypeId` = '3' AND `PostId` < '25829714' ORDER BY `PostId`")
postlinks = cursor.fetchall()
tag_dpl = dict() #already processed tag-pair: <tag-&-tag,count>
for postlink in postlinks:
post_id, related_post_id = postlink[1],postlink[2]
cursor.execute('select tags from posts where Id = %s',(post_id,))
tags1 = cursor.fetchone()
cursor.execute('select tags from posts where Id = %s',(related_post_id,))
tags2 = cursor.fetchone()
if tags1 is None or tags2 is None:
pass
else:
########################### comment for log view
ts1 = set(re.findall('<([\s\S]*?)>',tags1[0]))
ts2 = set(re.findall('<([\s\S]*?)>',tags2[0]))
com = ts1 & ts2
if len(ts1-com)>0 and len(ts2-com)>0:
logger.info("%s-%s>> %s - %s"%(post_id,related_post_id,','.join(list(ts1-com)),','.join(list(ts2-com))))
for item1 in list(ts1-com):
for item2 in list(ts2-com):
repre = '-&-'.join(sorted([item1,item2]))
if repre not in tag_dpl:
tag_dpl[repre] = 0
tag_dpl[repre] += 1
#
# cursor.execute('select id,tagname from tags')
# raw_tags = {item[1]:item[0] for item in cursor.fetchall()}
#
# for key, value in tag_dpl.items():
# t1, t2 = key.split("-&-")
# try:
# id1 = raw_tags[t1]
# except Exception, e:
# id1 = -1
# try:
# id2 = raw_tags[t2]
# except Exception, e:
# id2 = -1
# cursor.execute('insert into stdb_dpl(id1,t1,id2,t2,count) values(%s,%s,%s,%s,%s)', (id1, t1, id2, t2, value))
# conn.commit()
########################### comment for log view
# fp.write("%s-%s >> %s - %s\n"%(tags1[0],tags2[0],','.join(),','.join(list(ts2 - com))))
# count += 1
# print count

126
stdb/stdb_pl.py Normal file

@@ -0,0 +1,126 @@
#coding:utf-8
'''
Created on Oct 4, 2016
@author: StarLee
'''
import logging
logger = logging.getLogger()
hdlr = logging.FileHandler("tag_postlinks.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
import MySQLdb
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
cursor.execute('select id,name from tag')
raw_tags = {item[1]:item[0] for item in cursor.fetchall()}
import re
def get_post_links():
cursor.execute('select value from pointers where name = "postlinks"')
pointer = cursor.fetchone()[0]
logger.info("last pointer: %d"%pointer)
cursor.execute('select Id,PostId,RelatedPostId from postlinks where id>%s limit %s',(pointer,10000))
postlinks = cursor.fetchall()
# print postlinks
if postlinks is None or len(postlinks) == 0:
return None
new_pointer = postlinks[-1][0]
cursor.execute('update pointers set value=%s where name = "postlinks"',(new_pointer,))
conn.commit()
return postlinks
# return [ (item[1],item[2]) for item in postlinks]
def get_tags(post_id):
cursor.execute('select tags from posts where Id = %s',(post_id,))
tags = cursor.fetchone()
# print post_id
# print tags
if tags is None: #post not found: it has been deleted
return None
else:
tags = tags[0]
if tags is None: #the post is an answer, so its tags column is empty
return None
return re.findall('<([\s\S]*?)>',tags)
def update_bad_post_links(bad_post_links):
cursor.execute('select value from pointers where name = "bad_postlinks"')
count = cursor.fetchone()[0]
cursor.execute('update pointers set value=%s where name = "bad_postlinks"',(count + bad_post_links,))
conn.commit()
def save_tag_postlinks(tag_post_link):
for key,value in tag_post_link.items():
t1,t2 = key.split("-&-")
try:
id1 = raw_tags[t1]
except Exception,e:
id1 = -1
try:
id2 = raw_tags[t2]
except Exception,e:
id2 = -1
cursor.execute('insert into tag_postlinks(id1,t1,id2,t2,count) values(%s,%s,%s,%s,%s)',(id1,t1,id2,t2,value))
conn.commit()
tag_post_link = dict() #already processed tag-pair: <tag-&-tag,count>
if __name__ == '__main__':
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/2010010 1 Firefox/34.0"}
#step 1: load db of processed tag-postlinks
#step 3.1 select batch-size postlinks
postlinks = get_post_links()
while postlinks is not None :
# bad_post_links = 0
for postlink in postlinks:
# logger.info(postlink[0])
post_id, related_post_id = postlink[1],postlink[2]
#step 3.2 get corresponding tags
tags = get_tags(post_id)
related_tags = get_tags(related_post_id)
if tags is None or related_tags is None:
# logger.info('!!bad postlinks')
cursor.execute('insert into bad_tag_postlinks(post_id,related_post_id) values(%s,%s)',(post_id,related_post_id))
# bad_post_links += 1
continue
# print '%s <-> %s'%(','.join(tags),','.join(related_tags))
#step 3.2 match and count corresponding tags
for tag in tags:
for related_tag in related_tags:
# print "\t%s-%s"%(tag,related_tag)
if tag == related_tag:
#print "\tignore"
pass
else:
repre = '-&-'.join(sorted([tag,related_tag]))
if repre not in tag_post_link:
tag_post_link[repre] = 0
tag_post_link[repre] += 1
# print 'processed'
# update_bad_post_links(bad_post_links)
postlinks = get_post_links()
save_tag_postlinks(tag_post_link)