Correlation search and project similarity are done.

This commit is contained in:
StarLee 2016-10-27 15:50:28 +08:00
parent 1812ac7fa8
commit 53145f48a0
6 changed files with 105 additions and 77 deletions

View File

@ -0,0 +1,25 @@
#coding:utf-8
'''
Created on 2016年10月27日
@author: StarLee
'''
import MySQLdb
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
conn2 = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="query_expansion",charset='utf8' )
cursor2 = conn2.cursor()
cursor2.execute('select name,is_prj_name from tag')
ot = cursor2.fetchall()
count = 0
for t in ot:
if t[1] == 1:
print t[0]
print count
cursor.execute('update tags set is_prj_name=1 where tagname=%s',(t[0],))
count += 1
conn.commit()

View File

@ -0,0 +1,37 @@
#coding:utf-8
'''
Created on 2016年10月27日
@author: StarLee
'''
import MySQLdb
#!!!!!!!!!!后面要判断连接的有效性
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
sql_sel_all_co = 'select id1,t1,id2,t2,count,id from stdb_co_backup where id>%s limit 1000'
sql_sel_prj = 'select id from tags where is_prj_name=1'
cursor.execute(sql_sel_prj)
prjs = [item[0] for item in cursor.fetchall()]
start = 2387064
while True:
cursor.execute(sql_sel_all_co,(start,))
co = cursor.fetchall()
if len(co) == 0:
break
for item in co:
print item[5]
if item[2] in prjs:
cursor.execute("insert into stdb_co(id1,t1,id2,t2,count) values(%s,%s,%s,%s,%s)",item[:5])
if item[0] in prjs:
cursor.execute("insert into stdb_co(id2,t2,id1,t1,count) values(%s,%s,%s,%s,%s)",item[:5])
conn.commit()
start += 1000
cursor.close()
conn.close()

View File

@ -47,6 +47,7 @@ def extract_so_excerpt(text):
def get_text_vec(text):
all_token = get_tag_tokens(text, extract_so_excerpt)
if len(all_token) is 0:
return None
all_vec = list()
@ -94,18 +95,36 @@ if __name__ == '__main__':
# # convert tag to vector
# sql_sel_tag = 'select id,excerpt from tag_info where excerpt is not Null'
# sql_sel_tag = 'select id,excerpt,wiki from tag_info where excerpt is not Null or wiki is not Null'
# sql_ins_tag_vec = 'update tag_info set vec=%s where id=%s'
# cursor.execute(sql_sel_tag)
# tags = cursor.fetchall()
# for tag in tags:
# tag_id,excerpt = tag
# tag_id,excerpt,wiki = tag
# print tag_id
# if excerpt.strip() != '':
# te_v = get_text_vec(excerpt)
# if te_v is not None:
# vec_str = pickle.dumps(te_v)
# cursor.execute(sql_ins_tag_vec,(vec_str,tag_id))
#
# all_token = list()
# if excerpt is not None and excerpt.strip() != '':
# all_token.extend(get_tag_tokens(excerpt, extract_so_excerpt))
#
# if wiki is not None and wiki.strip() != '':
# all_token.extend(get_tag_tokens(wiki, extract_so_wiki_body))
#
# if len(all_token) != 0:
#
# all_vec = list()
# for token in all_token:
# if token in model_2:
# all_vec.append(model_2[token])
#
# final_vec = np.array([0.0] * len(all_vec[0]))
# for vec in all_vec:
# final_vec += vec
#
# tag_vec = final_vec / len(all_vec)
#
# vec_str = pickle.dumps(tag_vec)
# cursor.execute(sql_ins_tag_vec,(vec_str,tag_id))
# conn.commit()
# # convert tag to vector
@ -116,17 +135,17 @@ if __name__ == '__main__':
# get similarity between two tags(prj) with vector
with open("stop_word_prj_name") as file:
stop_words_prj_name = set([line.strip() for line in file.readlines() if(( not line.startswith("#")) and line.strip()) ])
sql_sel_prj_tag = 'select id from tags where is_prj_name=1'
cursor.execute(sql_sel_prj_tag)
prj_tags = set([item[0] for item in cursor.fetchall()])
sql_sel_tag_vec = 'select id,vec from tag_info where vec is not Null'
cursor.execute(sql_sel_tag_vec)
prj_vec = [item for item in cursor.fetchall() if item[0] in prj_tags]
print len(prj_vec)
for prj,vec in prj_vec:
print prj
for oprj,ovec in prj_vec:
@ -134,68 +153,8 @@ if __name__ == '__main__':
sim = coss(pickle.loads(str(vec)),pickle.loads(str(ovec)))
cursor.execute('insert into prj_sim(id1,id2,sim) values(%s,%s,%s)',(prj,oprj,sim))
conn.commit()
# get similarity between two tags(prj) with vector
# tag_vec = dict()
# for tag in tags[0:]:
# if tag in model_2:
# tag_vec[tag] = get_text_vec(tag)
#
#
# for k1,v1 in tag_vec.items():
# for k2,v2 in tag_vec.items():
# print "%s-%s:%f"%(k1,k2,coss(v1,v2))
# print '--------------------'
# X = []
# valid_tag = list()
# for tag in tags[0:]:
# if tag in model_2:
# valid_tag.append(tag)
# X.append(list(get_text_vec(tag)))
#
# cp = hcluster.fclusterdata(np.array(X),t=1)
# ct = dict()
# for pos in range(0,len(cp)):
# if cp[pos] not in ct:
# ct[cp[pos]] = list()
# ct[cp[pos]].append((pos,valid_tag[pos]))
#
# for key,value in ct.iteritems():
# print value
#
# X =[[1,1],
# [1,2],
# [2,1],
# [2,2],
#
# [3,7],
# [3,8],
# [4,7],
# [4,8],
#
# [7,2],
# [7,3],
# [8,2],
# [8,3]
#
# ]
#
# print hcluster.fclusterdata(np.array(X),t=1)
# import matplotlib.pylab as plt
#
# d = hcluster.distance.pdist(X)
#
# Z= hcluster.linkage(d,method='complete')
#
# P =hcluster.dendrogram(Z)
#
# plt.savefig('plot_dendrogram.png')
# print 'done'

Binary file not shown.

View File

@ -166,9 +166,9 @@ def cans_query(items):
weight = [1,0.5,0.5]
total_related_result = list()#针对每种方法方法的相关tag的集合
related_functions = [
# get_co_tags#后面的获得相关tag均用这个函数名,
get_co_tags#后面的获得相关tag均用这个函数名,
# get_duplink_tags#后面的获得相关tag均用这个函数名,
get_cf_tags#后面的获得相关tag均用这个函数名
# get_cf_tags#后面的获得相关tag均用这个函数名
]
for related_fun in related_functions:
get_related_tags = related_fun
@ -235,7 +235,7 @@ if __name__ == '__main__':
try:
old_value = cans[key]
if old_value < value:#选相关性较大的那个
cans[old_value] = value
cans[key] = value
except Exception,e:#说明新的结果在can中不存在直接添加即可
cans[key] = value

11
test.py
View File

@ -10,5 +10,12 @@ from __builtin__ import str
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
print ' 123 '
print ' 123 '.s
l = [1,2,3]
m = {item:item for item in l}
print m
print m[1]
print m[1]
print m[1]
m[1] = 2
print m