correlation search and prj sim are done
This commit is contained in:
parent 1812ac7fa8
commit 53145f48a0
@@ -0,0 +1,25 @@
#coding:utf-8
'''
Created on 2016-10-27

@author: StarLee
'''
import MySQLdb

conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()


conn2 = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="query_expansion",charset='utf8' )
cursor2 = conn2.cursor()

# read tag names and their is_prj_name flag from query_expansion.tag
cursor2.execute('select name,is_prj_name from tag')
ot = cursor2.fetchall()

count = 0
for t in ot:
    if t[1] == 1:
        print t[0]
        print count
        # mirror the project-name flag onto the matching row in sldb.tags
        cursor.execute('update tags set is_prj_name=1 where tagname=%s',(t[0],))
        count += 1
conn.commit()
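A side note on the loop above: each flagged tag is written with its own execute() call and committed once at the end. An equivalent batched form using MySQLdb's executemany (a sketch for illustration only, not the committed code) would be:

# hypothetical batched variant of the flag-copy loop
prj_names = [t[0] for t in ot if t[1] == 1]
cursor.executemany('update tags set is_prj_name=1 where tagname=%s',
                   [(name,) for name in prj_names])
conn.commit()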
@@ -0,0 +1,37 @@
#coding:utf-8
'''
Created on 2016-10-27

@author: StarLee
'''

import MySQLdb
#!!!!!!!!!! need to check the validity of the connection later on
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()
sql_sel_all_co = 'select id1,t1,id2,t2,count,id from stdb_co_backup where id>%s limit 1000'

sql_sel_prj = 'select id from tags where is_prj_name=1'
cursor.execute(sql_sel_prj)
prjs = [item[0] for item in cursor.fetchall()]

start = 2387064

while True:
    # fetch the next batch of 1000 co-occurrence rows
    cursor.execute(sql_sel_all_co,(start,))
    co = cursor.fetchall()
    if len(co) == 0:
        break

    for item in co:
        print item[5]
        # keep a pair only when one side is a project tag, storing the project tag in id2/t2
        if item[2] in prjs:
            cursor.execute("insert into stdb_co(id1,t1,id2,t2,count) values(%s,%s,%s,%s,%s)",item[:5])
        if item[0] in prjs:
            cursor.execute("insert into stdb_co(id2,t2,id1,t1,count) values(%s,%s,%s,%s,%s)",item[:5])

    conn.commit()
    start += 1000

cursor.close()
conn.close()
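Note that the loop above advances start by a fixed 1000 and relies on the 1000 returned rows being the next 1000 consecutive ids; if the id sequence has gaps, some rows are processed more than once. A keyset-style variant (a sketch for illustration, not the committed code) orders by id and advances by the largest id actually returned:

# hypothetical keyset-pagination variant of the batch loop above
sql_sel_all_co_ordered = ('select id1,t1,id2,t2,count,id from stdb_co_backup '
                          'where id>%s order by id limit 1000')
start = 2387064
while True:
    cursor.execute(sql_sel_all_co_ordered, (start,))
    co = cursor.fetchall()
    if not co:
        break
    # ... same per-row filtering and inserts as above ...
    start = co[-1][5]  # advance to the largest id actually returned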
103 other/tag_doc.py
@@ -47,6 +47,7 @@ def extract_so_excerpt(text):
def get_text_vec(text):

    all_token = get_tag_tokens(text, extract_so_excerpt)

    if len(all_token) == 0:
        return None
    all_vec = list()
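The hunk cuts get_text_vec off after all_vec is initialised; judging from the commented-out block further down in this file, the remainder presumably averages the word vectors of the extracted tokens. A sketch of that pattern (assuming model_2 is the loaded word-vector model used elsewhere in tag_doc.py; this is not the committed code):

def get_text_vec_sketch(text):
    # hypothetical sketch: average the model_2 vectors of the tokens extracted from the text
    all_token = get_tag_tokens(text, extract_so_excerpt)
    all_vec = [model_2[token] for token in all_token if token in model_2]
    if len(all_vec) == 0:
        return None
    final_vec = np.array([0.0] * len(all_vec[0]))
    for vec in all_vec:
        final_vec += vec
    return final_vec / len(all_vec)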
@@ -94,18 +95,36 @@ if __name__ == '__main__':


    # # convert tag to vector
    # sql_sel_tag = 'select id,excerpt from tag_info where excerpt is not Null'
    # sql_sel_tag = 'select id,excerpt,wiki from tag_info where excerpt is not Null or wiki is not Null'
    # sql_ins_tag_vec = 'update tag_info set vec=%s where id=%s'
    # cursor.execute(sql_sel_tag)
    # tags = cursor.fetchall()
    # for tag in tags:
    #     tag_id,excerpt = tag
    #     tag_id,excerpt,wiki = tag
    #     print tag_id
    #     if excerpt.strip() != '':
    #         te_v = get_text_vec(excerpt)
    #         if te_v is not None:
    #             vec_str = pickle.dumps(te_v)
    #             cursor.execute(sql_ins_tag_vec,(vec_str,tag_id))
    #
    #     all_token = list()
    #     if excerpt is not None and excerpt.strip() != '':
    #         all_token.extend(get_tag_tokens(excerpt, extract_so_excerpt))
    #
    #     if wiki is not None and wiki.strip() != '':
    #         all_token.extend(get_tag_tokens(wiki, extract_so_wiki_body))
    #
    #     if len(all_token) != 0:
    #
    #         all_vec = list()
    #         for token in all_token:
    #             if token in model_2:
    #                 all_vec.append(model_2[token])
    #
    #         final_vec = np.array([0.0] * len(all_vec[0]))
    #         for vec in all_vec:
    #             final_vec += vec
    #
    #         tag_vec = final_vec / len(all_vec)
    #
    #         vec_str = pickle.dumps(tag_vec)
    #         cursor.execute(sql_ins_tag_vec,(vec_str,tag_id))
    # conn.commit()
    # # convert tag to vector


@@ -116,17 +135,17 @@ if __name__ == '__main__':
    # get similarity between two tags(prj) with vector
    with open("stop_word_prj_name") as file:
        stop_words_prj_name = set([line.strip() for line in file.readlines() if (not line.startswith("#")) and line.strip()])

    sql_sel_prj_tag = 'select id from tags where is_prj_name=1'
    cursor.execute(sql_sel_prj_tag)
    prj_tags = set([item[0] for item in cursor.fetchall()])

    sql_sel_tag_vec = 'select id,vec from tag_info where vec is not Null'
    cursor.execute(sql_sel_tag_vec)
    prj_vec = [item for item in cursor.fetchall() if item[0] in prj_tags]

    print len(prj_vec)

    for prj,vec in prj_vec:
        print prj
        for oprj,ovec in prj_vec:
@@ -134,68 +153,8 @@ if __name__ == '__main__':
            sim = coss(pickle.loads(str(vec)),pickle.loads(str(ovec)))
            cursor.execute('insert into prj_sim(id1,id2,sim) values(%s,%s,%s)',(prj,oprj,sim))
            conn.commit()

    # get similarity between two tags(prj) with vector


    # tag_vec = dict()
    # for tag in tags[0:]:
    #     if tag in model_2:
    #         tag_vec[tag] = get_text_vec(tag)
    #
    #
    # for k1,v1 in tag_vec.items():
    #     for k2,v2 in tag_vec.items():
    #         print "%s-%s:%f"%(k1,k2,coss(v1,v2))
    #     print '--------------------'
    # X = []
    # valid_tag = list()
    # for tag in tags[0:]:
    #     if tag in model_2:
    #         valid_tag.append(tag)
    #         X.append(list(get_text_vec(tag)))
    #
    # cp = hcluster.fclusterdata(np.array(X),t=1)
    # ct = dict()
    # for pos in range(0,len(cp)):
    #     if cp[pos] not in ct:
    #         ct[cp[pos]] = list()
    #     ct[cp[pos]].append((pos,valid_tag[pos]))
    #
    # for key,value in ct.iteritems():
    #     print value
    #

    # X = [[1,1],
    #      [1,2],
    #      [2,1],
    #      [2,2],
    #
    #      [3,7],
    #      [3,8],
    #      [4,7],
    #      [4,8],
    #
    #      [7,2],
    #      [7,3],
    #      [8,2],
    #      [8,3]
    #      ]
    #
    # print hcluster.fclusterdata(np.array(X),t=1)


    # import matplotlib.pylab as plt
    #
    # d = hcluster.distance.pdist(X)
    #
    # Z = hcluster.linkage(d,method='complete')
    #
    # P = hcluster.dendrogram(Z)
    #
    # plt.savefig('plot_dendrogram.png')
    # print 'done'
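The prj_sim loop above relies on a coss helper that is defined elsewhere in tag_doc.py and is not part of this diff; based on how it is used (similarity between two averaged word vectors), it presumably computes cosine similarity. A minimal NumPy sketch of such a helper, for reference only:

import numpy as np

def coss(v1, v2):
    # cosine similarity between two equal-length vectors; 0.0 if either is all zeros
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0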
Binary file not shown.
@@ -166,9 +166,9 @@ def cans_query(items):
    weight = [1,0.5,0.5]
    total_related_result = list()  # the collection of related tags produced by each method
    related_functions = [
        # get_co_tags,  # every later related-tag lookup goes through this function name
        get_co_tags,  # every later related-tag lookup goes through this function name
        # get_duplink_tags,  # every later related-tag lookup goes through this function name
        get_cf_tags  # every later related-tag lookup goes through this function name
        # get_cf_tags  # every later related-tag lookup goes through this function name
    ]
    for related_fun in related_functions:
        get_related_tags = related_fun
@@ -235,7 +235,7 @@ if __name__ == '__main__':
    try:
        old_value = cans[key]
        if old_value < value:  # keep the one with the larger relevance
            cans[old_value] = value
            cans[key] = value
    except Exception,e:  # the new result is not in cans yet, so just add it
        cans[key] = value

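The one-line change in this hunk replaces the buggy cans[old_value] = value with cans[key] = value, so the larger relevance score is kept under the right key. The same merge can be written without the exception-driven flow (a sketch only; new_results is a placeholder for whatever produces the key/value pairs here):

# hypothetical equivalent of the try/except merge above:
# keep, for every candidate tag, the largest relevance score seen so far
for key, value in new_results:
    if key not in cans or cans[key] < value:
        cans[key] = value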
11 test.py
@@ -10,5 +10,12 @@ from __builtin__ import str
conn = MySQLdb.connect(host="localhost",user="starlee",passwd="1234",db="sldb",charset='utf8' )
cursor = conn.cursor()

print ' 123 '
print ' 123 '.s
l = [1,2,3]

m = {item:item for item in l}
print m
print m[1]
print m[1]
print m[1]
m[1] = 2
print m