search_by_postlink is done

StarLee 2016-10-06 21:49:14 +08:00
parent eb0f4e1bd2
commit fcdf2e8e6b
3 changed files with 162 additions and 38 deletions

search_by_postlink.py (new file, 84 additions)

@@ -0,0 +1,84 @@
#coding:utf-8
'''
Created on 2016-10-06
@author: StarLee
'''
import MySQLdb

conn2 = MySQLdb.connect("localhost", "starlee", "1234", "query_expansion")
cursor2 = conn2.cursor()

# which tag names are project names
sql_tag_is_prj = 'select name,is_prj_name from tag'
cursor2.execute(sql_tag_is_prj)
tag_is_prj_name = {row[0]: row[1] for row in cursor2.fetchall()}

# load the stop-word tags to filter out
with open("stop_word_prj_name") as f:
    stop_words_prj_name = set(line.strip() for line in f if (not line.startswith("#")) and line.strip())

sql_get_syns = "select from_tag,to_tag from synonym"
cursor2.execute(sql_get_syns)
syns = {row[0]: row[1] for row in cursor2.fetchall()}

conn = MySQLdb.connect(host="localhost", user="starlee", passwd="1234", db="sldb", charset='utf8')
cursor = conn.cursor()
sql_sel_linked_tags = 'select t2,count from tag_postlinks where t1=%s union select t1,count from tag_postlinks where t2=%s'

def get_syns(term):
    # map a tag to its canonical synonym, if one exists
    if term not in syns:
        return term
    return syns[term]

def get_linked_tags(item):
    # return the tags related to a tag, as {tag: normalized count}
    # note: a count distribution like 4,1,1,1,1,1,1,1,1 spreads the
    # normalized score and pulls down the dominant tag's share
    cursor.execute(sql_sel_linked_tags, (item, item))
    result = cursor.fetchall()
    total_count = sum(row[1] for row in result)
    return {get_syns(row[0]): 1.0 * row[1] / total_count for row in result}

if __name__ == '__main__':
    query = 'android database'
    items = [get_syns(term) for term in query.split(' ')]

    total_linked_tags = list()
    linkedtags = get_linked_tags(items[0])
    total_linked_tags.append(linkedtags)
    commen_tags = set(linkedtags.keys())  # tags linked to every item of the query
    for item in items[1:]:
        linkedtags = get_linked_tags(item)
        total_linked_tags.append(linkedtags)
        commen_tags.intersection_update(set(linkedtags.keys()))
    print commen_tags

    commen_tags_score = list()
    for item in commen_tags:
        # skip tags known not to be project names, and stop-word tags
        if (item in tag_is_prj_name and tag_is_prj_name[item] == 0) or item in stop_words_prj_name:
            continue
        score = 1
        for tlt in total_linked_tags:
            score *= tlt[item]
        commen_tags_score.append((item, score))

    for item in sorted(commen_tags_score, key=lambda x: x[1], reverse=True)[:20]:
        print item[0]

    cursor.close()
    conn.close()
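
Read as an algorithm, the main block does three things: canonicalize each query term via the synonym map, intersect the linked-tag sets of all terms, and rank the surviving tags by the product of their per-term weights. A minimal self-contained sketch of that scoring rule, with made-up weights standing in for the normalized tag_postlinks counts:

# Toy sketch of the ranking above; the weight dicts are hypothetical
# stand-ins for what get_linked_tags() computes from tag_postlinks.
linked = [
    {'sqlite': 0.5, 'mysql': 0.3, 'listview': 0.2},  # linked to 'android'
    {'sqlite': 0.6, 'mysql': 0.4},                   # linked to 'database'
]
common = set(linked[0])
for d in linked[1:]:
    common.intersection_update(d)   # keep tags linked to every term
scores = []
for tag in common:
    score = 1.0
    for d in linked:
        score *= d[tag]             # product of per-term weights
    scores.append((tag, score))
# sqlite: 0.5*0.6 = 0.30, mysql: 0.3*0.4 = 0.12; 'listview' drops out
print sorted(scores, key=lambda x: x[1], reverse=True)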

stop_word_prj_name (new file, 35 additions)

@@ -0,0 +1,35 @@
#programming language
java
php
python
ruby
scala
c
c++
objective-c
perl
c#
.net
asp
go
groovy
fortran
swift
lisp
erlang
#operating system
windows
linux
android
ubuntu
ios
#industry standard
json
oauth
http
java-ee
java-se
java-me

test.py (81 changed lines)

@@ -1,43 +1,48 @@
(The +/- markers of this hunk were lost in extraction; it re-comments the old scratch header, uncomments the word2vec experiment, and appends a set-intersection check. test.py after the commit:)

# # # #coding: utf-8
# # # '''
# # # Created on 2016-09-15
# # #
# # # @author: StarLee
# # # '''
import nltk
#
# # nltk.app.chunkparser()
from gensim.models import word2vec
from gensim.models import Word2Vec
#
text_in_gh = 'Redis is an in-memory database that persists on disk. The data model is key-value, but many different kind of values are supported: Strings, Lists, Sets, Sorted Sets, Hashes, HyperLogLogs, Bitmaps. http://redis.io'
text_in_so = 'An open source BSD-licensed in-memory data structure store used as database, cache and message broker. Supports data structures such as strings, hashes, lists, sets, sorted sets with range queries, bitmaps, hyperloglogs and geospatial indexes with radius queries. Has built-in replication, Lua scripting, LRU eviction, transactions and different levels of on-disk persistence, high availability via Redis Sentinel and automatic partitioning with Redis Cluster.'
#
# # tokens = nltk.word_tokenize(text_in_gh)
# tokens = nltk.word_tokenize(text_in_so)
#
# pos = nltk.pos_tag(tokens)
# grammer = 'NP: {<DT>?<JJ>*<NN>}'
# cp = nltk.RegexpParser(grammer)
# result = cp.parse(pos)
# print pos
# print result

# sentence-split, then word-tokenize the GitHub and StackOverflow descriptions
tokens_1 = [nltk.word_tokenize(item) for item in nltk.sent_tokenize(text_in_gh)]
tokens_2 = [nltk.word_tokenize(item) for item in nltk.sent_tokenize(text_in_so)]

# model trained on the GitHub description only
model_1 = Word2Vec(tokens_1, min_count=1)
print model_1
print model_1.most_similar('database')
print model_1.vocab

# continue training model_1 on the StackOverflow description
print word2vec.train_batch_cbow(model_1, tokens_2, 0.025, None, None)
print model_1
print model_1.most_similar('database')
print model_1.vocab

# model trained on both descriptions at once
tokens_2.extend(tokens_1)
model_2 = Word2Vec(tokens_2, min_count=1)
print model_2
print model_2.most_similar('database')
print model_2.vocab

# quick check of set.intersection_update, as used in search_by_postlink.py
a = set([1, 2, 3, 4])
b = set([2, 3])
a.intersection_update(b)
print a
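
The train_batch_cbow call above reaches into gensim's internal batch trainer to keep training model_1 on the second text. On a current gensim (4.x, where that internal function is no longer exposed), the same experiment would go through the public API; a hedged sketch, reusing tokens_1/tokens_2 from above and assuming the 4.x names vector_size, build_vocab(update=True), and train():

# Continue-training sketch for gensim >= 4.0 (assumed API, not the
# version this commit was written against).
from gensim.models import Word2Vec

model = Word2Vec(tokens_1, vector_size=100, min_count=1)
model.build_vocab(tokens_2, update=True)   # grow the vocabulary in place
model.train(tokens_2, total_examples=len(tokens_2), epochs=model.epochs)
print(model.wv.most_similar('database'))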