search_by_postlink is done

StarLee 2016-10-06 21:49:14 +08:00
parent eb0f4e1bd2
commit fcdf2e8e6b
3 changed files with 162 additions and 38 deletions

search_by_postlink.py (new file, 84 additions)

@@ -0,0 +1,84 @@
#coding:utf-8
'''
Created on 2016-10-06
@author: StarLee
'''
import MySQLdb

conn2 = MySQLdb.connect("localhost", "starlee", "1234", "query_expansion")
cursor2 = conn2.cursor()

# which tag names are project names
sql_tag_is_prj = 'select name,is_prj_name from tag'
cursor2.execute(sql_tag_is_prj)
tag_is_prj_name = {row[0]: row[1] for row in cursor2.fetchall()}

# load the stop-word tags to filter out
with open("stop_word_prj_name") as f:
    stop_words_prj_name = set(line.strip() for line in f if (not line.startswith("#")) and line.strip())

sql_get_syns = "select from_tag,to_tag from synonym"
cursor2.execute(sql_get_syns)
syns = {row[0]: row[1] for row in cursor2.fetchall()}

conn = MySQLdb.connect(host="localhost", user="starlee", passwd="1234", db="sldb", charset='utf8')
cursor = conn.cursor()
sql_sel_linked_tags = 'select t2,count from tag_postlinks where t1=%s union select t1,count from tag_postlinks where t2=%s'

def get_syns(term):
    # map a tag to its canonical synonym, if one exists
    if term not in syns:
        return term
    return syns[term]

def get_linked_tags(item):
    # return the tags related to a tag, as {tag: normalized count}
    # note: a count distribution like 4,1,1,1,1,1,1,1,1 spreads the
    # normalized score and pulls down the dominant tag's share
    cursor.execute(sql_sel_linked_tags, (item, item))
    result = cursor.fetchall()
    total_count = sum(row[1] for row in result)
    return {get_syns(row[0]): 1.0 * row[1] / total_count for row in result}

if __name__ == '__main__':
    query = 'android database'
    items = [get_syns(term) for term in query.split(' ')]

    total_linked_tags = list()
    linkedtags = get_linked_tags(items[0])
    total_linked_tags.append(linkedtags)
    commen_tags = set(linkedtags.keys())  # tags linked to every item of the query
    for item in items[1:]:
        linkedtags = get_linked_tags(item)
        total_linked_tags.append(linkedtags)
        commen_tags.intersection_update(set(linkedtags.keys()))
    print commen_tags

    commen_tags_score = list()
    for item in commen_tags:
        # skip tags known not to be project names, and stop-word tags
        if (item in tag_is_prj_name and tag_is_prj_name[item] == 0) or item in stop_words_prj_name:
            continue
        score = 1
        for tlt in total_linked_tags:
            score *= tlt[item]
        commen_tags_score.append((item, score))

    for item in sorted(commen_tags_score, key=lambda x: x[1], reverse=True)[:20]:
        print item[0]

    cursor.close()
    conn.close()
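
Read as an algorithm, the main block does three things: canonicalize each query term via the synonym map, intersect the linked-tag sets of all terms, and rank the surviving tags by the product of their per-term weights. A minimal self-contained sketch of that scoring rule, with made-up weights standing in for the normalized tag_postlinks counts:

# Toy sketch of the ranking above; the weight dicts are hypothetical
# stand-ins for what get_linked_tags() computes from tag_postlinks.
linked = [
    {'sqlite': 0.5, 'mysql': 0.3, 'listview': 0.2},  # linked to 'android'
    {'sqlite': 0.6, 'mysql': 0.4},                   # linked to 'database'
]
common = set(linked[0])
for d in linked[1:]:
    common.intersection_update(d)   # keep tags linked to every term
scores = []
for tag in common:
    score = 1.0
    for d in linked:
        score *= d[tag]             # product of per-term weights
    scores.append((tag, score))
# sqlite: 0.5*0.6 = 0.30, mysql: 0.3*0.4 = 0.12; 'listview' drops out
print sorted(scores, key=lambda x: x[1], reverse=True)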

stop_word_prj_name (new file, 35 additions)

@@ -0,0 +1,35 @@
#programming language
java
php
python
ruby
scala
c
c++
objective-c
perl
c#
.net
asp
go
groovy
fortran
swift
lisp
erlang
#operating system
windows
linux
android
ubuntu
ios
#industry standard
json
oauth
http
java-ee
java-se
java-me

test.py (81 changed lines)

@@ -1,43 +1,48 @@
(The +/- markers of this hunk were lost in extraction; it re-comments the old scratch header, uncomments the word2vec experiment, and appends a set-intersection check. test.py after the commit:)

# # # #coding: utf-8
# # # '''
# # # Created on 2016-09-15
# # #
# # # @author: StarLee
# # # '''
import nltk
#
# # nltk.app.chunkparser()
from gensim.models import word2vec
from gensim.models import Word2Vec
#
text_in_gh = 'Redis is an in-memory database that persists on disk. The data model is key-value, but many different kind of values are supported: Strings, Lists, Sets, Sorted Sets, Hashes, HyperLogLogs, Bitmaps. http://redis.io'
text_in_so = 'An open source BSD-licensed in-memory data structure store used as database, cache and message broker. Supports data structures such as strings, hashes, lists, sets, sorted sets with range queries, bitmaps, hyperloglogs and geospatial indexes with radius queries. Has built-in replication, Lua scripting, LRU eviction, transactions and different levels of on-disk persistence, high availability via Redis Sentinel and automatic partitioning with Redis Cluster.'
#
# # tokens = nltk.word_tokenize(text_in_gh)
# tokens = nltk.word_tokenize(text_in_so)
#
# pos = nltk.pos_tag(tokens)
# grammer = 'NP: {<DT>?<JJ>*<NN>}'
# cp = nltk.RegexpParser(grammer)
# result = cp.parse(pos)
# print pos
# print result

# sentence-split, then word-tokenize the GitHub and StackOverflow descriptions
tokens_1 = [nltk.word_tokenize(item) for item in nltk.sent_tokenize(text_in_gh)]
tokens_2 = [nltk.word_tokenize(item) for item in nltk.sent_tokenize(text_in_so)]

# model trained on the GitHub description only
model_1 = Word2Vec(tokens_1, min_count=1)
print model_1
print model_1.most_similar('database')
print model_1.vocab

# continue training model_1 on the StackOverflow description
print word2vec.train_batch_cbow(model_1, tokens_2, 0.025, None, None)
print model_1
print model_1.most_similar('database')
print model_1.vocab

# model trained on both descriptions at once
tokens_2.extend(tokens_1)
model_2 = Word2Vec(tokens_2, min_count=1)
print model_2
print model_2.most_similar('database')
print model_2.vocab

# quick check of set.intersection_update, as used in search_by_postlink.py
a = set([1, 2, 3, 4])
b = set([2, 3])
a.intersection_update(b)
print a
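
The train_batch_cbow call above reaches into gensim's internal batch trainer to keep training model_1 on the second text. On a current gensim (4.x, where that internal function is no longer exposed), the same experiment would go through the public API; a hedged sketch, reusing tokens_1/tokens_2 from above and assuming the 4.x names vector_size, build_vocab(update=True), and train():

# Continue-training sketch for gensim >= 4.0 (assumed API, not the
# version this commit was written against).
from gensim.models import Word2Vec

model = Word2Vec(tokens_1, vector_size=100, min_count=1)
model.build_vocab(tokens_2, update=True)   # grow the vocabulary in place
model.train(tokens_2, total_examples=len(tokens_2), epochs=model.epochs)
print(model.wv.most_similar('database'))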