oschina_recommendation/xietongguolv.py

# -*- coding: utf-8 -*-
from numpy import *
import time
import psycopg2
import threading

# 协同过滤推荐算法主要分为：


# class CF:
#
#     def __init__(self, movies, ratings, maxrating, k=5, n=10):
#         self.movies = movies
#         self.ratings = ratings
#         self.maxrating = maxrating
#         # 邻居个数
#         self.k = k
#         # 推荐个数
#         self.n = n
#         # 用户对电影的评分
#         # 数据格式{'UserID：用户ID':[(MovieID：电影ID,Rating：用户对电影的评星)]}
#         self.userDict = {}
#         # 对某电影评分的用户
#         # 数据格式：{'MovieID：电影ID',[UserID：用户ID]}
#         # {'1',[1,2,3..],...}
#         self.ItemUser = {}
#         # 邻居的信息
#         self.neighbors = []
#         # 推荐列表
#         self.recommandList = []
#         self.cost = 0.0
#
#     # 基于用户的推荐
#     # 根据对电影的评分计算用户之间的相似度
#     def recommendByUser(self, userId):
#         self.formatRate()
#         # 推荐个数 等于 本身评分电影个数，用户计算准确率
#         self.n = len(self.userDict[userId])
#         self.getNearestNeighbor(userId)
#         self.getrecommandList(userId)
#         self.getPrecision(userId)
#
#     # 获取推荐列表
#     def getrecommandList(self, userId):
#         self.recommandList = []
#         # 建立推荐字典
#         recommandDict = {}
#         for neighbor in self.neighbors:
#             movies = self.userDict[neighbor[1]]
#             for movie in movies:
#                 if(movie[0] in recommandDict):
#                     recommandDict[movie[0]] += neighbor[0]
#                 else:
#                     recommandDict[movie[0]] = neighbor[0]
#
#         # 建立推荐列表
#         for key in recommandDict:
#             self.recommandList.append([recommandDict[key], key])
#         self.recommandList.sort(reverse=True)
#         self.recommandList = self.recommandList[:self.n]
#
#     # 将ratings转换为userDict和ItemUser
#     def formatRate(self):
#         self.userDict = {}
#         self.ItemUser = {}
#         for i in self.ratings:
#             # 评分最高为5 除以5 进行数据归一化
#             temp = (i[1], float(i[2]) / self.maxrating)
#             # 计算userDict {'1':[(1,5),(2,5)...],'2':[...]...}
#             if(i[0] in self.userDict):
#                 self.userDict[i[0]].append(temp)
#             else:
#                 self.userDict[i[0]] = [temp]
#             # 计算ItemUser {'1',[1,2,3..],...}
#             if(i[1] in self.ItemUser):
#                 self.ItemUser[i[1]].append(i[0])
#             else:
#                 self.ItemUser[i[1]] = [i[0]]
#
#     # 找到某用户的相邻用户
#     def getNearestNeighbor(self, userId):
#         neighbors = []
#         self.neighbors = []
#         # 获取userId评分的电影都有那些用户也评过分
#         for i in self.userDict[userId]:
#             for j in self.ItemUser[i[0]]:
#                 if(j != userId and j not in neighbors):
#                     neighbors.append(j)
#         # 计算这些用户与userId的相似度并排序
#         for i in neighbors:
#             dist = self.getCost(userId, i)
#             self.neighbors.append([dist, i])
#         # 排序默认是升序，reverse=True表示降序
#         self.neighbors.sort(reverse=True)
#         self.neighbors = self.neighbors[:self.k]
#
#     # 格式化userDict数据
#     def formatuserDict(self, userId, l):
#         user = {}
#         for i in self.userDict[userId]:
#             user[i[0]] = [i[1], 0]
#         for j in self.userDict[l]:
#             if(j[0] not in user):
#                 user[j[0]] = [0, j[1]]
#             else:
#                 user[j[0]][1] = j[1]
#         return user
#
#     # 计算余弦距离
#     def getCost(self, userId, l):
#         # 获取用户userId和l评分电影的并集
#         # {'电影ID'：[userId的评分，l的评分]} 没有评分为0
#         user = self.formatuserDict(userId, l)
#         x = 0.0
#         y = 0.0
#         z = 0.0
#         for k, v in user.items():
#             x += float(v[0]) * float(v[0])
#             y += float(v[1]) * float(v[1])
#             z += float(v[0]) * float(v[1])
#         if(z == 0.0):
#             return 0
#         return z / sqrt(x * y)
#
#     # 推荐的准确率
#     def getPrecision(self, userId):
#         user = [i[0] for i in self.userDict[userId]]
#         recommand = [i[1] for i in self.recommandList]
#         count = 0.0
#         if(len(user) >= len(recommand)):
#             for i in recommand:
#                 if(i in user):
#                     count += 1.0
#             self.cost = count / len(recommand)
#         else:
#             for i in user:
#                 if(i in recommand):
#                     count += 1.0
#             self.cost = count / len(user)


class CFThread(threading.Thread):
    def __init__(self, threadID, name, handleUs_i):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.handleUs_i = handleUs_i

    def run(self):
        print("in the default run function")


# 获取数据
def readFile(filename):
    # files = open(filename, "r", encoding="utf-8")
    # 如果读取不成功试一下
    files = open(filename, "r", encoding="iso-8859-15")
    data = []
    for line in files.readlines():
        item = line.strip().split("::")
        data.append(item)
    return data

def getRatings():
    data = []
    try:
        con = psycopg2.connect(host = "localhost", port="5432", database='oschina', user='postgres', password='111111')
        cur = con.cursor()
        cur.execute('SELECT uid,aid,times from "iTags"."reads" order by aid asc')
        curs = cur.fetchall()
        for item in curs:
            temp = item
            # print(temp)
            data.append([temp[0],temp[1],temp[2]])
        con.close()
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)

    return data

def getRatings():
    data = []
    try:
        con = psycopg2.connect(host="localhost", port="5432", database='oschina', user='postgres',
                               password='111111')
        cur = con.cursor()
        sql = 'SELECT uid,aid,times from "iTags"."reads"'
        cur.execute(sql)
        curs = cur.fetchall()
        for item in curs:
            temp = item
            # print(temp)
            data.append([temp[0], temp[1], temp[2]])
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)

    return data

# dic_new: is the new dict
# dic_old: is the previous dict
def changedUser(dic_new, dic_old):
    time1 = time.time()
    result = {}

    for uid in dic_new.keys():
        changedArticleSet = set()
        if uid not in dic_old:
            changedArticleSet = dic_new[uid]
        else:
            a_t_map_new = dic_new[uid]
            a_t_map = dic_old[uid]
            for aid in a_t_map_new.keys():
                if aid not in a_t_map or a_t_map_new[aid] != a_t_map[aid]:
                    changedArticleSet.add(aid)
        if len(changedArticleSet) > 0:
            result[uid] = changedArticleSet
    time2 = time.time()
    print("get changed user time: %.2f" % (time2 - time1))
    print("changed user number: %d" % (len(result)))
    return result

# get candidate users
# threshold is the minimum number of articles
def getCandidates(u_aSet_map, threshold):
    result = set()
    for user in u_aSet_map.keys():
        aSet = u_aSet_map[user]
        if len(aSet) < threshold:
            continue
        else:
            result.add(user)
    return result

def insertRelations(result):
    print("开始写入数据库")
    con = psycopg2.connect(host="localhost", port="5432", database='oschina', user='postgres', password='111111')
    cur = con.cursor()
    for (uid, value) in result.items():
        for (uid2, relation) in value.items():
            cur.execute('insert into "iTags"."user_relations" (uid, uid2, relation) values (%s, %s, %s)', (uid, uid2, relation))
        con.commit()
    cur.close()
    con.close()

#
# def getMovies():
#     data = []
#     try:
#         con = psycopg2.connect(host="localhost", port="5432", database='oschina', user='postgres', password='123456')
#         cur = con.cursor()
#         cur.execute('SELECT id,title from "iTags"."articles"')
#         curs = cur.fetchall()
#         for item in curs:
#             temp = item
#             # print(temp)
#             data.append([temp[0], temp[1]])
#     except psycopg2.DatabaseError as e:
#         print('Error %s' % e)

    # return data

# getRatings()

class myThread(threading.Thread):
    def __init__(self, threadID, name, handleUs_i):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.handleUs_i = handleUs_i

    def run(self):
        # global relation_result
        # global calculatedDenominatorMap

        global count
        global time0

        con = psycopg2.connect(host="localhost", port="5432", database='oschina', user='postgres', password='111111')
        cur = con.cursor()

        print("线程" + str(self.name) + "开始，处理用户，共" + str(len(self.handleUs_i)) + "个")
        for uid in self.handleUs_i:
            print("开始处理用户：" + str(uid))
            count += 1
            time1 = time.time()
            if count % 50 == 0:
                print("处理了%d个项目，耗时：%.2f" % (count, time1-time0))
            changedArticleSet = changed_user_dict[uid]
            time2 = time.time()
            # print("查看用户阅读情况是否有变化，时间为：%.2f" % (time2 - time1))

            if len(changedArticleSet) > 0:
                # print("user " + str(uid) + "'s article reading result has changed")
                # relation_result.setdefault(uid, {})
                time3 = time.time()
                aSet1 = u_aSet_map_new[uid]
                denominator1 = 0.0
                # if uid in calculatedDenominatorMap:
                #     denominator1 = calculatedDenominatorMap[uid]
                # else:
                for a in aSet1:
                    times1 = u_a_t_map_new[uid][a]
                    denominator1 += times1 * times1
                    # calculatedDenominatorMap[uid] = denominator1

                insert_sql_strings = [] # 一个用户提交一次
                update_sql_strings = []

                matchedUserSet = set()

                for aid in changedArticleSet:
                    time4 = time.time()
                    uSet = a_uSet_map_new[aid] # 不计算所有用户关系，只计算和热门用户之间的关系
                    matchedUserSet = matchedUserSet | uSet

                matchedUserSet = matchedUserSet & candidate_user_set
                print("match user number: %d" % (len(uSet)))
                for uid2 in matchedUserSet:
                    relation_result = 0.0
                    # relation_result.setdefault(uid2, {})
                    if uid == uid2:
                        # relation_result[uid][uid2] = 1.0
                        # relation_result[uid2][uid] = 1.0
                        relation_result = 1.0
                        continue

                    # 计算uid和uid2之间的差值
                    aSet2 = u_aSet_map_new[uid2]
                    sameAs = aSet1 & aSet2
                    # 计算numerator
                    numerator = 0.0
                    for a in sameAs:
                        times1 = u_a_t_map_new[uid][a]
                        times2 = u_a_t_map_new[uid2][a]
                        numerator += times1 * times2
                    # 计算denominator

                    denominator2 = 0.0
                    # if uid2 in calculatedDenominatorMap:
                    #     denominator2 = calculatedDenominatorMap[uid2]
                    # else:
                    for a in aSet2:
                        times2 = u_a_t_map_new[uid2][a]
                        denominator2 += times2 * times2
                        # calculatedDenominatorMap[uid2] = denominator2

                    # 计算相关结果
                    result = numerator / (math.sqrt(denominator1 * denominator2))
                    # relation_result[uid][uid2] = result
                    # relation_result[uid2][uid] = result
                    relation_result = result

                    # 判断结果是否存在
                    cur.execute('select relation from "iTags"."user_relations" where uid=%s and uid2=%s', (uid, uid2))
                    temps = cur.fetchall()
                    if len(temps) == 0:
                        insert_item = [uid, uid2, relation_result]
                        insert_sql_strings.append(insert_item)
                    else:
                        if relation_result != temps[0][0]:
                            update_item = [relation_result, uid, uid2]
                            update_sql_strings.append(update_item)
                if len(insert_sql_strings) > 0:
                    time8 = time.time()
                    sql = 'insert into "iTags"."user_relations" (uid, uid2, relation) values (%s, %s, %s)'
                    cur.executemany(sql, insert_sql_strings)
                    con.commit()
                    time9 = time.time()
                    # print("一批(%d个)数据库insert操作时间为：%.2f" % (len(insert_sql_strings), time9 - time8))
                if len(update_sql_strings) > 0:
                    time10 = time.time()
                    sql = 'update "iTags"."user_relations" set relation=%s where uid=%s and uid2=%s'
                    cur.executemany(sql, update_sql_strings)
                    con.commit()
                    time11 = time.time()
                    # print("一批(%d个)数据库update操作时间为：%.2f" % (len(insert_sql_strings), time11 - time10))

        print('线程' + self.name + '结束！')
        con.close()
# -------------------------开始-------------------------------

# relation_result = {}
u_a_t_map = {}
u_aSet_map = {}
changed_user_dict = {}
candidate_user_set = set()

count_cycle = 0
count = 0
time0 = time.time()
while True:
    print("a new cycle")
    count_cycle += 1
    start_time = time.time()
    u_a_t_map_new = {}
    u_at_map_new = {}
    u_aSet_map_new = {}
    a_uSet_map_new = {}
    # start_index = (count_cycle - 1) * 18901230
    # end_index = start_index + 18901230
    ratings = getRatings()

    for rating in ratings:
        uid = rating[0]
        aid = rating[1]
        times = rating[2]
        u_a_t_map_new.setdefault(uid, {})
        u_a_t_map_new[uid][aid] = times

        u_aSet_map_new.setdefault(uid, set())
        u_aSet_map_new[uid].add(aid)

        a_uSet_map_new.setdefault(aid, set())
        a_uSet_map_new[aid].add(uid)

    # 找到所有变化的用户
    changed_user_dict = changedUser(u_a_t_map_new, u_a_t_map)

    # 找到所有待计算关联用户
    candidate_user_set = getCandidates(u_a_t_map_new, 100)
    print("number of candidates: %d" % len(candidate_user_set))

    if len(changed_user_dict) <= 0:
        print("没有变更用户，程序休眠1000s...")
        time.sleep(1000)
        continue # 表示没有变化的用户

    # 线程中计算结果
    print('这是主线程：', threading.current_thread().name)
    threadNum = 8
    averageNum = math.ceil(len(changed_user_dict) / threadNum) # 平均一个线程多少个
    thread_list = []
    changed_user_list = list(changed_user_dict.keys())
    for i in range(threadNum):
        start_num = i * averageNum
        end_num = start_num + averageNum
        handleUs_i = changed_user_list[start_num:end_num]
        thread = myThread(i, str(i), handleUs_i)
        thread.setDaemon(True)
        thread.start()
        thread_list.append(thread)

    # 等待所有线程完成
    for t in thread_list:
        t.join()

    u_a_t_map = u_a_t_map_new
    u_aSet_map = u_aSet_map_new

    print('主线程结束！', threading.current_thread().name)
    print('一共用时：', time.time() - start_time)

    # 写入数据库
    # insertRelations(relation_result)

    # print("cycle " + str(count_cycle) + " uses time(s): " + str(time.time() - start_time))
    time.sleep(3) # sleep 3 seconds
    break

# start = time.clock()
# movies = readFile("dataset/movies.dat")
# ratings = readFile("dataset/ratings.dat")
# ratings = getRatings()
# demo = CF(movies, ratings, 1309, k=20)
# demo.recommendByUser(1784225)
# print("推荐列表为：")
# end = time.clock()
# print("the time for reading reads table is: " + str((end - start)))


# demo.showTable()
# print("处理的数据为%d条" % (len(demo.ratings)))
# print("准确率： %.2f %%" % (demo.cost * 100))
# end = time.clock()
# print("耗费时间： %f s" % (end - start))