github_crawler/consumer.py
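# Queue consumer for the GitHub crawler: pops "<id>-<url>" tasks from the
# "repo_urls" Redis list, downloads each page through the GitHub API using a
# rotating token pool, stores the HTML in MySQL, and records any failures.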

import redis
import urllib2
import logging
import MySQLdb
import time
import sys
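
# Redis holds both the task queue ("repo_urls") and the shared token pool ("tokens").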
r = redis.Redis(host="192.168.80.55",port=6379,db=0)
logger = logging.getLogger()
hdlr = logging.FileHandler("consumer-%s.log"%sys.argv[1])
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.NOTSET)
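
# Request headers for the GitHub API; the Authorization field is filled in per request.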
send_headers = {"Content-Type":"application/json","Authorization":""}
conn = MySQLdb.connect(host="192.168.80.104",user="influx",passwd="influx1234",db="pages",charset='utf8' )
cursor = conn.cursor()
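
# Prepared statements: store a downloaded page, mark its URL as done, and log an error.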
sql_add_repo = "insert into github_html_detail(url,detail,time) values(%s,%s,%s)"
sql_update_state = "update repo_urls set state=1 where id=%s"
sql_error = "insert into errors(url,message,time) values(%s,%s,%s)"

def start():
    logger.info("consumer begins to work")
    task = r.blpop("repo_urls")[1]
    while True:
        logger.info("task : %s", task)
        # Each task has the form "<id>-<url>"; split on the first dash.
        id_to_set_state = task[0:task.index("-")]
        url_download = task[task.index("-") + 1:]
        logger.info(">>id:%s - url:%s" % (id_to_set_state, url_download))
        # Rotate tokens: pop one from the pool and push it straight back.
        token = r.blpop("tokens")[1]
        r.rpush("tokens", token)
        logger.info(">>token:%s will be used" % token)
        send_headers["Authorization"] = "token %s" % token
        req = urllib2.Request(url_download, headers=send_headers)
        try:
            response = urllib2.urlopen(req, timeout=20)
            repo = response.read()
            logger.info(">>done downloading")
            cursor.execute(sql_add_repo, (url_download, repo, time.strftime('%Y-%m-%d %H:%M:%S')))
            cursor.execute(sql_update_state, (id_to_set_state,))
            conn.commit()
            logger.info(">>done storing")
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                logger.error("Failed to reach the server")
                logger.error("The reason: %s" % e.reason)
                cursor.execute(sql_error, (url_download, str(e.reason), time.strftime('%Y-%m-%d %H:%M:%S')))
            elif hasattr(e, "code"):
                logger.error("The server couldn't fulfill the request")
                logger.error("Error code: %s" % str(e.code))
                # Read the response body once; it can only be consumed a single time.
                body = e.read()
                logger.error("Return content: %s" % body)
                cursor.execute(sql_error, (url_download, body, time.strftime('%Y-%m-%d %H:%M:%S')))
            else:
                logger.error(e)
            conn.commit()
        except Exception as e:
            logger.error(e)
            cursor.execute(sql_error, (url_download, str(e), time.strftime('%Y-%m-%d %H:%M:%S')))
            conn.commit()
        # Block until the next task arrives.
        task = r.blpop("repo_urls")[1]


if __name__ == "__main__":
    start()