"""Crawl the most-starred GitHub repositories created in selected years.

For every year in ``YEARS`` the script queries the GitHub repository search
API for repositories created in that year, ordered by star count, and writes
the results to an Excel workbook named ``<year>GitHub数据.xlsx`` in the
current working directory.

Authentication: set the ``GITHUB_TOKEN`` environment variable to a personal
access token. Without it the requests are sent anonymously, which works but
is subject to a lower search rate limit.
"""
import os
import time

import openpyxl
import requests

SEARCH_URL = 'https://api.github.com/search/repositories'
PER_PAGE = 100        # maximum page size accepted by the search API
PAGES_PER_YEAR = 10   # 10 pages x 100 items = the first 1000 results
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever
RETRY_DELAY = 2       # seconds between requests, to stay under the rate limit
MAX_ATTEMPTS = 5      # attempts per page before the page is skipped
YEARS = (2021, 2022, 2023)

SHEET_HEADER = ['名称', '作者', '创建时间', '更新时间', 'push时间',
                'Stars', '语言', '网址', '标签', '开源许可证']


def search_repo(q, page):
    """Fetch one page of repository search results, sorted by stars.

    Args:
        q: Search query string, e.g. ``created:2021-01-01..2021-12-31``.
        page: 1-based page number of the result set.

    Returns:
        The ``requests.Response`` when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a transport-level error).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'application/vnd.github+json',
    }
    # Credentials come from the environment so they never land in version
    # control; the header is simply omitted for anonymous access.
    token = os.environ.get('GITHUB_TOKEN')
    if token:
        headers['Authorization'] = 'token ' + token

    # Passing q through params lets requests URL-encode the query.
    params = {
        'q': q,
        'sort': 'stars',
        'order': 'desc',
        'per_page': PER_PAGE,
        'page': page,
    }
    try:
        response = requests.get(SEARCH_URL, headers=headers, params=params,
                                timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('请求异常:', exc)
        return None

    if response.status_code == 200:
        print(response.status_code, "响应成功!")
        return response
    print(response.status_code, "响应失败!")
    return None


def _fetch_page(query, page):
    """Return the ``items`` list of one result page, or None if every attempt fails."""
    for attempt in range(1, MAX_ATTEMPTS + 1):
        response = search_repo(query, page)
        if response is not None:
            try:
                return response.json()['items']
            except (ValueError, KeyError):
                # Body was not JSON or lacked 'items' (e.g. an error payload).
                print(response.text)
        print('第{0}页获取失败'.format(page))
        # Wait longer after each failure so a rate-limit window can reset.
        time.sleep(RETRY_DELAY * attempt)
    return None


def _crawl_year(year):
    """Collect the repository dicts for all result pages of one year."""
    print('-------------{0}开始爬取-------------'.format(year))
    query = 'created:{0}-01-01..{0}-12-31'.format(year)
    repos = []
    pages_fetched = 0
    # GitHub pagination is 1-based.
    for page in range(1, PAGES_PER_YEAR + 1):
        items = _fetch_page(query, page)
        if items is not None:
            repos.extend(items)
            pages_fetched += 1
        time.sleep(RETRY_DELAY)  # pause between pages to avoid hammering the API
        print("已获取页数:", pages_fetched)
    print('-------------爬取结束-------------')
    print("获取库数:", len(repos))
    return repos


def _repo_row(repo):
    """Build one worksheet row from a repository dict of the search response."""
    license_info = repo.get('license')
    license_name = license_info['name'] if license_info is not None else None
    return [
        repo['name'],
        repo['owner']['login'],
        repo['created_at'],
        repo['updated_at'],
        repo['pushed_at'],
        repo['stargazers_count'],
        repo['language'],
        repo['html_url'],
        ','.join(repo.get('topics') or []),
        license_name,
    ]


def _export_year(year, repos):
    """Write the repositories of one year to ``<year>GitHub数据.xlsx``."""
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(SHEET_HEADER)
    for repo in repos:
        ws.append(_repo_row(repo))
    wb.save("{0}GitHub数据.xlsx".format(year))
    print("-----------------Done-----------------")


def main():
    """Crawl and export every configured year."""
    for year in YEARS:
        _export_year(year, _crawl_year(year))
    print("-----------------Finish-----------------")


if __name__ == '__main__':
    main()