ADD file via upload

This commit is contained in:
zjlululululu 2024-03-07 13:19:42 +08:00
parent 0b6c02c121
commit 47627e5444
1 changed files with 68 additions and 0 deletions

68
src/Data_Crawling.py Normal file
View File

@ -0,0 +1,68 @@
import requests
import openpyxl
import time
def search_repo(q, page):
    """Request one page of results from the GitHub repository-search API.

    Results are requested sorted by star count, descending, 100 per page.

    Authentication is optional: when the ``GITHUB_TOKEN`` environment
    variable is set, it is sent as the ``Authorization`` header. Credentials
    must never be committed to the source file.

    Args:
        q: Search query string, e.g. ``'created:2021-01-01..2021-12-31'``.
        page: Page number, forwarded unchanged as the ``page`` parameter.

    Returns:
        The ``requests.Response`` object, regardless of its status code.
        Callers are responsible for checking the status / decoded payload.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    import os  # local import: only needed for the token lookup below

    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Accept': 'application/json',
    }
    token = os.environ.get('GITHUB_TOKEN')
    if token:
        headers['Authorization'] = 'token ' + token

    url = 'https://api.github.com/search/repositories'
    # Passing the query through ``params`` lets requests URL-encode it.
    # ``stars`` is the documented sort key for ordering by star count.
    params = {
        'q': q,
        'sort': 'stars',
        'order': 'desc',
        'per_page': 100,
        'page': page,
    }
    # A timeout prevents a stalled connection from blocking forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    if response.status_code == 200:
        print(response.status_code, "响应成功!")
    return response
# Fetch the first 1000 repositories created in each year and export them to
# one Excel workbook per year.
MAX_PAGES = 10  # 10 pages x 100 results = the 1000 results the script targets
MAX_ATTEMPTS = 8  # upper bound on tries per page so a permanent error cannot loop forever
RETRY_DELAY_SECONDS = 2  # base pause between retries after a failed request


def _fetch_page(year, page):
    """Return the ``items`` list for one result page, or None if every attempt fails."""
    query = 'created:{0}-01-01..{0}-12-31'.format(year)
    for attempt in range(1, MAX_ATTEMPTS + 1):
        response_dict = None
        try:
            response_dict = search_repo(query, page).json()
            return response_dict['items']
        except (KeyError, ValueError, requests.RequestException) as err:
            # KeyError: the payload has no 'items' key (e.g. an error message).
            # ValueError: the body was not valid JSON.
            # RequestException: the request itself failed.
            print('{0}页获取失败'.format(page))
            print(response_dict if response_dict is not None else err)
            if attempt < MAX_ATTEMPTS:
                # Linear backoff: wait longer after each consecutive failure.
                time.sleep(RETRY_DELAY_SECONDS * attempt)
    print('{0}页重试{1}次后仍失败,已跳过'.format(page, MAX_ATTEMPTS))
    return None


def _license_name(repo_dict):
    """Return the repository's license name, or None when no license is set."""
    license_info = repo_dict['license']
    return license_info['name'] if license_info is not None else None


def _export_year(year, pages):
    """Write every repository in ``pages`` (a list of item lists) to an .xlsx file."""
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(['名称', '作者', '创建时间', '更新时间', 'push时间', 'Stars', '语言', '网址', '标签', '开源许可证'])
    for items in pages:
        for repo_dict in items:
            ws.append([
                repo_dict['name'],
                repo_dict['owner']['login'],
                repo_dict['created_at'],
                repo_dict['updated_at'],
                repo_dict['pushed_at'],
                repo_dict['stargazers_count'],
                repo_dict['language'],
                repo_dict['html_url'],
                # Tolerate a missing or null topics field.
                ','.join(repo_dict.get('topics') or []),
                _license_name(repo_dict),
            ])
    wb.save("{0}GitHub数据.xlsx".format(year))


time_list = [2021, 2022, 2023]
for year in time_list:
    print('-------------{0}开始爬取-------------'.format(year))
    repo_list = []
    # GitHub page numbers start at 1, so request pages 1..MAX_PAGES.
    for page in range(1, MAX_PAGES + 1):
        items = _fetch_page(year, page)
        if items is not None:
            repo_list.append(items)
        print("已获取页数:", len(repo_list))
    print('-------------爬取结束-------------')
    # Count what was actually returned; a page may hold fewer than 100 items.
    print("获取库数:", sum(len(items) for items in repo_list))
    _export_year(year, repo_list)
    print("-----------------Done-----------------")
print("-----------------Finish-----------------")