2023年6月29日发布
"""Crawler for an exam-bank site: fetch a paper's answer sheet, then each question.

The paper endpoint (GetPaperAnswerSheetNew_v2) returns only questionIds grouped
by section ("单选题" / "多选题" ...); each question's full content must be fetched
separately by id.  The open questions from the original author are kept as TODOs:
how to merge the per-question JSON back into the paper JSON (next to each
questionId) before storing it in MongoDB.
"""
import requests
import pymongo

# Shared request headers: the site expects this mobile-app User-Agent.
USER_AGENT = 'SCeBook/5.4.3 (iPhone; iOS 12.1.4; Scale/2.00)'
HEADERS = {'User-Agent': USER_AGENT}


def paperID_spider():
    """Fetch the paper's answer sheet JSON, save it, then crawl its questions.

    Overall logic: the paperID endpoint yields only questionIds, so after
    saving this document we build one URL per questionId to fetch the full
    question content (see questionID_spider).
    """
    paperID_url = ('/TK/?method=GetPaperAnswerSheetNew_v2'
                   '&paperID=a823eddf-d157-4566-be06-b32fe4ee59ac')
    try:
        paperID_res = requests.get(paperID_url, headers=HEADERS, timeout=5)
    except requests.RequestException as exc:
        # TODO: on failure — retry, or log the failed URL for a later re-crawl.
        print('paperID_url request failed: %s (%s)' % (paperID_url, exc))
        return

    paperID_json = paperID_res.json()
    paperID_result_code = paperID_json['result']
    if paperID_result_code == 1:
        # TODO: does the success code itself need storing, or only the data?
        try:
            # TODO: save the paper first and crawl questions after (current
            # approach), or merge question content into it before saving?
            save_mongo_paperID_json(paperID_json)
        except pymongo.errors.PyMongoError as exc:
            # TODO: decide how to handle storage failures (retry / queue).
            print('saving paperID_json failed: %s (%s)' % (paperID_url, exc))
        # TODO: passing the whole paperID_json vs. only data.nodeList —
        # memory difference is negligible here; the dict is passed by reference.
        questionID_spider(paperID_json)
    else:
        # Non-success result code from the API.
        # TODO: decide what to do for non-1 result codes.
        print(paperID_json['result'])


def questionID_spider(paperID_json):
    """Fetch the full content of every questionId referenced by the paper.

    Walks data.nodeList -> questionLists, builds one URL per questionId and
    requests it.  TODO: merge each question's data back into the paper JSON,
    parallel to its questionId entry, or store it separately in MongoDB.
    """
    nodeLists = paperID_json['data']['nodeList']
    for nodeList in nodeLists:
        for questionList in nodeList['questionLists']:
            questionID = questionList['questionId']
            print(questionID)
            questionID_url = '/TK/?id=%s&method=Questions' % questionID
            print(questionID_url)
            try:
                questionID_res = requests.get(questionID_url,
                                              headers=HEADERS, timeout=5)
            except requests.RequestException as exc:
                print('question request failed: %s (%s)' % (questionID_url, exc))
                continue

            questionID_json = questionID_res.json()
            # NOTE(review): this endpoint appears to report its result code
            # under 'status' (the paper endpoint uses 'result') — confirm.
            questionID_result_code = questionID_json['status']
            # TODO: 1. splice this data into paperID_json next to questionId,
            # TODO: 2. or save it to MongoDB as its own document.
            print(questionID_json['data']['QuestionContent'])


def save_mongo_paperID_json(paperID_json):
    """Insert the paper JSON into the local MongoDB collection tiku.tiku."""
    client = pymongo.MongoClient()
    db = client['tiku']
    collection = db['tiku']
    # TODO: combine the two JSON parts (paper + question content) before insert.
    collection.insert_one(paperID_json)


if __name__ == '__main__':
    eBook = paperID_spider()
发布者:admin,转转请注明出处:http://www.yc00.com/news/1687985120a63860.html
评论列表(0条)