2023年6月27日发(作者:)
关于Python爬取⽹页返回521状况码的解决⽅案⽂章⽬录# 项⽬场景: Python3.8问题描述:在使⽤Python爬⾍爬取⽹页的列表页中的详情页时,返回的详情页的html⽂件的数据长度有限。原因分析:频繁爬取⽬标⽹站,导致的⽹址反爬⾍措施解决⽅案:⽅法⼀:换⼀个vpn,也就是换⼀台电脑执⾏程序⽅法⼆:复制⽬标⽹页的Headers添加到代码中根据⽬标情况不同修改17181926def askURL(url): head = { #
模拟浏览器头部信息,向⾖瓣服务器发送消息 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': '', 'Upgrade-Insecure-Requests': '1', } #
⽤户代理,表⽰告诉⾖瓣服务器,我们是什么类型的机器、浏览器(本质上是告诉浏览器,我们可以接收什么⽔平的⽂件内容) request = t(url, headers=head) html = "" try: response = n(request) html = ().decode("utf-8") except or as e: if hasattr(e, "code"): print() if hasattr(e, "reason"): print() return html} 'Cookie': 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"";s:2:"ft";s:19:"2022-01-10 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.107⽅法三:两次访问⽬标详情页代码⼀829import execjsimport requestsimport rehead = { #
模拟浏览器头部信息,向⾖瓣服务器发送消息 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': '', 'Upgrade-Insecure-Requests': '1',}url = '/poi/'# response = (url)# # cookie1# cookie1 = s# # js代码# js_code = 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/ get_521_content(url,head): req = (url, headers=head) cookies = s2936373839464748495657585966676869767778798687 cookies = '; '.join(['='.join(item) for item in ()]) txt_521 = txt_521 = ''.join(l('', txt_521)) return (txt_521, cookies)def fixed_fun(function): func_return = e('eval', 'return') content = e(func_return) req = (url, headers=head) evaled_func = ''.join(l('', )) # print(js_con) # fn = js_('=').split(' ') # evaled_func = (fn) # print(evaled_func) mode_func = evaled_e('while(window._phantom||window.__phantomas){};', ''). replace('=', 'return').replace(';if((function(){try{return !!ntListener;}', ''). replace("catch(e){return false;}})()){ntListener('DOMContentLoaded',l,false);}", ''). replace("else{Event('onreadystatechange',l);}", '').replace( r"setTimeout('=e(/[?|&]captcha-challenge/,'')',1500);", '') content = e(mode_func) cookies = ('l') __jsl_clearance = (';')[0] return __jsl_clearancedef cookie_dict(js, id): dict = {} js = ('=') id = ('=') dict[js[0]] = js[1] dict[id[0]] = id[1] return dictif __name__ == '__main__': func = get_521_content(url,head) content = func[0] cookie_id = func[1] cookie_js = fixed_fun(func[0]) dicted_cookie = cookie_dict(cookie_js, cookie_id) head = { #
模拟浏览器头部信息,向⾖瓣服务器发送消息 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': '', 'Upgrade-Insecure-Requests': '1', 'Cookie': cookie_id + ';' + cookie_js } req = (url, headers=head) print(_code) 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.107代码⼆1# resouce:/qq_41879417/article/details/101701120?spm=1001.2101.3001.6661.1&utm_medium=_relevant_82936373839464748495657585966# resouce:/qq_41879417/article/details/101701120?spm=1001.2101.3001.6661.1&utm_medium=_relevant_# -*- coding: utf-8 -*-# @Time : 2022/1/16 9:11# @Author : sherlock# @File : creeper_2_# @Project : creeperimport execjsimport reimport requestsurl = '/poi/'head = { #
模拟浏览器头部信息,向⾖瓣服务器发送消息 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': '', 'Upgrade-Insecure-Requests': '1',} 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/ get_521_content(url): req = (url, headers=head, timeout=5) print(_code, ) if _code == 521: cookies = dict(()) print(cookies) js_con = ''.join(l('', )) if js_con: __jsl_clearance = fixed_fun(js_con, url) if __jsl_clearance: key, value = __jsl_('=') cookies[key] = value return cookies#
# 执行 js 代码，获取 cookies 中 __jsl_clearance 的键值
第⼀次请求获取的js内容 func_return = js_e('eval(', 'return(') print('第⼀次替换eval==》return后: ', func_return) content = e(func_return) # fn = js_('=')[0].split(' ')[1] #
只有[''] fn = js_('=')[0].split(' ')[1] evaled_func = (fn) print('第⼀次执⾏js代码后: ', evaled_func) fn = evaled_('=')[0].split(' ')[1] #
获取动态函数名 aa = evaled_("") #
获取标签的内容 aa = aa[1].split("")[0] if len(aa) >= 2 else '' mode_func = evaled_func. replace( "setTimeout('=me+e(/[?|&]captcha-challenge/,'')',1500);=", 'return'). replace(';if((function(){try{return !!ntListener;}', ''). replace( "}catch(e){return false;}})()){ntListener('DOMContentLoaded'," + fn + ",false)}else{Event('onreadystatechange'," ''). replace(66676869767778798687888996979899100101 replace( ''). replace("return'__jsl_clearance", "var window={};return '__jsl_clearance"). replace( "var " + fn + "=Element('div');" + fn + ".innerHTML='" + aa + "';" + fn + "=" + fn + ".", "var " + fn + "='" + url + "'") print('第⼆次替换后的js代码:', mode_func) try: content = e(mode_func) cookies = (fn) __jsl_clearance = (';')[0] print(__jsl_clearance) return __jsl_clearance except: print('js执⾏错误:', mode_func) return None "if((function(){try{return !!ntListener;}catch(e){return false;}})()){ntListener('DOMContentLoaded'," + fn + ",false)}else#
携带解密后的cookies第⼆次爬取详情页def con_spider(cookies, url): response = (url, headers=head, cookies=cookies, timeout=5) if _code == 200: ng = 'utf-8' print(_code) print() return response else: print('第⼆次爬取错误状态码:', _code) return Noneif __name__ == "__main__": cookies = get_521_content(url) con_spider(cookies, url)代码三22324# resource:/gongs/p/port execjsimport reimport requestsurl = '/poi/'head = { #
模拟浏览器头部信息,向⾖瓣服务器发送消息 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Host": "", "Upgrade-Insecure-Requests": "1",} "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.94647484956575859666768697677787986878889def getResponse(): """ 获取response :return: """ response = (url, headers=head) return responsedef getJslid(response): """ :param response: :return: """ cook = s ans = '; '.join(['='.join(item) for item in ()]) return ansdef getClearance(response): """ :return: """ txt = ''.join(l('', )) func_return = e('eval', 'return') content = e(func_return) print("accurate error") # error eval_func = ('x') print(1) name = l(r'var (.*?)=function.*', eval_func)[0] print(2) mode_func = eval_e('while(window._phantom||window.__phantomas){};', ''). replace('=', 'return').replace('if((function(){try{return !!ntListener;}', ''). replace("catch(e){return false;}})()){ntListener('DOMContentLoaded',%s,false)}" % name, ''). replace("else{Event('onreadystatechange',%s)}" % name, '').replace( r"setTimeout('=me+e(/[?|&]captcha-challenge/,'')',1500);", '')
content = e(mode_func) cookies = (name) # print(cookies) clearance = (';')[0] return clearancedef structurehead(cook, clearance): """ 构造新的head :return: """ cookie = { 'cookie': cook + ';' + clearance } return dict(head, **cookie)8996979899104def main(): response = getResponse() cook = getJslid(response) print("error") # this step has some error about exejcss clearance = getClearance(response) print("2 error") dict = structurehead(cook, clearance) print(dict)if __name__ == '__main__': main()代码四829363738394# -*- coding: utf-8 -*-# @Time : 2022/1/18 13:32# @Author : sherlock# @File : creeper_4_# @Project : creeper# coding=utf-8# author=zhangjingyuan# python3from import HTMLParserimport lxmlimport requestsfrom lxml import etreeimport timport mport reimport timeimport ioimport gzipimport randomimport codecsimport execjsimport requestsimport reurl1 = '/poi/'url2 = '/haikou/'url3 = '/subject/1292052/'head = { #
模拟浏览器头部信息,向⾖瓣服务器发送消息 # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", # "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", # "Cache-Control": "max-age=0", # "Connection": "keep-alive", # "Host": "", # "Upgrade-Insecure-Requests": "1",} # "Cookie": 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"";s:2:"ft";s:19:"2022-01-10 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072def getResponse(): """4647484956575859666768697677787986878889969798991 获取response :return: """ response = (url1, headers=head) return responsedef getJslid(response): """ :param response: :return: """ cook = s return '; '.join(['='.join(item) for item in ()])def getClearance(response): """ :return: """ txt = ''.join(l('', )) func_return = e('eval', 'return') print(func_return) content = e(func_return) print(type(content)) # content = open("jsdom_document").read() # print(content) # execjs._mError: ReferenceError: document is not defined eval_func = ('x') name = l(r'var (.*?)=function.*', eval_func)[0] mode_func = eval_e('while(window._phantom||window.__phantomas){};', ''). replace('=', 'return').replace('if((function(){try{return !!ntListener;}', ''). replace("catch(e){return false;}})()){ntListener('DOMContentLoaded',%s,false)}" % name, ''). replace("else{Event('onreadystatechange',%s)}" % name, '').replace( r"setTimeout('=me+e(/[?|&]captcha-challenge/,'')',1500);", '') content = e(mode_func) cookies = (name) # print(cookies) clearance = (';')[0] return clearancedef structureCookie(cook, clearance): """ 构造新的headers :return: """ cookie = cook + ';' + clearance print(cookie) return cookieif __name__ == '__main__': response = getResponse() clearance = getClearance(response) cook = getJslid(response)4864135136 head = { #
模拟浏览器头部信息,向⾖瓣服务器发送消息 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': '', 'Cookie': cook, 'Upgrade-Insecure-Requests': '1', } 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.107 request = t(url2, headers=head) html = "" try: response = n(request) html = ().decode(encoding="utf-8", errors="ignore") print(html) except or as e: if hasattr(e, "code"): print("状态码:%s" % ()) if hasattr(e, "reason"): print("原因:%s" % ())代码五8293# -*- coding: utf-8 -*-# @Time : 2022/1/18 17:43# @Author : sherlock# @File : creeper_5_# @Project : creeper# -*- coding:utf-8 -*-import requestsfrom bs4 import BeautifulSoupimport redisfrom selenium import webdriverfrom s import Optionsimport smtplibimport rom import MIMETextimport timeurl = ''def driver_chrome(): chrome_options = Options() chrome__argument("--headless") chrome__argument('--no-sandbox') chrome__argument("--disable-gpu") chrome__argument("--window-size=1920x1080") driver = (chrome_options=chrome_options) return driverdef mymail(content): msg = MIMEText(content, _subtype='plain', _charset='utf8') msg['From'] = addr(('Author', '989989797@'))353637383946474849565758596061626364 msg['From'] = addr(('Author', '989989797@')) msg['To'] = addr(('Recipient', '8979879879@')) msg['date'] = me('%a, %d %b %Y %H:%M:%S %z') msg['Subject'] = 'Your ip address' return msgr = (host='localhost', port=6379, decode_responses=True)myip = ('myip')driver = driver_chrome()(url)cookies = _cookies()new_cookies = {}for i in cookies: _cookie({'name': ('name'), 'value': ('value')})(url)soup = BeautifulSoup(_source, features='lxml')myres = _all('div', attrs={'class': 'yourInfo'})trueip = myres[0].find_all('a')[0].textmsg = mymail(trueip)with _SSL('', 465) 
as server: ('80988988@', '9jsdfhjhfio') if myip != trueip: ('myip', trueip) il('98198397@', '9879878798@', _string())()()Test代码82930# coding=utf-8# author=zhangjingyuan# python3from import HTMLParserimport lxmlimport requestsfrom lxml import etreeimport timport mport reimport timeimport ioimport gzipimport randomimport codecsurl1 = '/poi/'url2 = '/haikou/'url3 ='/subject/1292052/'url4 = '/search/?q=%E6%B3%89%E5%B7%9E'head = { #
模拟浏览器头部信息,向⾖瓣服务器发送消息 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Cookie": 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"";s:2:"ft";s:19:"2022-01-10+363738394647484956575859606162 "Cookie": 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"";s:2:"ft";s:19:"2022-01-10+ "Host": "", "Upgrade-Insecure-Requests": "1",}# #
输出访问⽹页的状态码# req = (url, headers=head).status_code# print(req)request = t(url1, headers=head)html = ""try: response = n(request) html = ().decode(encoding="utf-8", errors="ignore") print(html)except or as e: if hasattr(e, "code"): print("状态码:%s"%()) if hasattr(e, "reason"): print("原因:%s"%())# response = (url1)# print(response)# # cookie1# cookie1 = s# print(cookie1)# # js代码# js_code = # print(js_code) "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072
发布者:admin,转载请注明出处:http://www.yc00.com/web/1687865921a52063.html
评论列表(0条)