2023年6月27日发(作者:)
爬取猫眼电影榜单TOP100⼀、设计⽅案1.主题式⽹络爬⾍名称:爬取猫眼电影TOP1002.爬取内容与数据特征分析:爬取猫眼电影TOP100榜单电影评分与出版年份..3.设计⽅案概述、思路:⾸先打开⽬标⽹站,进⾏⽬标站点分析 打开猫眼电影 点击榜单 TOP100 每⼀页10个电影,通过URL offset参数改变电影的展⽰,然后进⾏⽹页代码分析 审查源代码,由dd标签包围,抓取单页内容,利⽤request请求⽬标站点,得到单个⽹页HTML代码,返回结果根据HTML代码分析得到电影的名称,主演,上映时间,评分,图⽚链接等信息、保存⽂件、开启循环及多线程....难点:html源码过于杂乱,难以提取数据,数据实时更新,会导致部分上传的数据偏差⼆、主题页⾯的结构特征分析1.主题页⾯的结构与特征分析:通过分析页⾯得知所要获取的数据分布于dd标签中,p为发⾏时间标签,i为影⽚评分标签。页⾯解析:3.节点(标签)查找⽅法与遍历⽅法:通过re模块的findall⽅法进⾏查找。三、⽹络爬⾍程序设计1.数据的爬取与采集import requestsimport bs4import pandas as pddef get_one_page(url):# 设置头⽂件信息#伪装爬⾍headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132Safari/537.36' ,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9','Accept-Encoding': 'gzip, deflate','Accept-Language': 'zh-CN,zh;q=0.9','Connection': 'keep-alive','Cookie': '__mta=213236267.90.55.46.4; uuid_n_v=v1;uuid=CD3E8DF084B411EA92CE8B62796C19887B6E5F257E434EBFA5AFAA7C50986BB1;_csrf=d046dea14a42fa27a03f55d5df7be9fc2669e4f8e3f4e33ea42418cf07c04703;Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1587572266; _lxsdk_cuid=171a2ae53b0c8-0824067bcb9761-366b420b-fa000-171a2ae53b1c8; _lxsdk=CD3E8DF084B411EA92CE8B62796C19887B6E5F257E434EBFA5AFAA7C50986BB1; mojo-uuid=0f40d6f281ba65600ee90a8fb913b7ac; t_lxid=171a2ae579fc8-0428f77e331d8b-366b420b-fa000-171a2ae579fc8-tid;Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1587608996; _lxsdk_s=171a580f996-9c1-c73-c3f%7C%7C1','Host': '','Upgrade-Insecure-Requests': '1'}try:response=(url,headers=headers)if _code==200:ng = nt_encodingreturn cept :return None#解析⽹页def get_soup(htm):soup = fulSoup(htm,'')return soup#找到电影名def find_name(soup):x=_all('p',class_="name")n=[]for i in x:()return n#找到电影上映时间def get_time(soup):x=_all('p',class_="releasetime")n=[]for i in x:()return n#找到电影评分def get_score(soup):x=_all('p',class_="score")n=[]for i in x:()return ndef main():url='/board/4?'html = get_one_page(url)#print(html)soup = get_soup(html)#print(soup)#电影名name = find_name(soup)'''for i in name:print(i)'''#上映时间time = get_time(soup)'''for i in time:print(i)'''#评分score = get_score(soup)'''for i in score:print(i)'''#保存excel⽂件df = ame({'电影名':name,'上映时间':time,'评分':score})_excel('猫眼电影.xlsx')if __name__=='__main__':main()抓取的单页内容:2.对数据进⾏清洗和处理#读取csv⽂件df = ame(_excel('猫眼电影.xlsx'))()#缺失值处理().head() #True为缺失值,False为存在值 #空值处理#().sum() #0表⽰⽆空值#查找重复值ated() #显⽰表⽰已经删除重复值#查看统计信息be()4.数据分析与可视化条形图#绘制条形图df = _excel('猫眼电影.xlsx')x = df['电影名'][:5]y = df['评分'][:5]('电影名')('评分')(x,y)("电影名与评分条形图")()#绘制折线图df = _excel('猫眼电影.xlsx')x = df['电影名'][:10]y = 
df['评分'][:10]('电影名')('评分')(x,y,color="red",label="折线")("猫眼电影电影名评分折线图")()()散点图df = ame(_excel('猫眼电影.xlsx'))x = df['上映时间'][:1]y = df['评分'][:10](x="上映时间",y= "评分",data=df)
p0=[0,0,0]Para=leastsq(error_func,p0,args=(q,w))a,b,c=Para[0]
(figsize=(6,3))
r(q,w,color="grenn",label=u"最⾼评分散点",linewidth=2)x=ce(0,20,15)
y=a*x*x+b*x+(x,y,color="green",label=u"回归⽅程曲线",linewidth=2)
("电影名")[:10]("最⾼评分")[:10]("猫眼电影回归曲线图")()
()代码汇总:import requestsimport bs4import pandas as pdimport seaborn as snsimport numpy as npfrom numpy import genfromtxtimport scipy as spimport as pltfrom ze import leastsqdef get_one_page(url):# 设置头⽂件信息#伪装headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132Safari/537.36' ,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9','Accept-Encoding': 'gzip, deflate','Accept-Language': 'zh-CN,zh;q=0.9','Connection': 'keep-alive','Cookie': '__mta=213236267.90.55.46.4; uuid_n_v=v1;uuid=CD3E8DF084B411EA92CE8B62796C19887B6E5F257E434EBFA5AFAA7C50986BB1;_csrf=d046dea14a42fa27a03f55d5df7be9fc2669e4f8e3f4e33ea42418cf07c04703;Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1587572266; _lxsdk_cuid=171a2ae53b0c8-0824067bcb9761-366b420b-fa000-171a2ae53b1c8; _lxsdk=CD3E8DF084B411EA92CE8B62796C19887B6E5F257E434EBFA5AFAA7C50986BB1; mojo-uuid=0f40d6f281ba65600ee90a8fb913b7ac; t_lxid=171a2ae579fc8-0428f77e331d8b-366b420b-fa000-171a2ae579fc8-tid;Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1587608996; _lxsdk_s=171a580f996-9c1-c73-c3f%7C%7C1','Host': '','Upgrade-Insecure-Requests': '1'}try:response=(url,headers=headers)if _code==200:ng = nt_encodingreturn cept :return None#解析⽹页def get_soup(htm):soup = fulSoup(htm,'')return soup#找到电影名def find_name(soup):x=_all('p',class_="name")n=[]for i in x:()return n#找到电影上映时间def get_time(soup):x=_all('p',class_="releasetime")n=[]for i in x:()return n#找到电影评分def get_score(soup):x=_all('p',class_="score")n=[]for i in x:()return ndef main():url='/board/4?'html = get_one_page(url)#print(html)soup = get_soup(html)#print(soup)#电影名name = find_name(soup)'''for i in name:print(i)'''#上映时间time = get_time(soup)'''for i in time:print(i)'''#评分score = get_score(soup)'''for i in score:print(i)'''#排名#保存excel⽂件df = ame({'电影名':name,'上映时间':time,'评分':score})_excel('猫眼电影.xlsx')if __name__=='__main__':main()df = ame(_excel('猫眼电影.xlsx'))()#缺失值处理().head() #True为缺失值,False为存在值#空值处理#().sum() #0表⽰⽆空值#查找重复值ated() #显⽰表⽰已经删除重复值#查看统计信息be()#绘制条形图df = _excel('猫眼电影.xlsx')x = df['电影名'][:5]y = df['评分'][:5]('电影名')('评分')(x,y)("电影名与评分条形图")()#绘制折线图df = _excel('猫眼电影.xlsx')x = df['电影名'][:10]y = df['评分'][:10]('电影名')('评分')(x,y,color="red",label="折线")("猫眼电影电影名评分折线图")()()df = ame(_excel('猫眼电影.xlsx'))x 
= df['上映时间'][:1]y = df['评分'][:10](x="上映时间",y= "评分",data=df)df = ame(_csv('E:/华北天⽓数据.csv'))q = df['地点']w = df['最⾼温度']def func(p,x):a,b,c=preturn a*x*x+b*x+cdef error_func(p,x,y):return func(p,x)-yp0=[0,0,0]Para=leastsq(error_func,p0,args=(q,w))a,b,c=Para[0]
(figsize=(6,3))
r(q,w,color="grenn",label=u"最⾼评分散点",linewidth=2)x=ce(0,20,15)
y=a*x*x+b*x+(x,y,color="green",label=u"回归⽅程曲线",linewidth=2)
("电影名")[:10]("最⾼评分")[:10]("猫眼电影回归曲线图")()
()
1.经过对主题数据的分析与可视化,可以得到哪些结论?通过分析与可视化,可以更直观地发现和解决需求,大量数据一目了然:追求电影质量的观众可以更直接地找到高评分电影,需要查找影片时,也可以从分析与可视化后的数据中更快、更方便地查到。2.对本次程序设计任务完成的情况做一个简单的小结。本次程序设计任务完成时间较久,主要原因是对 Python 还不够熟悉;但是随着这次作业的完成,我熟悉了许多库与语法,对自己有较大的提升。尽管程序中还是有很多语法错误与不完整的地方,但是我相信一步一个脚印,总会有收获的!
发布者:admin,转载请注明出处:http://www.yc00.com/xiaochengxu/1687865874a52059.html
评论列表(0条)