2023年6月30日发(作者:)
# Scraping tourism data with Python + simple visualisation with matplotlib
#
# ---- Package imports ----
import json
import re

import matplotlib.pyplot as plt
import openpyxl
import pandas
import requests

# ---- Scrape the data and convert it to a DataFrame ----
# The figures can be browsed on the query page (QUERY_PAGE_URL).  The site
# loads them asynchronously, so the JSON endpoint (JSON_URL) was found by
# capturing the page's network traffic.  The response body is read with
# ``.text`` and turned into Python objects with ``json.loads()``.  How the
# values are picked out and regrouped follows the structure of that JSON;
# printing it level by level makes the relevant parts easy to see.  The last
# step, ``pandas.DataFrame(df_dict, index=years, columns=names)``, gives a
# frame whose columns are the indicator (region) names and whose row index is
# the years; ``.stack().unstack(level=0)`` then swaps rows and columns.
#
# (Note carried over from the article: Python study group 1136201545 -- more
# study material is available by joining.)

# NOTE(review): the scheme/host/path of both URLs was lost when the article
# was extracted (only "/?cn=C01" and "/?m=QueryData..." survived).  The
# docstrings refer to the National Data site, so the endpoint is presumably
# the one below -- confirm before relying on it.
BASE_URL = "http://data.stats.gov.cn/easyquery.htm"

# Human-facing query page; kept for reference only, it is never requested.
QUERY_PAGE_URL = BASE_URL + "?cn=C01"

# JSON endpoint behind the query page.
JSON_URL = (
    BASE_URL
    + "?m=QueryData&dbcode=hgnd&rowcode=zb&colcode=sj&wds=%5B%5D"
    "&dfwds=%5B%7B%22wdcode%22%3A%22zb%22%2C%22valuecode%22%3A%22A0K05%22%7D%5D"
    "&k1=54&h=1"
)

REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit'
                  '/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def _fetch_indicator_rows():
    """Download the JSON payload and split it into per-indicator rows.

    :return: ``(years, rows)`` where ``years`` is the list of period labels
        and ``rows`` is a list of ``(indicator_name, values)`` pairs; each
        ``values`` list holds the ``strdata`` strings for that indicator, one
        per entry of ``years``.
    :raises requests.RequestException: on network failure or HTTP error.
    :raises KeyError: if the payload does not have the expected layout.
    """
    response = requests.get(JSON_URL, headers=REQUEST_HEADERS,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # json.loads() no longer accepts ``encoding=`` (removed in Python 3.9);
    # ``response.text`` is already a decoded str.
    data = json.loads(response.text)

    returndata = data["returndata"]
    names = [node["name"] for node in returndata["wdnodes"][0]["nodes"]]
    years = [node["name"] for node in returndata["wdnodes"][1]["nodes"]]
    datanodes = returndata["datanodes"]

    # ``datanodes`` is treated as a flat list with one run of values per
    # indicator.  The run length is the number of periods (the original code
    # hard-coded 10, which only works for a ten-year window).
    width = len(years)
    rows = []
    for position, name in enumerate(names):
        chunk = datanodes[position * width:(position + 1) * width]
        rows.append((name, [node["data"]["strdata"] for node in chunk]))
    return years, rows


def get_data_to_df():
    """Fetch the tourism figures from the National Data site as a DataFrame.

    :return: DataFrame whose row index is the indicator name and whose
        columns are the period labels; cells are the raw ``strdata`` strings.
    """
    years, rows = _fetch_indicator_rows()
    names = [name for name, _ in rows]
    df_dict = dict(rows)
    # Built with years as rows first, then flipped so indicators become rows.
    return (
        pandas.DataFrame(df_dict, index=years, columns=names)
        .stack()
        .unstack(level=0)
    )


# ---- Scrape the data and store it in Excel ----
# URL handling is the same as above.  To keep things simple the sheet is
# filled with ``append``; the years form the first row, so a column title
# ("指标") is put in front of them and ends up in cell A1.


def get_data_to_excel(path="tourism_data.xlsx"):
    """Fetch the tourism figures from the National Data site into a workbook.

    The first row is ``"指标"`` followed by the period labels; every later
    row is an indicator name followed by its values.

    :param path: destination ``.xlsx`` file.
        NOTE(review): the file name used by the original article was lost in
        extraction; the default here is a placeholder.
    :return: None; the workbook is written to ``path``.
    """
    years, rows = _fetch_indicator_rows()
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.append(["指标", *years])
    for name, values in rows:
        worksheet.append([name, *values])
    workbook.save(path)


# All four charts below use matplotlib, so the garbled-Chinese and
# garbled-minus-sign problems are dealt with once, here.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # font that can render Chinese
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a garbled glyph
# (Tail of the comment on the previous line: fixes the garbled minus sign.)
#
# ---- Matplotlib pie chart ----
# Call get_data_to_df() to obtain the DataFrame, take the column for 2018 and
# turn it into a dict for easy lookup.  The row index carries the raw labels
# from the site, e.g. "朝鲜入境游客", whereas the chart should only show the
# country name, so "入境游客" is appended to every entry of ``countries`` to
# find the matching visitor count.


def asian_countries_pie(year: str = "2018年") -> None:
    """Show a pie chart of inbound visitors per Asian country for one year.

    :param year: column label to plot; defaults to the year used in the
        original article.
    :raises KeyError: if ``year`` or one of the country rows is missing from
        the downloaded data.
    """
    countries = ["朝鲜", "印度", "印度尼西亚", "日本", "马来西亚",
                 "蒙古", "菲律宾", "新加坡", "韩国", "泰国"]
    # The extracted article had Kangxi-radical look-alikes in these strings
    # (e.g. U+2F0A for U+5165); they are normalised here, presumably matching
    # the labels the site actually returns -- confirm against live data.
    suffix = "入境游客"
    by_row = dict(get_data_to_df()[year])
    # Cells are strings; convert explicitly so matplotlib gets numbers.
    data = [float(by_row[country + suffix]) for country in countries]
    plt.pie(data, labels=countries)
    plt.title(f'{year}亚洲各国入境人次占比', fontsize=18)
    plt.show()


# asian_countries_pie()
# Figure 1 (caption belonging to the commented-out pie-chart call above).
#
# ---- Line chart ----
# Take the DataFrame's column names, turn them into a list and reverse it to
# get the years in ascending order (2009-2018 in the article).  Each
# country's ten-year row is fetched from the DataFrame and reversed the same
# way; the values must be converted to float before matplotlib can plot them.


def asian_countries_line() -> None:
    """Show one line per Asian country with its inbound visitors per year.

    :raises KeyError: if one of the country rows is missing from the data.
    :raises ValueError: if a cell cannot be converted to ``float``.
    """
    countries = ["朝鲜", "印度", "印度尼西亚", "日本", "马来西亚",
                 "蒙古", "菲律宾", "新加坡", "韩国", "泰国"]
    # Kangxi-radical look-alikes from the extracted article are normalised
    # here; presumably this matches the labels the site returns -- confirm.
    suffix = "入境游客"
    df = get_data_to_df()
    # Reversed so the x axis runs in the opposite order to the columns.
    years = df.columns.tolist()[::-1]

    for country in countries:
        row = df.loc[country + suffix]
        values = [float(cell) for cell in row.tolist()][::-1]
        plt.plot(years, values, label=country)

    plt.title("近十年亚洲各国入境人次走势图")
    plt.ylabel('入境游客(万人次)', fontsize=14)  # y-axis title and font size
    plt.legend(loc='upper right')  # show the legend in the upper-right corner
    plt.show()


# asian_countries_line()
# Figure 2 (caption belonging to the commented-out line-chart call above).
#
# ---- Another pie chart ----
# Take the row index, keep the entries whose name contains "洲", and strip
# the trailing four characters "入境游客" to obtain the continent name.  The
# continent visitor counts are then read from the 2018 column.


def continents_pie(year: str = "2018年") -> None:
    """Show a pie chart of inbound visitors per continent for one year.

    :param year: column label to plot; defaults to the year used in the
        original article.
    :raises KeyError: if ``year`` is not a column of the downloaded data.
    """
    # Normalised from the Kangxi-radical look-alikes in the extracted
    # article; presumably matches the site's labels -- confirm.
    suffix = "入境游客"
    df = get_data_to_df()
    continent_rows = [label for label in df.index.tolist() if "洲" in label]
    # Labels and values come from the same row list, so they stay paired.
    continents = [label[:-len(suffix)] for label in continent_rows]
    data = [float(df.at[label, year]) for label in continent_rows]
    plt.pie(data, labels=continents)
    # The original title said "各州"; the chart is about continents ("洲").
    plt.title("各洲入境人次占比")
    plt.show()


# continents_pie()
# Figure 3 (caption belonging to the commented-out pie-chart call above).
#
# ---- Another line chart ----
# The list of continent names is obtained as before.  This time a whole row
# is needed per continent, which is what ``df.loc[row_label]`` provides.
#
# (Note carried over from the article: Python study group 1136201545 -- more
# study material is available by joining.)


def continents_line() -> None:
    """Show one line per continent with its inbound visitors per year.

    :raises ValueError: if a cell cannot be converted to ``float``.
    """
    # Normalised from the Kangxi-radical look-alikes in the extracted
    # article; presumably matches the site's labels -- confirm.
    suffix = "入境游客"
    df = get_data_to_df()
    # Reversed so the x axis runs in the opposite order to the columns.
    years = df.columns.tolist()[::-1]
    continent_rows = [label for label in df.index.tolist() if "洲" in label]

    # One loop instead of six copy-pasted series.  The original labelled the
    # sixth line (Oceania) with continents[4], duplicating the fifth label.
    for label in continent_rows:
        values = [float(cell) for cell in df.loc[label].tolist()][::-1]
        plt.plot(years, values, label=label[:-len(suffix)])

    plt.title("近十年各洲入境人次走势图")
    plt.legend(loc="upper right")
    plt.show()


continents_line()  # Figure 4
发布者:admin,转载请注明出处:http://www.yc00.com/news/1688107254a82537.html
评论列表(0条)