2023年6月29日发布
python的爬⾍项⽬(链家买⼆⼿房)不知不觉,已经⼯作6年有余,恍恍惚惚,有机会满⾜房⼦需求。在收集房⼦信息过程中,做些记录。贝壳的功能很强⼤,但很难满⾜⼩区、距离、教育、⾯积等多个⽅⾯的匹配,使⽤起来成本仍然较⾼。针对以上情况,编写该项⽬,收集链家的⼆⼿房数据。项⽬中,主要根据价格来筛选⼩区,并根据⼩区教育、同⼯作位置的距离来确定关注⼩区,再通过房⼦⾯积、总价、户型来确定可以选择的房⼦ 列表,从⽽将购房精⼒集中在关注的重点⼩区和房⼦中。当然,每个⼈可以根据⾃⼰需求进⾏调整。⼀、基础环境说明1.1 基础环境1.1.1 python1.1.2 request(加载页⾯)1.1.3 BeautifuSoup(提取信息 )常⽤使⽤例⼦:from bs4 import BeautifulSoupsoup = BeautifulSoup(a, "") # '标题'# ⼀、提取标签# 1.1 提取唯⼀标签('h1')_all('h1')[0]# 1.2 提取多个标签_all('h2')# [
标题2
,
标题3
]_all(['h1','h2'])# [
标题1
,
标题2
,
标题3
]# 1.3 使⽤正则表达式import _all(e('^h'))# [
标题1
,
标题2
,
标题3
]# ⼆、匹配属性# 2.1 匹配属性1,直接将属性名作为参数名,但是有些属性不⾏,⽐如像a-b这样的属性_all('p', id = 'p1') # ⼀般情况_all('p', class_='p3') # class是保留字⽐较特殊,需要后⾯加⼀个_# 2.2 匹配属性2,最通⽤的⽅法_all('p', attrs={'class':'p3'}) # 包含这个属性就算,⽽不是只有这个属性_all('p', attrs={'class':'p3','id':'pp'}) # 使⽤多个属性匹配_all('p', attrs={'class':'p3','id':False}) # 指定不能有某个属性_all('p', attrs={'id':['p1','p2']}) # 属性值是p1或_all('p', attrs={'class':True}) # 含有class属性即可# 2.3 匹配属性3,正则表达式匹配import _all('p', attrs={'id':e('^p')}) # 使⽤正则表达式# 三、根据标签内容⽂本来识别# 3.1 匹配标签内容1,正则表达式import _all('p', text=e('段落'))_all('p',text=True)# 3.2 匹配标签内容2,传⼊函数def nothing(c): return c not in ['段落1','段落2','⽂章']_all('p',text=nothing)def has_class_but_no_id(tag): return _attr('class') and not _attr('id')# 四、提取内容# 4.1 提取标签⽂本 # 多层嵌套也可以直接返回 # 也可以这样 # ⾥⾯有多个内容时 'n标题n段落1n段落2n'# 4.2 其他标签的属性值# 提取属性值,像字典⼀样提取,以下两种⽅法等价soup.h.a['href']('href')# 五、提取标签信息print() # 提取标签名print() # 提取标签所有属性值print(_attr('href')) # 检查标签是否有某属性# 五、⽰例('p', attrs={'class':'first'}).text # '⽂字1'_all('p') # [
⽂字1
,
⽂字2
], 再分别从中提取⽂字('ul', attrs={'class':'list1'}).find_all('li') # [
列表1第1项,
列表1第2项]# 代码参考:/p/353545321.1.4 地理位置信息(百度API)调⽤⽅式1:def geocodeB(address): base = "/geocoder?address=%s&output=json&key=yourak&city=上海" % address response = (base) if _code == 200: answer = () if "location" in answer['result'] and "level" in answer['result']: return (address, # round(answer['result']['location']['lng'], 5), answer['result']['location']['lng'], # round(answer['result']['location']['lat'], 5), answer['result']['location']['lat'], answer['result']["level"]) else: ("geocodeB %s warning:%s" % (address, answer)) return None else: ("geocodeB %s Error" % address) return None调⽤⽅式2:def geocodeB2(address): from t import urlopen, quote from import quote_plus import hashlib, json # 以get请求为例/geocoder/v2/?address=百度⼤厦&output=json&ak=yourak queryStr = '/geocoder/v2/?address=%s&city=上海&output=json&ak=$yourak$' % address # 对queryStr进⾏转码,safe内的保留字符不转换 encodedStr = quote(queryStr, safe="/:=&?#+!$,;'@()*[]") # 在最后直接追加上yoursk rawStr = encodedStr + '$yoursn$' sn = 5(quote_plus(rawStr).encode("utf8")).hexdigest() url = '%s&sn=%s' % (encodedStr, sn) req = urlopen(url) res = ().decode() # 将其他编码的字符串解码成unicode answer = (res) # 对json数据进⾏解析 if "location" in answer['result'] and "level" in answer['result']: return answer['result']['location']['lat'], answer['result']['location']['lng'] else: ("geocodeB %s warning:%s" % (address, answer)) return None调⽤⽅式3:def geocode_by_baidu(address): from ers import baidu apikey = '$yourak$' # 从⽹站申请 /apiconsole/key?application=key sn = '$yoursn$' g = (api_key=apikey, security_key=sn, timeout=200) a = e(address) # return (round(de, 6), round(ude, 6)) return de, ude1.1.5 地理获取距离计算(geopy)# x and y is (lat,lng)def get_distance(x, y): from ce import geodesic return round(geodesic(x, y).km, 3)1.1.6 解决懒加载和滚动加载(selenium1.1.6 解决懒加载和滚动加载(selenium))Selenium是⼀个⽤于Web应⽤程序测试的⼯具。Selenium测试直接运⾏在浏览器中,就像真正的⽤户在操作⼀样。⽀持的浏览器包括IE(7, 8, 9, 10,11),Firefox,Safari,Chrome,Opera等。使⽤python爬⾍调⽤selenium来模拟正常⽤户访问浏览器.1.2 主要问题1.2.1 懒加载问题1.2.2 滚动加载问题1.2.3 IP访问限制参考:Python爬⾍ | 
代理IP的获取和使⽤⼆、前期准备2.1 分析获取的需求个⼈买房需求:预算:400万,最多不超过450万;教育:2梯队学区房户型:⼆房以上房龄:1990年后⾯积:60平⽶以上交通:离世纪⼤道乘公交不超过1⼩时===》1、学区情况,根据⼩区攻略的教育评分来过滤,确定⼩区范围2、根据a. ⼩区的中的房⼦的价格,使⽤预算过滤;b.⼩区的位置,通过距离来过滤交通,不满⾜ 的⼩区3、通过符合要求的⼩区列表,来针对每个⼩区获取房⼦列表,并确定跟踪重点⼩区特别说明:1、为什么不直接获取房⼦呢?房⼦⽆法判断是否满⾜教育;如果通过房⼦找⼩区,再找教育,考虑房⼦⽐⼩区多出⼏个数量级,会有更多的时间浪费2、通过预算和⾯积需求,可以确定房⼦的单价,通过单价来筛选⼩区,减少⼩区范围。2.2 分析页⾯路径2.2.1 获取⼩区列表1、⼩区列表的链接分析由于链家仅显⽰前100页内容,⽽整个上海的⼩区显然⽐100页更多,故根据区来获取⼩区。其中1. bp5ep7.5为价格在5-7.5万的区间,bp为begin price;ep为end price。2. pg为page的页⾯2、⼩区是否有评价的判断可以根据第⼀步获取的⼩区列表中,查看⼩区是否存在⼩区攻略标签来判断是否有⼩区评价信息特别说明:并不是每⼀个⼩区,都可以查看到⼩区的教育评分2.2.2 根据⼩区,获取攻略⼩区的攻略地址为:对于⼩区,有总体评分和分项评分,其中分项评分包含建筑品质、户型设计、交通条件、教育质量、商业环境、花园景观、物业管理等评分。每个⼈可以根据⾃⼰的需求,使⽤不同的评分项进⾏⼩区过滤。例如,我优先考虑教育,则以教育条件进⾏主要过滤条件,要求教育8分以上,⽽其他的要求6.5分以上。2.2.3 根据⼩区,获取房⼦列表⼩区的房⼦列表:三、项⽬代码实现3.1 获取⼩区def get_xiaoqu_list(self, area, save_path): page_size = 100 # 由于仅收集上海,故未对多城市处理 fieldnames = ['area', 'page', 'xiaoqu_id', 'url', 'name', "brief", "loc", "build_type", "build_year", "price", "have_gonglue"] # 如果不存在,则创建⼀个空CSV⽂件,包含表头 # 如果已存在,则将记录已处理的记录情况(针对IP限制,需要跑多次情况) handled_list = [] if (save_path): with open(save_path, encoding='utf-8-sig') as csvfile: reader = ader(csvfile) for row in reader: handled_("%s_%s" % (row['area'], row['page'])) else: with open(save_path, "a+", newline='n', encoding='utf-8-sig') as csvfile: writer = iter(csvfile, fieldnames=fieldnames) eader() handled_set = set(handled_list) ( "get_xiaoqu_list, have handled:%s " % (len(handled_set))) # 针对上海各区进⾏处理 with open(save_path, "a+", newline='n', encoding='utf-8-sig') as csvfile: writer = iter(csvfile, fieldnames=fieldnames) for page_num in range(1, page_size): # /sh/xiaoqu/pudong/pb4ep4.5pg10/ url = "/sh/xiaoqu/%s/bp5ep7.5pg%s/" % (area, str(page_num)) if "%s_%s" % (area, page_num) in handled_set: ("%s has been handled." 
% url) continue else: (url) # 获取页⾯内容 r = (url=url, headers=_headers) html = t lj = BeautifulSoup(html, '') page_items = _all('li', attrs={'class': 'pictext'}) # 解析各页中的⼩区列表 if len(page_items) > 0: for item in page_items: xiaoqu_url = ('href') xiaoqu_id = xiaoqu_("/")[-2] xiaoqu_gonglue = _all("p", attrs={"class": "gonglue_title"}) if len(xiaoqu_gonglue) == 0: is_gonglue = 0 else: is_gonglue = 1 xiaoqu_info = _all("div", attrs={"class": "item_list"})[0] xiaoqu_name = xiaoqu__all("div", attrs={"class": "item_main"})[0].string xiaoqu_brief = xiaoqu__all("div", attrs={"class": "item_other"})[0].( "nr "") xiaoqu_brief = " ".join(xiaoqu_()) xiaoqu_loc = xiaoqu_()[0] build_type = xiaoqu_()[1] build_year = (r' (?P
d{1,})年建成', xiaoqu_brief, re.I) if build_year: xiaoqu_build = build_("build_year") else: xiaoqu_build = "" xiaoqu_price = xiaoqu__all("span", attrs={"class": "price_total"})[0]. xiaoqu_dict = { "area": area, "page": page_num, "xiaoqu_id": xiaoqu_id, "url": xiaoqu_url, "name": xiaoqu_name, "brief": xiaoqu_brief, "loc": xiaoqu_loc, "build_type": build_type, "build_year": xiaoqu_build, "price": xiaoqu_price, "have_gonglue": is_gonglue } ow(xiaoqu_dict) ow(xiaoqu_dict) else: # 表⾯已到最后⼀页 break handled_({"%s_%s" % (area, page_num)})3.2 根据⼩区列表,获取包含攻略的⼩区3.2.1 根据单个页⾯获取⼩区详细信息# 根据指定⼩区的id,获取⼩区的攻略信息def get_xiaoqu_gonglue_dict(self, id): url = "/sh/xiaoqu/%s/?click_source=m_resblock_detail#review" % id (url) # 根据url加载页⾯ # /sh/xiaoqu/5/?click_source=m_resblock_detail#review html = (url=url, headers=_headers).content lj = BeautifulSoup(html, '') loc_node = ('div', attrs={'class': 'head_location'}) if loc_node is not None: loc_name = loc_ else: loc_name = "" cpt_content = _all('div', attrs={'id': 'review'})[0] totoal_score = cpt_('div', attrs={'class': "review_score"}).get_text().replace("综合测评得分", "") review_txt = "" if cpt_('div', attrs={'class': "review_txt_box"}) is not None: review_txt = cpt_('div', attrs={'class': "review_txt_box"}).get_text().strip(" nr") review_list_txt = cpt_('ul', attrs={'class': "review_list"}) review_list = review_list__all('li') other = "" jianzhu_score = huxing_score = jiaotong_score = shangye_score = jiaoyu_score = jingguan_score = wuye_score = "" for item in review_list: key = value = ('value') if key == "建筑品质": jianzhu_score = value elif key == "户型设计": huxing_score = value elif key == "交通条件": jiaotong_score = value elif key == "教育质量": jiaoyu_score = value elif key == "商业环境": shangye_score = value elif key == "花园景观": jingguan_score = value elif key == "物业管理": wuye_score = value else: other = " %s:%s " % (key, value) peitao_node = ('div', attrs={"class": "box peitao card_box"}) map_api_node = peitao_('img') if peitao_node is not None else None if 
map_api_node is not None: map_api = map_api_('src') else: map_api = "" def get_geo_from_mapapi(map_api): geo = (r'center=(?P[d.]+),(?P[d.]+)', map_api, re.I) if geo: lat = ("lat") lng = ("lng") else: lat = lng = None return lat, lng lat, lng = get_geo_from_mapapi(map_api) gonglue_dict = { "xiaoqu_id": id, "loc_name": loc_name, "total_score": totoal_score, "review_txt": review_txt if review_txt is not None else "", "jianzhu_score": jianzhu_score if jianzhu_score is not None else "", "huxing_score": huxing_score if huxing_score is not None else "", "jiaotong_score": jiaotong_score if jiaotong_score is not None else "", "jiaoyu_score": jiaoyu_score if jiaoyu_score is not None else "", "shangye_score": shangye_score if shangye_score is not None else "", "jingguan_score": jingguan_score if jingguan_score is not None else "", "wuye_score": wuye_score if wuye_score is not None else "", "map_api": map_api, "lng": lng if lng is not None else "", "lat": lat if lat is not None else "", "other": other } return gonglue_dict3.2.2 根据列表,⽣成所有攻略信息列表# 根据第⼀步获取的⼩区列表,逐项⽣成攻略列表def handle_gonglue_by_xiaoqu(self, file_path, save_path, if_distance=False, local_geo=None): # 判断参数是否正确 if if_distance == True and local_geo is None: ("in handle_gonglue_by_xiaoqu, if_distance's Ture, local_geo can't be None") exit(1) # ⽣成⼩区列表 url_list = [] with open(file_path, encoding='utf-8-sig') as csvfile: reader = ader(csvfile) for row in reader: if row['have_gonglue'] == "1": url_(row['xiaoqu_id']) # 如果攻略列表已存在,则统计已处理的记录 handled_list = [] fieldnames = ['xiaoqu_id', 'loc_name', 'total_score', "review_txt", "jianzhu_score", "huxing_score", "jiaotong_score", "jiaoyu_score", "shangye_score", "jingguan_score", "wuye_score", "map_api", "lat", "lng", "distance", "other"] if (save_path): with open(save_path, encoding='utf-8-sig') as csvfile: reader = ader(csvfile) for row in reader: handled_(row['xiaoqu_id']) else: # 如果不存在,则创建⼀个空CSV⽂件,包含表头 with open(save_path, "a+", newline='n', encoding='utf-8-sig') as csvfile: 
writer = iter(csvfile, fieldnames=fieldnames) eader() handled_set = set(handled_list) ("handle_gonglue_by_xiaoqu, the length of url_list: %s" % len(url_list)) # 针对每⼀个⼩区列表,获取攻略信息 with open(save_path, "a+", newline='n', encoding='utf-8-sig') as csvfile: writer = iter(csvfile, fieldnames=fieldnames) for xiaoqu_id in url_list: if xiaoqu_id not in handled_set: gonglue_dict = _xiaoqu_gonglue_dict(id=xiaoqu_id) if if_distance: distance = get_distance((gonglue_dict["lat"], gonglue_dict["lng"]), local_geo) gonglue_dict["distance"] = distance ow(gonglue_dict) handled_({xiaoqu_id}) else: ("xiaoqu %s is handled" % xiaoqu_id)3.3 根据攻略列表,⽣成关注的房⼦列表3.3.1 获取单个⼩区的房⼦列表# 根据⼩区id,获取⼩区的满⾜条件的房⼦列表def get_houselist_by_xiaoqu(self, xiaoqu_id): # /sh/ershoufang/bp350ep450l2l3ba67ea70c5 # bp350ep450 表⽰价格开始和结束 # l2l3 户型2室和3室 # ba67ea70 ⾯积67-70 # c5 ⼩区编号 url = "/sh/ershoufang/bp350ep450l2l3ba60ea90c%s" % xiaoqu_id html = (url=url, headers=_headers).content house_list = [] lj = BeautifulSoup(html, '') # 页⾯中包含多个列表,包含当前搜索以及推荐其他⼩区 view_body = ('div', attrs={'class': 'list-view-section-body'}) item_list = view__all('div', attrs={'class': 'lj-track', 'data-click-event': 'SearchClick'}) for item in item_list: house_body = ("div", attrs={'class': 'kem__house-tile-ershou'}) house_id = house_("data-id") ("handle house_id:%s" % house_id) house_txt = house_("div", attrs={'class': 'house-text'}) house_title = house_("div", attrs={"class": 'house-title'}).text house_desc = house_("div", attrs={"class": 'house-desc'}).string house_price_total = house_("span", attrs={"class": "price-total"}). 
house_price_unit = house_("span", attrs={"class": "price-unit"}).("元/平") house_dict = { "xiaoqu_id": xiaoqu_id, "house_id": house_id, "title": house_title, "desc": house_desc, "price_total": house_price_total, "price_unit": house_price_unit } house_(house_dict) return house_list3.3.2 根据攻略列表,⽣成房⼦列表# 根据攻略列表,提取关注的⼩区,再逐项获取列表def handle_hoselist_by_gonglue(self, file_path, save_path, filter_func=None): xiaoqu_list = [] with open(file_path, encoding='utf-8-sig') as csvfile: reader = ader(csvfile) for row in reader: if filter_func is not None: if filter_func(row): # 将⼩区的ID,加⼊到处理列表中 xiaoqu_((row["xiaoqu_id"], row["loc_name"], row["distance"])) else: xiaoqu_((row["xiaoqu_id"], row["loc_name"], row["distance"])) handled_list = [] fieldnames = ['xiaoqu_id', 'xiaoqu_name', 'distance', 'house_id', 'title', "desc", "price_total", "price_unit"] if (save_path): with open(save_path, encoding='utf-8-sig') as csvfile: reader = ader(csvfile) for row in reader: handled_(row['xiaoqu_id']) else: # 如果不存在,则创建⼀个空CSV⽂件,包含表头 with open(save_path, "a+", newline='n', encoding='utf-8-sig') as csvfile: writer = iter(csvfile, fieldnames=fieldnames) eader() handled_set = set(handled_list) ( "handle_hoselist_by_xiaoqu, to be handled: %s, have handled:%s " % (len(xiaoqu_list), len(handled_set))) with open(save_path, "a+", newline='n', encoding='utf-8-sig') as csvfile: writer = iter(csvfile, fieldnames=fieldnames) for xiaoqu_id, xiaoqu_loc_name, distance in xiaoqu_list: if xiaoqu_id not in handled_set: ("handle xiaoqu:%s" % xiaoqu_id) house_list = _houselist_by_xiaoqu(xiaoqu_id) if len(house_list) > 0: for house_dict in house_list: house_dict["xiaoqu_name"] = xiaoqu_loc_name house_dict["distance"] = distance ow(house_dict) else: house_dict = { "xiaoqu_id": xiaoqu_id, "xiaoqu_name": xiaoqu_loc_name, "distance": distance } ow(house_dict) ("⼩区:%s %s have no match house." % (xiaoqu_id, xiaoqu_loc_name)) handled_({xiaoqu_id}) else: ("%s is handled" % xiaoqu_id)
发布者:admin,转载请注明出处:http://www.yc00.com/news/1687982266a63486.html
评论列表(0条)