2023年7月3日发(作者:)
# -*- coding: utf-8 -*-
# 1. Download Baidu image-search results for a keyword (火车票 "train ticket").
#
# NOTE(review): this block was reconstructed from an extraction-mangled web
# scrape: attribute/module names before dots had been stripped (e.g.
# ``requests.get`` appeared as ``(``, ``re.findall`` as ``l(``).  The
# reconstruction follows the widely circulated original of this script;
# confirm the page-parsing regexes and the flip-endpoint URL, whose tail
# was truncated in the scraped source.
import re
import sys
import urllib.parse

import requests


def get_onepage_urls(onepageurl):
    """Return ``(pic_urls, fanye_url)`` parsed from one Baidu result page.

    ``pic_urls`` is the list of full-size image URLs on the page and
    ``fanye_url`` is the absolute URL of the next result page ('' when
    there is none).  An empty/falsy ``onepageurl`` means pagination is
    exhausted: ``([], '')`` is returned without any network access.
    Network/parse errors are printed and also yield ``([], '')``.
    """
    if not onepageurl:
        print('已到最后一页, 结束')
        return [], ''
    try:
        html = requests.get(onepageurl)
        html.encoding = 'utf-8'  # Baidu serves UTF-8; force-decode as such
        html = html.text
    except Exception as e:
        print(e)
        pic_urls = []
        fanye_url = ''
        return pic_urls, fanye_url
    # "objURL" entries hold the original-size image links on the flip page.
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # Anchor labelled 下一页 ("next page"); absent on the last page.
    # TODO(review): pattern reconstructed — verify against the live markup.
    fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url


def down_pic(pic_urls):
    """Download every URL in ``pic_urls`` as 1.jpg, 2.jpg, ... in the CWD.

    Failures are reported and skipped so one bad link does not abort the run.
    """
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            with open(string, 'wb') as f:
                f.write(pic.content)
                print('成功下载第%s张图片: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            print('下载第%s张图片时失败: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue


if __name__ == '__main__':
    keyword = '火车票'  # search keyword — change to whatever you want to crawl
    # Baidu image-search "flip" endpoint.  NOTE(review): host and the
    # query-string tail were truncated in the scraped source — confirm.
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word='
    url_init = url_init_first + urllib.parse.quote(keyword, safe='/')
    all_pic_urls = []
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)

    fanye_count = 0  # number of pages flipped so far
    while 1:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        # print('第%s页' % str(fanye_count))
        if fanye_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)

    # Deduplicate before downloading (ordering is not significant here).
    down_pic(list(set(all_pic_urls)))
# coding:utf-8
# 2. Crawl Google, Bing and Baidu image search simultaneously with the
# third-party ``icrawler`` library, rewriting downloaded file names.
# Images are stored under ``base_dir`` ('F:/文档/text').
#
# NOTE(review): reconstructed from an extraction-mangled scrape (module
# paths such as ``icrawler.builtin`` and attribute accesses were
# stripped).  Import targets and ``log_level=logging.INFO`` follow
# icrawler's documented layout — confirm against the installed version.
import base64
import logging
import sys
from datetime import date  # unused in the visible code; kept from the original
from urllib.parse import urlparse

from icrawler import ImageDownloader
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler


class PrefixNameDownloader(ImageDownloader):
    """Downloader keeping icrawler's default name, prefixed with 'prefix_'."""

    def get_filename(self, task, default_ext):
        filename = super(PrefixNameDownloader, self).get_filename(
            task, default_ext)
        return 'prefix_' + filename


class Base64NameDownloader(ImageDownloader):
    """Downloader naming each file ``base64(url_path).<ext>``.

    The extension is taken from the URL path when it is a recognised image
    type, otherwise ``default_ext`` is used.
    """

    def get_filename(self, task, default_ext):
        url_path = urlparse(task['file_url'])[2]  # path component of the URL
        if '.' in url_path:
            extension = url_path.split('.')[-1]
            if extension.lower() not in [
                    'jpg', 'jpeg', 'png', 'bmp', 'tiff', 'gif', 'ppm', 'pgm'
            ]:
                extension = default_ext
        else:
            extension = default_ext
        # Base64-encode the URL path so the name is unique and filesystem-safe.
        filename = base64.b64encode(url_path.encode()).decode()
        return '{}.{}'.format(filename, extension)


def test_google(dir, keyword):
    """Crawl Google Images for ``keyword`` into ``dir`` (up to 1000 images)."""
    print('启用google爬虫')
    google_crawler = GoogleImageCrawler(parser_threads=20,
                                        downloader_threads=20,
                                        downloader_cls=Base64NameDownloader,
                                        storage={'root_dir': dir},
                                        log_level=logging.INFO)
    google_crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                         min_size=(200, 200), max_size=None)


def test_bing(dir, keyword):
    """Crawl Bing Images for ``keyword`` into ``dir`` (up to 1000 images)."""
    keyword = keyword.replace(': ', '')
    print('启用bing爬虫', keyword)
    bing_crawler = BingImageCrawler(
        # parser_threads=16,
        downloader_cls=Base64NameDownloader,
        downloader_threads=16,
        storage={'root_dir': dir},
        log_level=logging.INFO)
    bing_crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                       min_size=None, max_size=None)


def test_baidu(dir, keyword):
    """Crawl Baidu Images for ``keyword`` into ``dir`` (up to 1000 images)."""
    keyword = keyword.replace(': ', '')
    print('启用百度爬虫', keyword)
    baidu_crawler = BaiduImageCrawler(
        # parser_threads=16,
        # downloader_threads=16,
        downloader_cls=Base64NameDownloader,
        storage={'root_dir': dir},
        log_level=logging.INFO)
    baidu_crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                        min_size=None, max_size=None)


def main():
    """Dispatch to the engines named on the command line (default: all)."""
    keyword = '火车票'
    base_dir = 'F:/文档/text'
    if len(sys.argv) == 1:
        dst = 'all'
    else:
        dst = sys.argv[1:]
    if 'all' in dst:
        dst = ['google', 'bing', 'baidu']
    if 'google' in dst:
        test_google(base_dir, keyword)
    if 'bing' in dst:
        test_bing(base_dir, keyword)
    if 'baidu' in dst:
        test_baidu(base_dir, keyword)


if __name__ == '__main__':
    main()
3、github 搜索爬⾍,有许多有趣的项⽬。
发布者:admin,转转请注明出处:http://www.yc00.com/xiaochengxu/1688382653a129688.html
评论列表(0条)