python爬虫爬取天猫店铺商品数据|江阴雨辰互联

2023年6月27日发(作者：)

# coding=utf-8
"""Python crawler: scrape product and SKU data from a Tmall shop.

NOTE(review): this listing was recovered from a web page whose extraction
dropped dotted names, host names, backslashes and HTML tags.  Every value
marked NOTE(review) below was restored by inference and must be confirmed
against the live site before the script is relied on.
"""
import json
import os
import random
import re
import time

import pandas as pd


class tm(object):
    """Scraper for one Tmall shop, mostly through its mobile endpoints.

    Results are written as GB18030-encoded CSV files inside ``path``.
    """

    # NOTE(review): the listing cut this User-Agent off at "Mobile/14G6";
    # the tail is completed on the assumption it is the stock Firefox-iOS UA.
    MOBILE_UA = ('Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) '
                 'AppleWebKit/603.3.8 (KHTML, like Gecko) '
                 'FxiOS/10.6b8836 Mobile/14G60 Safari/603.3.8')
    DESKTOP_UA = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36')

    # NOTE(review): scheme, host and endpoint file names in the templates
    # below were stripped from the listing and are reconstructed; the query
    # strings are as published (except "sort=defaul", corrected to match the
    # Referer, and "pageNo", see urlitem).
    SHOP_NAME_PATTERN = r'(?:https?:)?//(.*?)\.tmall'
    LISTING_URL = ('https://{}.m.tmall.com/shop/shop_auction_search.do'
                   '?spm=a1z60.7754813.0.0.301755f0pZ1GjU&sort=default')
    LISTING_REFERER = ('https://{}.m.tmall.com/shop/shop_auction_search.htm'
                       '?spm=a1z60.7754813.0.0.301755f0pZ1GjU&sort=default')
    DETAIL_API_URL = ('https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/'
                      '?jsv=2.4.8&appKey=12574478&t={}'
                      '&sign=7c9e1dedaa295fdb175d22c99746493b&api=mtop.taobao.detail.getdetail'
                      '&v=6.0&dataType=jsonp&ttid=2017%40taobao_h5_6.6.0&AntiCreep=true&type=jsonp&callback=mtopjsonp2&'
                      'data=%7B%22itemNumId%22%3A%22{}%22%7D')
    DETAIL_REFERER = ('https://detail.m.tmall.com/item.htm'
                      '?spm=a220m.6910245.0.0.55b17434eiwv4f&id={}')
    CATEGORY_PAGE_URL = ('https://{}.tmall.com/i/asynSearch.htm'
                         '?_ksTS={}_888&callback=jsonp289&mid=w-{}-0&wid={}'
                         '&path=/category-{}.htm&pageNo={}')

    # Column name -> regex applied to the raw listing JSON text.
    _LISTING_FIELDS = (
        ('item_id', r'"item_id":(.*?),"'),
        ('title', r'"title":"(.*?)","'),
        ('sold', r'"sold":"(.*?)","'),
        ('totalSoldQuantity', r'"totalSoldQuantity":(.*?),"'),
        ('skuurl', r'"url":"(.*?)","'),
        ('price', r'"price":"(.*?)","'),
    )
    _LISTING_COLUMNS = ('shop_id', 'shop_title') + tuple(
        name for name, _ in _LISTING_FIELDS)
    _OUTPUT_COLUMNS = _LISTING_COLUMNS + ('skuid', 'prop', 'skuprice')

    # NOTE(review): the opening tags of the first two patterns and the
    # closing tags of the others were stripped from the listing; only
    # 'c-price">... ', 'sale-num">' and 'ui-page-s-len">1/' are original.
    _CATEGORY_FIELDS = (
        ('nm', r'<img alt="(.*?)" '),
        ('idurl', r'<a class="item-name J_TGoldData" href="(.*?)"'),
        ('price', r'c-price">(.*?) <'),
        ('sale', r'sale-num">(.*?)<'),
    )
    _PAGE_COUNT_PATTERN = r'ui-page-s-len">1/(.*?)<'

    def __init__(self, path):
        """Remember *path*, the directory the CSV files are saved in."""
        self.path = path

    @staticmethod
    def _open_session(headers):
        """Return a ``requests`` session that sends *headers* on every call.

        The caller owns the session and should use it as a context manager.
        """
        # Imported here so the pure parsing helpers stay usable where the
        # third-party ``requests`` package is not installed.
        import requests
        from requests.packages.urllib3.exceptions import InsecureRequestWarning

        # SECURITY: every request in this class is sent with verify=False
        # (TLS certificates are not checked); this only silences the warning.
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        session = requests.session()
        session.headers.update(headers)
        return session

    @classmethod
    def _parse_listing(cls, text, shop_id, shop_title):
        """Turn one listing response into a DataFrame, one row per item.

        *shop_id* and *shop_title* are repeated on every row.  A page with
        no items yields an empty frame instead of a length mismatch.
        """
        found = [(name, re.findall(pattern, text))
                 for name, pattern in cls._LISTING_FIELDS]
        count = len(found[0][1])
        data = {'shop_id': [shop_id] * count,
                'shop_title': [shop_title] * count}
        data.update(found)
        return pd.DataFrame(data=data)

    def goodsid(self, url):
        """Collect every item of the shop at *url* into a CSV and return it.

        The listing is fetched page by page, written to
        ``tmgoodsid<shop>.csv`` under ``self.path`` and read back.

        Raises:
            ValueError: if no shop name can be extracted from *url*.
        """
        match = re.search(self.SHOP_NAME_PATTERN, url)
        if match is None:
            raise ValueError('not a Tmall shop URL: {!r}'.format(url))
        shopname = match.group(1)
        searchurl = self.LISTING_URL.format(shopname)
        savepath = os.path.join(self.path, 'tmgoodsid{}.csv'.format(shopname))
        print(savepath)
        headers = {'Accept': '*/*',
                   'Accept-Language': 'zh-CN',
                   'User-Agent': self.MOBILE_UA,
                   'Referer': self.LISTING_REFERER.format(shopname)}
        with self._open_session(headers) as s:
            page1 = s.get(url=searchurl, verify=False).text
            js = json.loads(page1)
            total_page = int(js['total_page'])
            shop_id = js['shop_id']
            shop_title = js['shop_title']
            # mode='w': a re-run must replace the file, otherwise the rows
            # (and a second header line) pile up and are read back below.
            self._parse_listing(page1, shop_id, shop_title).to_csv(
                savepath, mode='w', index=False, encoding="GB18030")
            for i in range(2, total_page + 1):
                time.sleep(random.random() * 2)  # be polite between pages
                html = s.get(url=searchurl + '&p={}'.format(i),
                             verify=False).text
                self._parse_listing(html, shop_id, shop_title).to_csv(
                    savepath, mode='a', index=False, header=False,
                    encoding="GB18030")
        return pd.read_csv(savepath, encoding='GB18030')

    @staticmethod
    def _parse_sku_detail(html):
        """Extract SKU id, property text and price from a detail response.

        *html* is the response text with its backslashes already removed.
        Returns a DataFrame with columns ``skuid``, ``prop`` and ``price``;
        an item without a ``skuBase`` section yields a single row of ' '.

        Raises:
            ValueError: if ``skuBase`` is present but ``sku2info`` is not.
        """
        info = re.search('skuBase":(.*?),"skuCore', html)
        if info is None:
            return pd.DataFrame(
                data={'skuid': [' '], 'prop': [' '], 'price': [' ']})

        raw = info.group(1)  # SKU list plus property (e.g. color) names
        sku_ids = re.findall('"skuId":"(.*?)","', raw)
        prop_paths = re.findall('"propPath":"(.*?)"}', raw)
        sku_base = json.loads(raw)
        # (pid, vid) -> display name, so each propPath is resolved by lookup
        # instead of rescanning every property for every SKU.
        names = {(prop['pid'], value['vid']): value['name']
                 for prop in sku_base['props']
                 for value in prop.get('values', [])}
        prop_list = [
            ''.join(names.get(tuple(pair.split(':')[:2]), '')
                    for pair in path.split(';'))
            for path in prop_paths
        ]

        core = re.search('"sku2info":(.*?)},"s', html)  # prices per SKU
        if core is None:
            raise ValueError('sku2info not found in item detail response')
        sku2info = json.loads(core.group(1))
        price = [sku2info[str(sku)]['price']['priceText'] for sku in sku_ids]
        return pd.DataFrame(
            data={'skuid': sku_ids, 'prop': prop_list, 'price': price})

    def getiddata(self, id):
        """Fetch the SKU table (``skuid``, ``prop``, ``price``) of item *id*."""
        time.sleep(random.random() * 1 + 1)
        t = int(time.time() * 1000)
        url = self.DETAIL_API_URL.format(t, id)
        headers = {'Accept': '*/*',
                   'Accept-Language': 'zh-CN',
                   'User-Agent': self.MOBILE_UA,
                   'Referer': self.DETAIL_REFERER.format(id)}
        print(url)
        with self._open_session(headers) as s:
            html = s.get(url=url, verify=False).text
        # The payload is embedded as an escaped string; dropping the
        # backslashes exposes it to the regexes in _parse_sku_detail.
        html = html.replace('\\', '')
        time.sleep(0.5)
        return self._parse_sku_detail(html)

    def iddata(self, id_df):
        """Expand the item table *id_df* into one row per SKU.

        *id_df* is the frame returned by :meth:`goodsid`.  The result is
        also saved as ``tm<shop_id>.csv`` under ``self.path``; nothing is
        written when *id_df* is empty.
        """
        rows = []
        for item in id_df.to_dict('records'):
            time.sleep(random.random() * 2.56)
            skus = self.getiddata(item['item_id'])
            for sku in skus.to_dict('records'):
                row = {name: item[name] for name in self._LISTING_COLUMNS}
                row['skuid'] = sku['skuid']
                row['prop'] = sku['prop']
                row['skuprice'] = sku['price']
                rows.append(row)
        df = pd.DataFrame(rows, columns=list(self._OUTPUT_COLUMNS))
        if len(id_df):
            # First row, by position: works for a single-item shop as well.
            shopid = id_df['shop_id'].iloc[0]
            df.to_csv(os.path.join(self.path, 'tm{}.csv'.format(shopid)),
                      index=False, encoding="GB18030")
        return df

    @classmethod
    def _parse_category_page(cls, html):
        """Return the ``nm``/``idurl``/``price``/``sale`` lists of one page.

        The last eight matches of every list are dropped, as in the
        original listing (presumably trailing recommendation entries;
        TODO confirm).
        """
        return {name: re.findall(pattern, html)[:-8]
                for name, pattern in cls._CATEGORY_FIELDS}

    def urlitem(self, url, *args):
        """Scrape a shop category page; only works for some shops.

        *url* is a ``.../category-<id>.htm`` page.  Each extra positional
        argument is added to the result as a constant column ``col0``,
        ``col1``, ...  Returns a DataFrame with columns ``nm``, ``idurl``,
        ``price``, ``sale`` plus those extra columns.
        """
        headers = {'Accept': '*/*',
                   'Accept-Language': 'zh-CN',
                   'User-Agent': self.DESKTOP_UA}
        category = re.search('category-(.*?).htm', url).group(1)
        shop = re.search(self.SHOP_NAME_PATTERN, url).group(1)
        columns = {name: [] for name, _ in self._CATEGORY_FIELDS}
        with self._open_session(headers) as s:
            itemhtml = s.get(url=url, verify=False).text
            module = re.search('class="J_TModule"(.*?)"搜索列表"',
                               itemhtml).group(1)
            widget = re.search('data-widgetid="(.*?)" id', module).group(1)
            t = int(time.time() * 1000)
            page_no, page_count = 1, 1
            while page_no <= page_count:
                time.sleep(random.random() * 1 + 1)
                # NOTE(review): the original requested the same URL for
                # every page; "pageNo" is the assumed paging parameter.
                pageurl = self.CATEGORY_PAGE_URL.format(
                    shop, t, widget, widget, category, page_no)
                print(pageurl)
                html = s.get(url=pageurl, verify=False).text
                # NOTE(review): second replace reconstructed as "remove
                # newlines" (the listing shows only ('n','',html)).
                html = html.replace('\\', '').replace('\n', '')
                if page_no == 1:
                    page_count = int(
                        re.search(self._PAGE_COUNT_PATTERN, html).group(1))
                for name, values in self._parse_category_page(html).items():
                    columns[name].extend(values)
                page_no += 1
        df = pd.DataFrame(columns)
        for j, value in enumerate(args):
            df["col" + str(j)] = value
        return df


# Example:
#   crawler = tm(path)
#   url = '...'  # category page URL (lost from the original listing)
#   crawler.urlitem(url, 'computer', 'cpu')
if __name__ == '__main__':
    path = r'E:\tm'
    # NOTE(review): the shop URL was stripped from the original listing;
    # put the shop's home page URL (https://<shop>.tmall.com/) here.
    shop_url = ''
    crawler = tm(path)
    df = crawler.goodsid(shop_url)
    crawler.iddata(df)

发布者：admin，转载请注明出处：http://www.yc00.com/xiaochengxu/1687866048a52072.html