# -*- coding: utf-8 -*-
import csv
import json

import scrapy

from milk.items import MilkItem


class MilkspiderSpider(scrapy.Spider):
    """Scrape a JD.com search result page and each product's comment counts.

    Flow: ``parse`` reads the product list from the search page and requests
    every product page; ``parse_detail`` turns that into a request for the
    comment-summary endpoint; ``parse_comment`` adds the comment counters to
    the item and emits it.  All collected items are written to a CSV file
    once, when the spider closes.
    """

    name = 'milkspider'
    # allowed_domains = ['www.xxx.com']
    start_urls = [
        'https://search.jd.com/Search?keyword=%E8%BF%9B%E5%8F%A3%E7%89%9B%E5%A5%B6'
        '&enc=utf-8&suggest=3.def.0.V09--12s0,20s0,38s0'
        '&wq=%E8%BF%9B%E5%8F%A3&pvid=96ab0296e9ce494fb251b716911d93ec'
    ]
    # NOTE: class-level list, so it is shared by every instance of this spider.
    data_list = []

    _COMMENT_SUMMARY_URL = (
        'https://club.jd.com/comment/productCommentSummaries.action?referenceIds='
    )
    # (item field, key inside the first "CommentsCount" entry of the JSON reply)
    _COMMENT_FIELDS = (
        ('total_comment', 'CommentCountStr'),
        ('good_comment', 'GoodCountStr'),
        ('video_count', 'VideoCountStr'),
        ('general_count', 'GeneralCountStr'),
        ('poor_count', 'PoorCountStr'),
    )
    _CSV_PATH = './京东进口牛奶.csv'
    _CSV_FIELDS = [
        'good_id', 'good_name', 'shop_name', 'good_url', 'total_comment',
        'good_comment', 'video_count', 'general_count', 'poor_count',
        'good_price',
    ]

    def parse(self, response):
        """Yield one product-page request per ``li.gl-item`` on the search page.

        The partially filled ``MilkItem`` travels with the request in
        ``meta['item']``.  Entries that lack a sku or a link are skipped with
        a warning instead of raising.
        """
        for li in response.xpath('//li[@class="gl-item"]'):
            # Paths starting with "./" are relative to the current <li>.
            good_id = li.xpath('./@data-sku').get()
            href = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href').get()
            if not good_id or not href:
                self.logger.warning(
                    'Skipping search result without sku or link: sku=%r href=%r',
                    good_id, href,
                )
                continue

            name_parts = li.xpath(
                './/div[@class="p-name p-name-type-2"]/a/em/text()'
            ).getall()
            price_parts = li.xpath(
                './/div[@class="p-price"]/strong//text()'
            ).getall()

            # Comment counts are not in the search page source; they are
            # fetched later through the follow-up requests.
            item = MilkItem()
            item['shop_name'] = li.xpath('.//a[@class="curr-shop"]/text()').get()
            # Text fragments are concatenated; commas and "\n\t" runs are
            # removed, and stripping is done last so no edge whitespace stays.
            item['good_name'] = (
                ''.join(name_parts).replace(',', '').replace('\n\t', '').strip()
            )
            item['good_price'] = ''.join(price_parts).replace(',', '')
            item['good_id'] = good_id
            # urljoin resolves scheme-relative ("//host/...") and relative
            # links against the page URL and leaves absolute URLs unchanged.
            item['good_url'] = response.urljoin(href)

            yield scrapy.Request(
                url=item['good_url'],
                meta={'item': item},
                callback=self.parse_detail,
            )

    def parse_detail(self, response):
        """Request the comment-summary endpoint for the item in ``meta``.

        The comments on the product page are loaded dynamically, so the
        summary URL is built from the product id instead of parsing the page.
        """
        item = response.meta['item']
        comment_info_url = self._COMMENT_SUMMARY_URL + item['good_id']
        yield scrapy.Request(
            url=comment_info_url,
            meta={'item': item},
            callback=self.parse_comment,
        )

    def parse_comment(self, response):
        """Add the comment counters to the item, record it and yield it.

        Only the finished item is yielded; the accumulated ``data_list`` is
        no longer returned, because that re-emitted every earlier item on
        each response.  A reply that cannot be parsed is logged and dropped.
        """
        item = response.meta['item']
        # response.body is bytes.  Bytes that are not valid UTF-8 decode to
        # U+FFFD; a pair of them is mapped back to "万".
        # NOTE(review): presumably the endpoint replies in a non-UTF-8
        # encoding (GBK?) - confirm, then decode with that codec instead.
        text = response.body.decode('utf-8', 'replace')
        text = text.replace('\ufffd\ufffd', '万')

        try:
            summary = json.loads(text)['CommentsCount'][0]
            counts = {field: summary[key] for field, key in self._COMMENT_FIELDS}
        except (ValueError, KeyError, IndexError, TypeError) as exc:
            self.logger.error(
                'Unusable comment summary for %s: %r', item['good_id'], exc
            )
            return

        for field, value in counts.items():
            item[field] = value
        self.data_list.append(item)
        yield item

    def closed(self, reason):
        """Write every collected item to the CSV file when the spider closes.

        Writing once here replaces rewriting the whole file after every
        single comment response.  No file is written when nothing was
        collected.
        """
        if not self.data_list:
            return
        with open(self._CSV_PATH, 'w', encoding='utf-8', errors='ignore',
                  newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self._CSV_FIELDS)
            writer.writeheader()
            writer.writerows(self.data_list)
items
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MilkItem(scrapy.Item):
    """Container for one scraped product and its comment counters."""

    # Product identity and listing data.
    good_id = scrapy.Field()
    good_name = scrapy.Field()
    shop_name = scrapy.Field()
    good_url = scrapy.Field()

    # Comment counters.
    total_comment = scrapy.Field()
    good_comment = scrapy.Field()
    video_count = scrapy.Field()
    general_count = scrapy.Field()
    poor_count = scrapy.Field()

    # Listed price text.
    good_price = scrapy.Field()
start
from scrapy import cmdline

# Run the "milkspider" crawl through Scrapy's command-line entry point,
# passing the same argv that `scrapy crawl milkspider` would produce.
cmdline.execute(['scrapy', 'crawl', 'milkspider'])