Scraping imported milk listings from JD.com
Published: 2019-06-11


The spider (milkspider) searches JD.com for 进口牛奶 (imported milk), pulls each product's id, shop, name, price and URL out of the search results, then calls JD's comment-summary API to add the comment counts before writing everything to a CSV file:

# -*- coding: utf-8 -*-
import scrapy
import json
import csv
from milk.items import MilkItem


class MilkspiderSpider(scrapy.Spider):
    name = 'milkspider'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://search.jd.com/Search?keyword=%E8%BF%9B%E5%8F%A3%E7%89%9B%E5%A5%B6&enc=utf-8&suggest=3.def.0.V09--12s0,20s0,38s0&wq=%E8%BF%9B%E5%8F%A3&pvid=96ab0296e9ce494fb251b716911d93ec']
    data_list = []

    def parse(self, response):
        li_list = response.xpath('//li[@class="gl-item"]')
        for li in li_list:
            good_id = li.xpath('./@data-sku').get()  # relative XPath: search from the current <li>
            # print(good_id)
            shop_name = li.xpath('.//a[@class="curr-shop"]/text()').get()
            # print(shop_name)
            good_name = li.xpath('.//div[@class="p-name p-name-type-2"]/a/em/text()').getall()
            good_name = ','.join(good_name).strip().replace(",", "").replace("\n\t", "")
            # print(good_name)
            good_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href').get()
            if not good_url.startswith('https:'):
                good_url = 'https:' + good_url
            # print(good_url)
            good_price = li.xpath('.//div[@class="p-price"]/strong//text()').getall()
            good_price = ','.join(good_price).replace(",", "")
            # print(good_price)
            # The comment counts are not in the search-page source, so they have to be fetched separately
            item = MilkItem()
            item["shop_name"] = shop_name
            item["good_name"] = good_name
            item["good_price"] = good_price
            item["good_id"] = good_id
            item['good_url'] = good_url
            yield scrapy.Request(url=good_url, meta={"item": item}, callback=self.parse_detail)

    def parse_detail(self, response):
        # The comment data is loaded dynamically
        item = response.meta['item']
        # Build the comment-summary URL for this product
        comment_info_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + item['good_id']
        # print(comment_info_url)
        yield scrapy.Request(url=comment_info_url, meta={"item": item}, callback=self.parse_comment)

    def parse_comment(self, response):
        item = response.meta['item']
        # response.body is bytes; decode it to str
        body = response.body.decode('utf-8', 'replace')
        # the count strings contain 万 (10,000), which does not survive the UTF-8 decode
        json_str = body.replace('\ufffd\ufffd', '万')
        data = json.loads(json_str)
        total_comment = data['CommentsCount'][0]['CommentCountStr']
        good_comment = data['CommentsCount'][0]['GoodCountStr']
        video_count = data['CommentsCount'][0]['VideoCountStr']
        general_count = data['CommentsCount'][0]['GeneralCountStr']
        poor_count = data['CommentsCount'][0]['PoorCountStr']
        item['total_comment'] = total_comment
        item['good_comment'] = good_comment
        item['video_count'] = video_count
        item['general_count'] = general_count
        item['poor_count'] = poor_count
        self.data_list.append(item)
        # print(self.data_list)
        # rewrite the CSV with everything collected so far
        with open('./京东进口牛奶.csv', 'w', encoding='utf-8', errors='ignore', newline="") as csvfile:
            fieldnames = ['good_id', 'good_name', 'shop_name', 'good_url', 'total_comment',
                          'good_comment', 'video_count', 'general_count', 'poor_count', 'good_price']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.data_list)
        return self.data_list

 

items.py, the item definition that declares every field the spider fills in:

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class MilkItem(scrapy.Item):
    # define the fields for your item here like:
    good_id = scrapy.Field()
    good_name = scrapy.Field()
    shop_name = scrapy.Field()
    good_url = scrapy.Field()
    total_comment = scrapy.Field()
    good_comment = scrapy.Field()
    video_count = scrapy.Field()
    general_count = scrapy.Field()
    poor_count = scrapy.Field()
    good_price = scrapy.Field()
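MilkItem behaves like a dict with a fixed set of allowed keys, which is why the spider can assign fields one at a time and csv.DictWriter can consume the collected items directly. A quick illustration with made-up values:

from milk.items import MilkItem

item = MilkItem(good_id='100012345678', good_name='Imported whole milk 1L x 12')
item['good_price'] = '59.90'   # fields are assigned like dict entries
print(dict(item))              # {'good_id': '100012345678', 'good_name': ..., 'good_price': '59.90'}
# item['discount'] = '10%'     # would raise KeyError: only declared fields are allowed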

 

start.py, a small launcher that kicks off the crawl when the file is run:

from scrapy import cmdline

cmdline.execute("scrapy crawl milkspider".split())
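The cmdline.execute call above simply re-invokes the scrapy command. An alternative launcher, convenient for running or debugging inside an IDE, is Scrapy's CrawlerProcess API. A minimal sketch, assuming the script sits in the project root next to scrapy.cfg so that get_project_settings() finds the project settings:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('milkspider')  # spider name, resolved through the project's spider loader
process.start()              # blocks until the crawl finishes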

 

Reposted from: https://www.cnblogs.com/kenD/p/11123581.html

Articles you may be interested in

Information Security 1: a detailed look at the Playfair cipher algorithm in Python [original]
Linq
New data types in Objective-C
Using custom fonts in your own iOS app
3D button effects on a page
CSS: WeChat open UI styles
Learning TensorFlow (2): getting started properly
Using TableViewer
How GDB debugging works: the ptrace system call
SQL containing single quotes
A recursion problem caused by .NET 2.0 handling recursive variables differently
ASP.NET database connections with transactions (Part 1)
Learning Ionic
Using ContentProvider
Fetching a Chinese character data dictionary through an aggregation API
An STM32 DMA example
A summary of Spring MVC basics
The Java RandomAccessFile class (random file access)
Writing a popup window and centering it
XML Helper, an XML utility class