如何用Scrapy框架帮助菜鸟高效爬取糗事百科内容?
- 内容介绍
- 文章标签
- 相关推荐
本文共计623个文字,预计阅读时间需要3分钟。
第一步:创建项目。使用 Scrapy 创建项目:`scrapy startproject [name]`,例如:`scrapy startproject choushibaike`
第二步:进入项目目录并创建爬虫。进入项目文件夹:`cd choushibaike`;创建爬虫:`scrapy genspider baike lovehhy.net`
第三步:配置 baike.py 文件。编辑 `baike.py` 文件,进行相应配置。
第一步:
创建项目
scrapy startproject [name]
如 scrapy startproject choushibaike
第二步:
进入到项目的文件夹目录创建APP
scrapy genspider baike lovehhy.net
第三步:
配置baike.py文件
# -*- coding: utf-8 -*-
import scrapy
from ..items import ChoushibaikeItem
class BaikeSpider(scrapy.Spider):
    """Spider that scrapes joke entries from the QSBK listing on lovehhy.net.

    Each listing page yields one ``ChoushibaikeItem`` per entry and then
    follows the "next page" link, re-entering ``parse`` for the next page.
    """

    name = 'baike'
    allowed_domains = ['lovehhy.net']
    # Scrapy refuses request URLs that carry no scheme, so the start URL must
    # be absolute. NOTE(review): "http://" is assumed here -- confirm whether
    # the site should be fetched over http or https.
    start_urls = ['http://www.lovehhy.net/joke/Detail/QSBK']

    def parse(self, response):
        """Extract the entries of one listing page and follow pagination.

        Args:
            response: The downloaded listing page.

        Yields:
            ``ChoushibaikeItem`` objects with ``title``, ``content``, ``time``
            and ``click`` set, followed by a ``scrapy.Request`` for the next
            page when a next-page link is present.
        """
        titles = response.xpath('//div[@class="cat_llb"]/h3/a/text()').extract()
        contents = response.xpath(
            '//div[@class="cat_llb"]/div[@id="endtext"]/text()'
        ).extract()
        times = response.xpath('//div[@class="cat_llb"]/text()').extract()

        # extract() already returns lists, so they can be zipped directly.
        # zip() stops at the shortest list; entries beyond it are dropped.
        for title, content, meta in zip(titles, contents, times):
            item = ChoushibaikeItem()
            item['title'] = title
            item['content'] = content
            # The text node is split at a fixed offset: the first 22
            # characters go to ``time``; the remainder, minus its final
            # character, goes to ``click``.
            # NOTE(review): offsets depend on the page's exact text layout.
            item['time'] = meta[0:22]
            item['click'] = meta[22:-1]
            yield item

        next_url = response.xpath('//a[text()="下一页>>"]/@href').extract_first()
        # On the last page there is no next link; without this guard
        # urljoin(None) would just re-request the current page.
        if next_url:
            yield scrapy.Request(
                url=response.urljoin(next_url),
                callback=self.parse,
            )
第四步:
配置items.py文件
import scrapy
class ChoushibaikeItem(scrapy.Item):
    """Container for one scraped joke entry."""

    # Title text of the entry.
    title = scrapy.Field()
    # Body text of the entry.
    content = scrapy.Field()
    # Leading part of the entry's metadata text, as sliced by the spider.
    time = scrapy.Field()
    # Trailing part of the entry's metadata text, as sliced by the spider.
    click = scrapy.Field()
第五步:
配置pipelines.py文件
import pymongo


class MongoPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Connection details are read from the ``MONGO_URI`` and ``MONGO_DB``
    project settings. Items are written to a collection named after the
    item's class.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; no connection is opened yet.

        Args:
            mongo_uri: MongoDB connection URI.
            mongo_db: Name of the database to write to.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Create the MongoDB client and select the target database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item into the collection named after its class.

        Returns:
            The unchanged item, so later pipelines can keep processing it.
        """
        name = item.__class__.__name__
        # Collection.insert() is deprecated and was removed in PyMongo 4;
        # insert_one() is the supported call for a single document.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()
第六步:
配置settings.py文件
# -*- coding: utf-8 -*-

# Scrapy settings for choushibaike project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'choushibaike'

SPIDER_MODULES = ['choushibaike.spiders']
NEWSPIDER_MODULE = 'choushibaike.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enabled item pipelines; only the MongoDB pipeline is active.
ITEM_PIPELINES = {
    # 'choushibaike.pipelines.ChoushibaikePipeline': 300,
    'choushibaike.pipelines.MongoPipeline': 400,
}

# MongoDB connection used by MongoPipeline.
# NOTE(review): the credentials and host of the original URI were destroyed
# by e-mail obfuscation ("[emailprotected]") when the article was extracted.
# Replace <password> and <host> with real values before running.
MONGO_URI = 'mongodb://admin:<password>@<host>/'
MONGO_DB = 'choushibaike'
第七步:
运行项目
scrapy crawl baike
本文共计623个文字,预计阅读时间需要3分钟。
第一步:创建项目。使用 Scrapy 创建项目:`scrapy startproject [name]`,例如:`scrapy startproject choushibaike`
第二步:进入项目目录并创建爬虫。进入项目文件夹:`cd choushibaike`;创建爬虫:`scrapy genspider baike lovehhy.net`
第三步:配置 baike.py 文件。编辑 `baike.py` 文件,进行相应配置。
第一步:
创建项目
scrapy startproject [name]
如 scrapy startproject choushibaike
第二步:
进入到项目的文件夹目录创建APP
scrapy genspider baike lovehhy.net
第三步:
配置baike.py文件
# -*- coding: utf-8 -*-
import scrapy
from ..items import ChoushibaikeItem
class BaikeSpider(scrapy.Spider):
    """Spider that scrapes joke entries from the QSBK listing on lovehhy.net.

    Each listing page yields one ``ChoushibaikeItem`` per entry and then
    follows the "next page" link, re-entering ``parse`` for the next page.
    """

    name = 'baike'
    allowed_domains = ['lovehhy.net']
    # Scrapy refuses request URLs that carry no scheme, so the start URL must
    # be absolute. NOTE(review): "http://" is assumed here -- confirm whether
    # the site should be fetched over http or https.
    start_urls = ['http://www.lovehhy.net/joke/Detail/QSBK']

    def parse(self, response):
        """Extract the entries of one listing page and follow pagination.

        Args:
            response: The downloaded listing page.

        Yields:
            ``ChoushibaikeItem`` objects with ``title``, ``content``, ``time``
            and ``click`` set, followed by a ``scrapy.Request`` for the next
            page when a next-page link is present.
        """
        titles = response.xpath('//div[@class="cat_llb"]/h3/a/text()').extract()
        contents = response.xpath(
            '//div[@class="cat_llb"]/div[@id="endtext"]/text()'
        ).extract()
        times = response.xpath('//div[@class="cat_llb"]/text()').extract()

        # extract() already returns lists, so they can be zipped directly.
        # zip() stops at the shortest list; entries beyond it are dropped.
        for title, content, meta in zip(titles, contents, times):
            item = ChoushibaikeItem()
            item['title'] = title
            item['content'] = content
            # The text node is split at a fixed offset: the first 22
            # characters go to ``time``; the remainder, minus its final
            # character, goes to ``click``.
            # NOTE(review): offsets depend on the page's exact text layout.
            item['time'] = meta[0:22]
            item['click'] = meta[22:-1]
            yield item

        next_url = response.xpath('//a[text()="下一页>>"]/@href').extract_first()
        # On the last page there is no next link; without this guard
        # urljoin(None) would just re-request the current page.
        if next_url:
            yield scrapy.Request(
                url=response.urljoin(next_url),
                callback=self.parse,
            )
第四步:
配置items.py文件
import scrapy
class ChoushibaikeItem(scrapy.Item):
    """Container for one scraped joke entry."""

    # Title text of the entry.
    title = scrapy.Field()
    # Body text of the entry.
    content = scrapy.Field()
    # Leading part of the entry's metadata text, as sliced by the spider.
    time = scrapy.Field()
    # Trailing part of the entry's metadata text, as sliced by the spider.
    click = scrapy.Field()
第五步:
配置pipelines.py文件
import pymongo


class MongoPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Connection details are read from the ``MONGO_URI`` and ``MONGO_DB``
    project settings. Items are written to a collection named after the
    item's class.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; no connection is opened yet.

        Args:
            mongo_uri: MongoDB connection URI.
            mongo_db: Name of the database to write to.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Create the MongoDB client and select the target database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item into the collection named after its class.

        Returns:
            The unchanged item, so later pipelines can keep processing it.
        """
        name = item.__class__.__name__
        # Collection.insert() is deprecated and was removed in PyMongo 4;
        # insert_one() is the supported call for a single document.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()
第六步:
配置settings.py文件
# -*- coding: utf-8 -*-

# Scrapy settings for choushibaike project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'choushibaike'

SPIDER_MODULES = ['choushibaike.spiders']
NEWSPIDER_MODULE = 'choushibaike.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enabled item pipelines; only the MongoDB pipeline is active.
ITEM_PIPELINES = {
    # 'choushibaike.pipelines.ChoushibaikePipeline': 300,
    'choushibaike.pipelines.MongoPipeline': 400,
}

# MongoDB connection used by MongoPipeline.
# NOTE(review): the credentials and host of the original URI were destroyed
# by e-mail obfuscation ("[emailprotected]") when the article was extracted.
# Replace <password> and <host> with real values before running.
MONGO_URI = 'mongodb://admin:<password>@<host>/'
MONGO_DB = 'choushibaike'
第七步:
运行项目
scrapy crawl baike

