Scraping Jianshu with Scrapy + Selenium and Saving to MySQL

This is a fairly complete project: the crawler works end to end and saves its results to a local MySQL database. The code is laid out below.

The main spider program (spider):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu_spider.items import ArticleItem

class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath('//h1[@class="title"]/text()').get()
        avatar = response.xpath('//a[@class="avatar"]/img/@src').get()
        author = response.xpath('//span[@class="name"]/a/text()').get()
        # strip the trailing asterisk from the publish time; guard against a missing node
        pub_time = (response.xpath('//span[@class="publish-time"]/text()').get() or '').replace('*', '')

        # extract the article id from the url (drop the query string, take the last path segment)
        url = response.url
        url1 = url.split('?')[0]
        article_id = url1.split('/')[-1]

        content = response.xpath('//div[@class="show-content-free"]').get()
        origin_url = response.url

        read_count = response.xpath('//span[@class="views-count"]/text()').get()
        like_count = response.xpath('//span[@class="likes-count"]/text()').get()
        word_count = response.xpath('//span[@class="wordage"]/text()').get()
        comment_count = response.xpath('//span[@class="comments-count"]/text()').get()
        subjects = ','.join(response.xpath('//div[@class="include-collection"]/a/div/text()').getall())

        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            article_id=article_id,
            origin_url=origin_url,
            content=content,
            read_count=read_count,
            like_count=like_count,
            word_count=word_count,
            comment_count=comment_count,
            subjects=subjects,
        )
        yield item
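
The Rule's regex, r'.*/p/[0-9a-z]{12}.*', is meant to match Jianshu article pages, whose URL slug is a 12-character lowercase hex-like id. As a quick sanity check, here is a minimal sketch (the example URLs are made up for illustration):

import re

# the same pattern used in the CrawlSpider rule above
ARTICLE_RE = re.compile(r'.*/p/[0-9a-z]{12}.*')

# hypothetical URLs, purely to show what the rule does and does not match
print(bool(ARTICLE_RE.match('https://www.jianshu.com/p/24c60b468527')))  # True: article page
print(bool(ARTICLE_RE.match('https://www.jianshu.com/u/short')))         # False: user page

The spider itself is started from the project root with scrapy crawl js.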

Designing the item:

import scrapy

class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    read_count = scrapy.Field()
    like_count = scrapy.Field()
    word_count = scrapy.Field()
    comment_count = scrapy.Field()
    subjects = scrapy.Field()

Saving to the database with pymysql (the pipeline):

import pymysql

class JianshuSpiderPipeline(object):

    def __init__(self):
        # MySQL connection parameters
        dbparams = {
            'host': 'localhost',
            'port': 3306,
            'user': 'test',
            'password': '******',
            'database': 'jianshu',
            'charset': 'utf8',      # note: MySQL expects 'utf8' here, not 'utf-8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()        # create a cursor
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (
            item['title'],
            item['content'],
            item['pub_time'],
            item['article_id'],
            item['author'],
            item['avatar'],
            item['origin_url'],
        ))
        self.conn.commit()
        return item

    @property
    def sql(self):
        # build the INSERT statement once and cache it
        if not self._sql:
            self._sql = """
            insert into article(id, title, content, pub_time, article_id, author, avatar, origin_url)
            values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
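
The pipeline assumes an article table already exists in the jianshu database. The post does not include the schema, so the following is a hypothetical sketch of a table that would satisfy the INSERT above (the column names come from the SQL; the types are assumptions):

import pymysql

# hypothetical schema matching the pipeline's INSERT; all column types are guesses
CREATE_TABLE = """
create table if not exists article (
    id int primary key auto_increment,
    title varchar(255),
    content longtext,
    pub_time varchar(50),
    article_id varchar(20),
    author varchar(100),
    avatar varchar(255),
    origin_url varchar(255)
) charset=utf8;
"""

conn = pymysql.connect(host='localhost', port=3306, user='test',
                       password='******', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(CREATE_TABLE)
conn.commit()
conn.close()

Because the INSERT passes null for id, the column has to be auto_increment so MySQL fills it in.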

Designing the Selenium middleware for dynamic page loading (middlewares):

import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from scrapy.http.response.html import HtmlResponse

class SeleniumDownloadMiddleware(object):

    def __init__(self):
        # launch Firefox through a local geckodriver (Selenium 3-style executable_path)
        self.driver = webdriver.Firefox(executable_path=r'D:\huohuWEB\geckodriver.exe')

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)

        # keep clicking the "show more" button until it no longer exists on the page
        while True:
            try:
                show_more = self.driver.find_element_by_class_name('show-more')
                show_more.click()
                time.sleep(0.5)
            except NoSuchElementException:
                break

        # hand the fully rendered page back to Scrapy in place of a normal download
        source = self.driver.page_source
        response = HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
        return response
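
Neither the pipeline nor the middleware takes effect until it is registered in settings.py. The post skips this step; here is a minimal sketch, assuming the project module is named jianshu_spider (as the item import at the top of the spider suggests) and the classes live in the default pipelines.py and middlewares.py files:

# settings.py (sketch; the module paths assume the default Scrapy project layout)
ROBOTSTXT_OBEY = False   # the crawl would likely be blocked if robots.txt were obeyed

DOWNLOADER_MIDDLEWARES = {
    'jianshu_spider.middlewares.SeleniumDownloadMiddleware': 543,
}

ITEM_PIPELINES = {
    'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
}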

Once the crawl is running, the results can be checked directly in the article table of the jianshu database.
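
For a quick check from Python rather than the MySQL client, a small sketch reusing the connection parameters from the pipeline:

import pymysql

# print the five most recently inserted articles
conn = pymysql.connect(host='localhost', port=3306, user='test',
                       password='******', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('select id, title, author, pub_time from article order by id desc limit 5')
    for row in cursor.fetchall():
        print(row)
conn.close()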