This is a fairly complete project: the crawling logic works end to end and the results are saved to a local MySQL database. The code is below.
The main spider program (spider):
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from jianshu_spider.items import ArticleItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    rules = (
        # Article pages look like /p/<12 hex chars>; follow them into parse_detail
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath('//h1[@class="title"]/text()').get()
        avatar = response.xpath('//a[@class="avatar"]/img/@src').get()
        author = response.xpath('//span[@class="name"]/a/text()').get()
        # guard against a missing node so .replace() is not called on None
        pub_time = (response.xpath('//span[@class="publish-time"]/text()').get() or '').replace('*', '')
        # extract the article id from the URL (strip the query string first)
        url = response.url
        url1 = url.split('?')[0]
        article_id = url1.split('/')[-1]
        content = response.xpath('//div[@class="show-content-free"]').get()
        origin_url = response.url
        read_count = response.xpath('//span[@class="views-count"]/text()').get()
        like_count = response.xpath('//span[@class="likes-count"]/text()').get()
        word_count = response.xpath('//span[@class="wordage"]/text()').get()
        comment_count = response.xpath('//span[@class="comments-count"]/text()').get()
        subjects = ','.join(response.xpath('//div[@class="include-collection"]/a/div/text()').getall())
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            article_id=article_id,
            origin_url=origin_url,
            content=content,
            read_count=read_count,
            like_count=like_count,
            word_count=word_count,
            comment_count=comment_count,
            subjects=subjects,
        )
        yield item
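With the spider in place, the crawl is normally started with scrapy crawl js from the project root. As an alternative, here is a minimal sketch of launching it from a script; the module path jianshu_spider.spiders.js is an assumption about where the spider file lives in the default project layout:

# run_spider.py -- minimal launch script (equivalent to `scrapy crawl js`)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from jianshu_spider.spiders.js import JsSpider  # module path is an assumption

if __name__ == '__main__':
    # load settings.py so the pipeline and middleware are picked up
    process = CrawlerProcess(get_project_settings())
    process.crawl(JsSpider)
    process.start()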
Defining the Item:
import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    read_count = scrapy.Field()
    like_count = scrapy.Field()
    word_count = scrapy.Field()
    comment_count = scrapy.Field()
    subjects = scrapy.Field()
Saving to the database with pymysql (the pipeline):
import pymysql


class JianshuSpiderPipeline(object):
    def __init__(self):
        """Configure the MySQL connection parameters."""
        dbparams = {
            'host': 'localhost',
            'port': 3306,
            'user': 'test',
            'password': '******',
            'database': 'jianshu',
            'charset': 'utf8',  # note: MySQL expects 'utf8' here, not 'utf-8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()  # create a cursor
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (
            item['title'],
            item['content'],
            item['pub_time'],
            item['article_id'],
            item['author'],
            item['avatar'],
            item['origin_url'],
        ))
        self.conn.commit()
        return item

    @property
    def sql(self):
        # build the INSERT statement lazily and cache it
        if not self._sql:
            self._sql = """
                insert into article(id, title, content, pub_time, article_id, author, avatar, origin_url)
                values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
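The pipeline assumes an article table already exists in the jianshu database. A possible one-off setup sketch is below; the column types are my assumptions, not part of the original project, so adjust them to your data:

# create_table.py -- one-off setup for the `article` table the pipeline writes to
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='test',
                       password='******', database='jianshu', charset='utf8')
# column types below are assumptions; only the column names come from the pipeline
ddl = """
create table if not exists article(
    id int primary key auto_increment,
    title varchar(200),
    content longtext,
    pub_time varchar(50),
    article_id varchar(20),
    author varchar(50),
    avatar varchar(255),
    origin_url varchar(255)
)
"""
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()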
Selenium downloader middleware for dynamically rendered pages (middlewares):
import time

from scrapy.http.response.html import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Firefox(executable_path=r'D:\huohuWEB\geckodriver.exe')

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)
        # keep clicking "show more" until the button can no longer be found
        # (Selenium 3 API; Selenium 4 uses driver.find_element(By.CLASS_NAME, ...))
        try:
            while True:
                show_more = self.driver.find_element_by_class_name('show-more')
                show_more.click()
                time.sleep(0.5)
        except NoSuchElementException:
            pass
        # hand the fully rendered page back to Scrapy as the response
        source = self.driver.page_source
        response = HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
        return response
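For the pipeline and the middleware to take effect, they must be enabled in settings.py. A minimal sketch, assuming the default jianshu_spider project layout (the module paths and priority numbers are assumptions):

# settings.py (relevant entries only)
ROBOTSTXT_OBEY = False  # assumption: otherwise robots.txt may block the crawl

DOWNLOADER_MIDDLEWARES = {
    'jianshu_spider.middlewares.SeleniumDownloadMiddleware': 543,
}

ITEM_PIPELINES = {
    'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
}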
Finally, here is what the result looks like: