
A Hands-On Scrapy Crawler Tutorial

1. Overview
Today we use Scrapy to crawl the movie site 电影天堂 (http://www.dytt8.net/) and store each film in MySQL. Below is a screenshot of my result:
(screenshot of the resulting tb_movie rows)

2. Python libraries you need
1.scrapy
2.BeautifulSoup
3.MySQLdb
Install them yourself; the pip commands below are one way to do it.
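
If you go the pip route, the following is a minimal install sketch (assuming pip is on your PATH; on Python 3 the MySQLdb module is usually provided by the mysqlclient package, on Python 2 by MySQL-python):

pip install scrapy
pip install beautifulsoup4
pip install mysqlclient   # provides the MySQLdb module on Python 3; use MySQL-python on Python 2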

3. Crawling steps

1. Create a tb_movie table to hold the movie data. The fields I collect here are fairly detailed; trim them down as you see fit.

CREATE TABLE `tb_movie` (
  `movie_id` int(10) NOT NULL AUTO_INCREMENT COMMENT 'movie id, primary key',
  `full_title` varchar(500) COLLATE utf8_unicode_ci DEFAULT NULL,
  `category_name` varchar(20) COLLATE utf8_unicode_ci DEFAULT NULL,
  `txt_yiming` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'translated title (译  名)',
  `txt_pianming` varchar(500) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'original title (片  名)',
  `txt_nian_dai` varchar(100) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'year (年  代)',
  `txt_chan_di` varchar(500) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'country/region (产  地)',
  `txt_lei_bie` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'genre (类  别)',
  `txt_yu_yan` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'language (语  言)',
  `txt_zi_mu` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'subtitles (字  幕)',
  `txt_shang_yin_ri_qi` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'theatrical release date (上映日期)',
  `txt_imdb` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'IMDb rating',
  `txt_dou_ban` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'Douban rating (豆瓣评分)',
  `txt_format` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'file format',
  `txt_chi_cun` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'video resolution (视频尺寸)',
  `txt_size` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'file size',
  `txt_pian_chang` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'runtime (片  长)',
  `txt_dao_yan` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'director (导  演)',
  `txt_zhu_yan` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'cast (主  演)',
  `txt_jian_jie` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'synopsis (简  介)',
  `small_images` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'synopsis image list (comma separated)',
  `big_images` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'large image / poster',
  `release_time` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'publish time on dytt',
  `download_url` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'download url',
  `create_time` datetime DEFAULT NULL COMMENT 'created at',
  `modify_time` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'updated at',
  `detail_url` varchar(1000) COLLATE utf8_unicode_ci DEFAULT NULL,
  PRIMARY KEY (`movie_id`),
  KEY `index_category_name` (`category_name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci COMMENT='movie main table';

2. Create the Scrapy project, then go into the project directory and generate a spider named getmovie.py (the resulting layout is sketched after the commands):

1.scrapy startproject dytt
2.cd dytt
3.scrapy genspider getmovie dytt8.net
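
After these three commands the project should look roughly like this (a sketch; the exact files vary slightly between Scrapy versions). items.py, pipelines.py, settings.py and spiders/getmovie.py are the files edited in the following steps:

dytt/
    scrapy.cfg
    dytt/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            getmovie.py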

3. Write items.py; its fields correspond to the database columns (make sure items.py is saved as UTF-8):

class DyttItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # translated title (译  名)
    txtYiming = scrapy.Field()
    # original title (片  名)
    txtPianming = scrapy.Field()
    # year (年  代)
    txtNianDai = scrapy.Field()
    # country/region (产  地)
    txtChanDi = scrapy.Field()
    # genre (类  别)
    txtLeiBie = scrapy.Field()
    # language (语  言)
    txtYuYan = scrapy.Field()
    # subtitles (字  幕)
    txtZiMu = scrapy.Field()
    # theatrical release date (上映日期)
    txtShangYinRiQi = scrapy.Field()
    # IMDb rating
    txtImdb = scrapy.Field()
    # Douban rating (豆瓣评分)
    txtDouBan = scrapy.Field()
    # file format
    txtFormat = scrapy.Field()
    # video resolution (视频尺寸)
    txtChiCun = scrapy.Field()
    # file size
    txtSize = scrapy.Field()
    # runtime (片  长)
    txtPianChang = scrapy.Field()
    # director (导  演)
    txtDaoYan = scrapy.Field()
    # cast (主  演)
    txtZhuYan = scrapy.Field()
    # synopsis (简  介)
    txtJianJie = scrapy.Field()
    # synopsis image list
    smallImages = scrapy.Field()
    # large image / poster
    bigImages = scrapy.Field()
    # publish time on dytt
    releaseTime = scrapy.Field()
    # download url
    downloadUrl = scrapy.Field()
    # full title
    fullTitle = scrapy.Field()
    # detail page url on dytt
    detailUrl = scrapy.Field()
    # category
    categoryName = scrapy.Field()

4. Write spiders/getmovie.py
Its main job is to request and parse the pages and pack the data into the DyttItem defined above.
The site is split into roughly ten categories; for each category the spider walks up to 500 list pages, and every list page links to the detail page of each movie, which is requested in turn. The code is commented in detail; open the 电影天堂 site and inspect the corresponding HTML structure to follow along.

# -*- coding: utf-8 -*-
import scrapy
import re
import logging
from dytt.items import DyttItem
from bs4 import BeautifulSoup
import urllib.request

class GetmovieSpider(scrapy.Spider):
    name = 'getmovie'
    allowed_domains = ['dytt8.net']
    start_urls = ['http://dytt8.net/']


    def getUrl(self,pageIndex):
        # Only one category is crawled per run: keep one block uncommented and
        # comment out the rest. The other categories live under www.ygdy8.net,
        # so self.domain has to be switched together with the URL.
        self.domain = "http://www.dytt8.net"
        self.categoryName = "最新影片"
        return "http://www.dytt8.net/html/gndy/dyzz/list_23_" + str(pageIndex) + ".html"

        # self.domain = "http://www.ygdy8.net"
        #
        # self.categoryName = "国内电影"
        # return "http://www.ygdy8.net/html/gndy/china/list_4_" + str(pageIndex) + ".html"
        #
        # self.categoryName = "日韩影片"
        # return "http://www.ygdy8.net/html/gndy/rihan/list_6_" + str(pageIndex) + ".html"
        #
        # self.categoryName = "欧美影片"
        # return "http://www.ygdy8.net/html/gndy/oumei/list_7_" + str(pageIndex) + ".html"
        #
        # self.categoryName = "华语电视"
        # return "http://www.ygdy8.net/html/tv/hytv/list_71_" + str(pageIndex) + ".html"
        #
        # self.categoryName = "日韩电视"
        # return "http://www.ygdy8.net/html/tv/rihantv/list_8_" + str(pageIndex) + ".html"
        #
        # self.categoryName = "欧美电视"
        # return "http://www.ygdy8.net/html/tv/oumeitv/index.html"
        #
        # self.categoryName = "最新综艺"
        # return "http://www.ygdy8.net/html/zongyi2013/list_99_" + str(pageIndex) + ".html"
        #
        # self.categoryName = "旧版综艺"
        # return "http://www.ygdy8.net/html/2009zongyi/list_89_" + str(pageIndex) + ".html"
        #
        # self.categoryName = "动漫资源"
        # return "http://www.ygdy8.net/html/dongman/list_16_" + str(pageIndex) + ".html"
        #
        # self.categoryName = "游戏下载"
        # return "http://www.ygdy8.net/html/game/list_19_" + str(pageIndex) + ".html"

    # called by Scrapy to build the initial requests
    def start_requests(self):
        reqs = []
        # pages 1 through 499 of the selected category
        for i in range(1,500):
            req = scrapy.Request(self.getUrl(i))
            reqs.append(req)
        return reqs
    # called for every list page
    def parse(self,response):
        tables = response.xpath('//table[@class="tbspan"]')
        for itemEntry in tables:
            item = DyttItem()
            details = itemEntry.xpath('tr[2]/td[2]/b').extract()[0]
            soup = BeautifulSoup(details, 'html.parser')
            aList = soup.findAll('a', {'class':'ulink'})
            # some rows carry two links; in that case the second one points at the detail page
            if len(aList) == 1:
                a = aList[0]
            else:
                a = aList[1]
            detailUrl = self.domain + a.get('href')
            fullTitle = a.string

            item['fullTitle'] = fullTitle
            item['detailUrl'] = detailUrl
            item['categoryName'] = self.categoryName
            # follow the detail page, carrying the half-filled item along in meta
            yield scrapy.Request(url=detailUrl,meta={'item':item},callback=self.parseDetail,dont_filter=True)

    # called for every detail page; fills the remaining item fields
    def parseDetail(self,response):
        item = response.meta['item']
        logging.log(logging.WARNING,str(item['fullTitle']) + "  url:"+response.url)
        releaseTime = response.xpath('//div[@class="co_content8"]').extract()[0]
        movContent = response.xpath('//div[@id="Zoom"]').extract()
        # pull the publish date out with a regex
        releaseTime = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",releaseTime).group(0)
        # movie download link (strip the surrounding <a ... href=" and "> characters)
        downloadUrl = re.search(r'(<a[^>]*href="([^"]+)"[^>]*>)',movContent[0]).group(0)[9:-2]
        # list of image urls found in the description
        imagesArray = self.findImageSrc(movContent[0])
        imagesArray = self.distinct(imagesArray)
        # list of text fields, split on the ◎ marker
        txtArray = self.findMoiveTextInfo(movContent[0])
        #self.myParseMovieInfo(movContent[0])
        if len(imagesArray) > 0:
            try:
                item['bigImages'] = imagesArray[0]
                item['smallImages'] = self.arrayToString(imagesArray)
            except:
                item['bigImages'] = ''
                item['smallImages'] = ''
        else:
            item['bigImages'] = ''
            item['smallImages'] = ''

        item['downloadUrl'] = downloadUrl
        item['releaseTime'] = releaseTime
        # translated title
        item['txtYiming'] = self.getStringInfo(txtArray,'译  名')
        # original title
        item['txtPianming'] = self.getStringInfo(txtArray,'片  名')
        # year
        item['txtNianDai'] = self.getStringInfo(txtArray,'年  代')
        # country/region
        item['txtChanDi'] = self.getStringInfo(txtArray,'产  地')
        # genre
        item['txtLeiBie'] = self.getStringInfo(txtArray,'类  别')
        # language
        item['txtYuYan'] = self.getStringInfo(txtArray,'语  言')
        # subtitles
        item['txtZiMu'] = self.getStringInfo(txtArray,'字  幕')
        # theatrical release date
        item['txtShangYinRiQi'] = self.getStringInfo(txtArray,'上映日期')
        # IMDb rating
        item['txtImdb'] = self.getStringInfo(txtArray,'IMDb评分')
        # Douban rating
        item['txtDouBan'] = self.getStringInfo(txtArray,'豆瓣评分')
        # file format
        item['txtFormat'] = self.getStringInfo(txtArray,'文件格式')
        # video resolution
        item['txtChiCun'] = self.getStringInfo(txtArray,'视频尺寸')
        # file size
        item['txtSize'] = self.getStringInfo(txtArray,'文件大小')
        # runtime
        item['txtPianChang'] = self.getStringInfo(txtArray,'片  长')
        # director
        item['txtDaoYan'] = self.getStringInfo(txtArray,'导  演')
        # cast
        item['txtZhuYan'] = self.getStringInfo(txtArray,'主  演')
        # synopsis
        item['txtJianJie'] = self.getStringInfo(txtArray,'简  介')
        # cut the synopsis off at its last full stop, if there is one
        if item['txtJianJie'] != "":
            pos = item['txtJianJie'].rfind('。')
            if pos != -1:
                item['txtJianJie'] = item['txtJianJie'][0:pos]

        yield item

    # de-duplicate a list while keeping the original order
    def distinct(self,src):
        news_ids = []
        for id in src:
            if id not in news_ids:
                news_ids.append(id)
        return news_ids

    # find the entry that contains the given ◎ key and return its value
    def getStringInfo(self,array,key):
        for i in array:
            if key in i:
                i = i.replace(key,"")
                return i.strip()
        return ""

    # join every image url except the first (the poster) with commas
    def arrayToString(self,array):
        if len(array) > 1:
            return ",".join(array[1:])
        return ""

    # experimental parser, kept for reference but not called from parseDetail
    def myParseMovieInfo(self,movContent):
        try:
            downloadUrl = re.search(r'(<a[^>]*href="([^"]+)"[^>]*>)',movContent).group(0)
            downloadUrl = re.search(r'(ftp*>)',downloadUrl).group(0)
            for item in movContent.split('<br>'):
                item = item.strip()
                if item == "":
                    continue
                #logging.log(logging.WARNING, "&&&&:>>"+item)
        except Exception as e:
            logging.log(logging.WARNING, "parser movie content error")

    # split the ◎-delimited info block into a list of "key value" strings
    def findMoiveTextInfo(self,movContent):
        list = movContent.split('◎')
        arryas = []
        for i in list[1:]:
            arryas.append(i)
        return arryas

    # collect the src attribute of every <img>/<IMG> tag in the content
    def findImageSrc(self,movContent):
        try:
            replace_pattern = r'<[img|IMG].*?>'   # regex for the img tags
            img_url_pattern = r'.+?src="(\S+)"'   # regex for the src attribute
            img_url_list = []
            need_replace_list = re.findall(replace_pattern, movContent)   # all img tags
            for tag in need_replace_list:
                img_url_list.append(re.findall(img_url_pattern, tag)[0])  # the url of each tag
            return img_url_list
        except:
            logging.log(logging.WARNING, "parser movie image error")
            return []
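
To make the ◎ parsing easier to follow, here is a tiny standalone sketch of the same split-and-lookup logic that findMoiveTextInfo and getStringInfo implement; the sample string is invented for illustration and not copied from the site:

# -*- coding: utf-8 -*-
# standalone sketch of the ◎ parsing used in the spider; the sample text is made up
sample = "◎译  名 Example Movie ◎年  代 2016 ◎产  地 美国 ◎豆瓣评分 7.5/10"

def find_movie_text_info(content):
    # everything before the first ◎ is dropped, the rest becomes "key value" chunks
    return content.split('◎')[1:]

def get_string_info(parts, key):
    for part in parts:
        if key in part:
            return part.replace(key, "").strip()
    return ""

parts = find_movie_text_info(sample)
print(get_string_info(parts, '译  名'))    # Example Movie
print(get_string_info(parts, '豆瓣评分'))   # 7.5/10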

5. pipelines.py, the item pipeline
It writes the DyttItem built above into MySQL. Before inserting, it looks the movie up by full title and detail URL; if a matching row already exists the movie is a duplicate and is not inserted.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from dytt import settings
import logging

class DyttPipeline(object):

    def __init__(self):
        dbargs = dict(
            host='your database host',
            db='your database name',
            user='your database user',
            passwd='your database password',
            charset='utf8',
            cursorclass = MySQLdb.cursors.DictCursor,
            use_unicode= True,
        )
        self.dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)

    def process_item(self, item, spider):
        # hand the insert over to twisted's database thread pool
        res = self.dbpool.runInteraction(self.insert_into_table, item)
        return item

    def insert_into_table(self, conn, item):
        # first check whether this movie is already stored (same full title and detail url)
        try:
            conn.execute('select movie_id from tb_movie where full_title=%s and detail_url=%s',
                         (item['fullTitle'], item['detailUrl']))
            results = conn.fetchall()
            if len(results) > 0:  # already there, skip it
                logging.log(logging.WARNING, "movie already exists, skipping: " + str(item['fullTitle']))
                return
        except Exception:
            pass

        sql = '''insert into tb_movie (txt_yiming,txt_pianming,txt_nian_dai,txt_chan_di,txt_lei_bie
                ,txt_yu_yan,txt_zi_mu,txt_shang_yin_ri_qi,txt_imdb,txt_dou_ban,txt_format,txt_chi_cun,txt_size
                ,txt_pian_chang,txt_dao_yan,txt_zhu_yan,txt_jian_jie,small_images,big_images,release_time,download_url,full_title,detail_url,category_name)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        # the values are passed as query parameters, so quotes inside titles or the synopsis cannot break the statement
        values = (
            item['txtYiming'], item['txtPianming'], item['txtNianDai'], item['txtChanDi'],
            item['txtLeiBie'], item['txtYuYan'], item['txtZiMu'], item['txtShangYinRiQi'],
            item['txtImdb'], item['txtDouBan'], item['txtFormat'], item['txtChiCun'],
            item['txtSize'], item['txtPianChang'], item['txtDaoYan'], item['txtZhuYan'],
            item['txtJianJie'], item['smallImages'], item['bigImages'], item['releaseTime'],
            item['downloadUrl'], item['fullTitle'], item['detailUrl'], item['categoryName'],
        )
        try:
            conn.execute(sql, values)
        except Exception as e:
            logging.log(logging.WARNING, "insert failed: " + str(e))

6. settings.py

# follow the site's robots.txt rules
ROBOTSTXT_OBEY = True
# pretend to be a regular browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
# register the pipeline
ITEM_PIPELINES = {
    'dytt.pipelines.DyttPipeline': 300,
}
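
Not part of the original configuration, just a suggestion: a download delay keeps the request rate polite, and note that with ROBOTSTXT_OBEY = True Scrapy will skip any page the site's robots.txt disallows.

# optional: pause between requests to keep the crawl polite
DOWNLOAD_DELAY = 1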

7. Finally, run the crawler

# list all spiders in the project
scrapy list

# run the getmovie spider
scrapy crawl getmovie
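
Once the crawl has been running for a while, a tiny script like the one below can confirm what ended up in MySQL (a sketch that reuses the placeholder connection details from the pipeline; fill in your own credentials):

# quick check of how many movies were stored per category (placeholder credentials)
import MySQLdb

conn = MySQLdb.connect(host='your database host', db='your database name',
                       user='your database user', passwd='your database password',
                       charset='utf8')
cursor = conn.cursor()
cursor.execute('select count(*), category_name from tb_movie group by category_name')
for count, category in cursor.fetchall():
    print(category, count)
conn.close()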

As always: programmers (and anyone else) in Shenzhen who are into music and play drums, guitar, keyboard, bass and so on are welcome to get in touch and jam with our band. My QQ: 657455400
