爬取图片 - 悦读

原创文章，转载注明出处
不喜勿喷
图片爬取
爬取网址 http://desk.zol.com.cn/bizhi/7453_92417_2.html
运行环境：pycharm,python3.7.3

** 下面为代码区

from typing import List, Any

import requests
import re
import os
# Page the crawl starts from; the script at the bottom downloads the group that follows it.
page_url = 'http://desk.zol.com.cn/bizhi/7453_92417_2.html'

# os.mkdir() creates a single directory level; os.makedirs() creates nested levels.
# Editor tip: undo / redo is Ctrl + Z / Ctrl + Shift + Z.

def download_image(image_url, path, title):
    """Download one image and save it as ``<title>/<path>.jpg``.

    Args:
        image_url: URL of the image to fetch.
        path: File name without extension (despite its name, not a directory).
        title: Directory to save into; it is created when it does not exist.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    response = requests.get(image_url)
    folder = str(title)
    # exist_ok=True replaces the separate exists() check, which was racy and
    # forced the write logic to be duplicated in both branches.
    os.makedirs(folder, exist_ok=True)
    # The context manager closes the file even if the write raises.
    with open(folder + '/' + path + '.jpg', 'wb') as image_file:
        image_file.write(response.content)

def prev_next_url(page_url):
    """Find the previous and next picture-group pages linked from a page.

    Args:
        page_url: URL of a picture-group page on desk.zol.com.cn.

    Returns:
        A ``(prev_url, next_url)`` tuple of absolute URLs.

    Raises:
        IndexError: If the page lacks the expected ``URLprevGroup`` or
            ``URLnextGroup`` script entry.
        UnicodeDecodeError: If the response body is not valid GB2312.
    """
    response = requests.get(page_url)
    # Decode the raw bytes directly. Re-encoding ``response.text`` with
    # ``response.encoding`` fails when the server sends no charset (encoding
    # is None) and corrupts the body when the guessed charset is wrong.
    html = response.content.decode('gb2312')
    # Strip all whitespace once so both patterns match regardless of how the
    # inline script is laid out.
    compact = re.sub(r'\s', '', html)
    prev_group = re.findall(r'URLprevGroup:"(.*?)",//上一组图', compact)[0]
    next_group = re.findall(r'URLnextGroup:"(.*?)",//下一组图', compact)[0]
    # The captured values are prefixed with the site root to form full URLs.
    site_root = 'http://desk.zol.com.cn'
    return (site_root + prev_group, site_root + next_group)


def find_title_url(page_url):
    """Collect the title and image URLs of one picture group.

    Args:
        page_url: URL of a picture-group page on desk.zol.com.cn.

    Returns:
        A ``(title, image_urls, count)`` tuple, where ``count`` is
        ``len(image_urls)``.

    Raises:
        IndexError: If the page lacks the ``nowGroupName`` script entry or
            the ``id="showImg"`` thumbnail strip.
        UnicodeDecodeError: If the response body is not valid GB2312.
    """
    response = requests.get(page_url)
    # Decode the raw bytes directly. Re-encoding ``response.text`` with
    # ``response.encoding`` fails when the server sends no charset (encoding
    # is None) and corrupts the body when the guessed charset is wrong.
    html = response.content.decode('gb2312')
    # The title is read from the whitespace-stripped page so the layout of the
    # inline script does not matter.
    title = re.findall(r'nowGroupName:"(.*?)",//当前组图名', re.sub(r'\s', '', html))[0]
    # The element id is unique, so it anchors the thumbnail strip.
    strip_html = re.findall(r'id="showImg".*?<i><em>.+</em>/.+</i>', html, re.S)[0]
    strip_html = re.sub(r'\s+', ' ', strip_html)
    # Swap the thumbnail size token for the 1920x1080 one
    # (presumably selects the full-size rendition -- confirm against the site).
    strip_html = strip_html.replace('s144x90c5', 's1920x1080c5')
    # ``src[s]?`` accepts both ``src="..."`` and ``srcs="..."`` attributes.
    image_urls = re.findall(r'<img src[s]?="(.*?)" width=', strip_html)
    return (title, image_urls, len(image_urls))

# Download every picture of the group that follows ``page_url``.
next_group_url = prev_next_url(page_url)[1]
group_title, image_urls, image_count = find_title_url(next_group_url)
for position in range(1, image_count + 1):
    # Files are numbered from 1: picture1.jpg, picture2.jpg, ...
    download_image(image_urls[position - 1], 'picture' + str(position), group_title)
    print(group_title + ':第' + str(position) + '张下载完成')

prev_next_url(page_url) 函数：寻找上一组、下一组图片，以便继续爬取。
里面大量的 #print是方便调试。
def 中缩进消失了，粘贴注意 def 缩进