Bootstrap

python爬取豆瓣电影

相信有许多人时时刻刻都在等着自己心仪的那部电影上映。其实,我们可以利用 Python 简单地帮你实现这个需求。

# @author:、Edgar
# @date: unknown 
# version:1.0.1
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import datetime
import time
"""
TODO: 这个程序访问网站过快,有时候网页就访问不了
但是如果sleep太久了时间就长了
"""


class DouBan:
    """Scraper for Douban movie listings: now-playing (per city) and upcoming."""

    def __init__(self):
        # Base page: now-playing movies in Shanghai; also the page we harvest
        # the full city-id list from.
        self.baseUrl = 'https://movie.douban.com/cinema/nowplaying/shanghai/'
        # Pretend to be a desktop Chrome browser so Douban serves the full page.
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                                     "like Gecko) Chrome/76.0.3809.100 Safari/537.36"}

    def get_html(self, url):
        """Fetch *url* and return the decoded HTML, or ``None`` on any error.

        Errors are printed instead of raised so that one unreachable city
        page does not abort the whole crawl. Callers must handle ``None``.
        """
        request = urllib.request.Request(url, headers=self.header)
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.HTTPError as e:
            print(e)
        except urllib.error.URLError as e:
            print(e)
        except Exception as e:  # last-resort guard: log and keep crawling
            print(e)
        else:
            with response:  # ensure the socket is released
                return response.read().decode("utf-8")
        return None

    def get_cities(self, url):
        """Return ``(hot_cities, all_cities)`` as lists of city ids (``uid``).

        The ids are used to build per-city now-playing URLs. Some ids are
        numbers rather than English names, but the resulting URLs still work.
        Returns two empty lists when the page cannot be fetched or parsed,
        so callers can always unpack the result.
        """
        html = self.get_html(url)
        if html is None:
            # Fetch failed; get_html already printed the reason.
            return [], []
        soup = BeautifulSoup(html, 'lxml')
        try:
            city_list = soup.find("div", {"id": "cities-list"}).find("div", {"class": "cities-list-bd"})
            # First inner <div> holds the "hot cities" shortcuts.
            hot_cities_list_en = [hot.a["uid"] for hot in city_list.find("div").findAll("span")]
            # All <span>s cover every city; the last one is a non-city widget.
            all_cities_en = [city.a["uid"] for city in city_list.findAll("span")[:-1]]
        except AttributeError as e:
            # Page layout changed or the cities block is missing.
            print(e)
            return [], []
        except Exception as e:
            print(e)
            return [], []
        return hot_cities_list_en, all_cities_en

    def spider_nowplaying(self, url):
        """Scrape the now-playing movies from the city page at *url*.

        Returns a formatted text report (header line with the city, one
        record per movie, and a trailing count). Raises on fetch failure
        so the caller can skip that city.

        NOTE(fix): the original fetched ``self.baseUrl`` for the movie list
        but *url* for the city label, so every city reported Shanghai's
        movies. Both now come from the same page.
        """
        html = self.get_html(url)
        if html is None:
            # Propagate as an exception: the __main__ loop catches it and
            # skips this city (preserving the original control flow).
            raise ConnectionError("failed to fetch " + url)
        soup = BeautifulSoup(html, 'lxml')
        nowplaying_soup = soup.find(id="nowplaying")
        # Page title looks like "正在上映的电影-<city>"; slice off the prefix.
        location = "上映地区:" + soup.find(id="hd").h1.get_text()[5:] + "\n"
        li_list = nowplaying_soup.find("ul", {"class": "lists"}).children
        li_num = 0
        data = '--' * 20 + "\n"
        for li in li_list:
            # Each real movie <li> carries data-* attributes; `.children`
            # also yields whitespace text nodes, which raise TypeError on
            # item access, and decorative tags, which raise KeyError.
            try:
                title = "| 影片名: {}\n".format(li["data-title"])
                director = "| 导演: {}\n".format(li["data-director"])
                actors = "| 主角: {}\n".format(li["data-actors"])
                score = "| 豆瓣评分: {}\n".format(li["data-score"])
                vote = "| 评分人数: {}\n".format(li["data-votecount"])
            except (KeyError, TypeError):
                continue
            data = data + title + director + actors + score + vote + "--" * 20 + "\n"
            li_num += 1
        data = location + data + "一共查找到 {} 部正在上映的电影\n\n".format(li_num)
        print(data)
        return data

    def spider_upcoming(self):
        """Scrape the list of upcoming movies.

        Returns a formatted text report, or ``None`` when the page cannot
        be fetched (errors are printed, matching get_html's policy).
        """
        url = 'https://movie.douban.com/coming'
        request = urllib.request.Request(url, headers=self.header)
        try:
            response = urllib.request.urlopen(request, timeout=2)
        except urllib.error.HTTPError as e:
            print(e)
        except urllib.error.URLError as e:
            print(e)
        except Exception as e:
            print(e)
        else:
            with response:
                soup = BeautifulSoup(response.read().decode("utf-8"), 'lxml')
            coming = soup.find(id='content').find("table", {"class": "coming_list"})
            # One <tr> per movie, columns: date, name, type, region, wants.
            tbody_list = coming.find("tbody").findAll("tr")
            parts = []  # collect rows, join once (avoids quadratic concat)
            year = datetime.date.today().year
            for tbody in tbody_list:
                data_list = tbody.findAll('td')
                date = data_list[0].get_text().strip()
                name = data_list[1].get_text().strip()
                film_type = data_list[2].get_text().strip()
                maker_location = data_list[3].get_text().strip()
                wants = data_list[4].get_text().strip()
                if '20' not in date:
                    # Dates within the current year omit it; prepend it.
                    date = str(year) + "年" + date
                date = "| 电影上映时间: {}\n".format(date)
                name = "| 影片名:{}\n".format(name)
                film_type = "| 电影类型: {}\n".format(film_type)
                maker_location = "| 制片国家/地区: {}\n".format(maker_location)
                wants = "| 想看: {}\n".format(wants)
                parts.append(date + name + film_type + maker_location + wants + "--" * 20 + "\n")
            parts.append("一共搜索到 {} 部电影即将上映".format(len(tbody_list)))
            data = ''.join(parts)
            print(data)
            return data
        return None

    def get_all_web(self):
        """Build per-city now-playing URLs.

        Returns ``(hot_city_urls, all_city_urls)``; both lists are empty
        when the city ids could not be scraped.
        """
        hot_cities_list_en, all_cities_en = self.get_cities(self.baseUrl)
        template = "https://movie.douban.com/cinema/nowplaying/{}/"
        hot_city_list = [template.format(city) for city in hot_cities_list_en]
        all_city_list = [template.format(city) for city in all_cities_en]
        return hot_city_list, all_city_list

    @staticmethod
    def write_file(data1='', data2=''):
        """Write both reports to ``douban-data.txt``, overwriting it.

        A single newline separates the now-playing and upcoming sections.
        """
        with open("douban-data.txt", "w", encoding='utf-8') as file:
            file.write(data1)
            file.write("\n")
            file.write(data2)


if __name__ == '__main__':
    spider = DouBan()
    hot_city_list, all_city_list = spider.get_all_web()
    reports = []  # per-city reports; joined once (avoids O(n^2) str concat)
    success = 0
    for city in all_city_list:
        try:
            report = spider.spider_nowplaying(city)
        except Exception:
            # One failing city (timeout, layout change) must not stop the crawl.
            continue
        if report:
            reports.append(report)
            success += 1
    data1 = ''.join(reports)
    # spider_upcoming returns None when the page cannot be fetched; fall
    # back to an empty section instead of crashing on the concatenation.
    data2 = '--' * 20 + '\n' + (spider.spider_upcoming() or '')
    spider.write_file(data1, data2)
    print("一共有 {} 个地区, 成功访问了 {} 个".format(len(all_city_list), success))


悦读

道可道,非常道;名可名,非常名。 无名,天地之始,有名,万物之母。 故常无欲,以观其妙,常有欲,以观其徼。 此两者,同出而异名,同谓之玄,玄之又玄,众妙之门。

;