Many of us are constantly waiting to hear when that one movie we're looking forward to will come out. As it turns out, a little Python can do the checking for you: the script below scrapes Douban for what is now playing in each city and what is coming soon.
# @author: Edgar
# @date: unknown
# version: 1.0.1
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import datetime
import time
"""
TODO: 这个程序访问网站过快,有时候网页就访问不了
但是如果sleep太久了时间就长了
"""
class DouBan:
    def __init__(self):
        self.baseUrl = 'https://movie.douban.com/cinema/nowplaying/shanghai/'
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                                     "like Gecko) Chrome/76.0.3809.100 Safari/537.36"}

    def get_html(self, url):
        """
        Fetch a page's source; returns None if the request fails
        """
        request = urllib.request.Request(url, headers=self.header)
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.HTTPError as e:
            print(e)
        except urllib.error.URLError as e:
            print(e)
        except Exception as e:
            print(e)
        else:
            return response.read().decode("utf-8")

    def get_cities(self, url):
        """
        Get the name of every city, used later to build the per-city URLs.
        A few cities yield a numeric id rather than an English name,
        but the resulting URL still opens fine.
        """
        html = self.get_html(url)
        try:
            soup = BeautifulSoup(html, 'lxml')
            city_list = soup.find("div", {"id": "cities-list"}).find("div", {"class": "cities-list-bd"})
        except AttributeError as e:
            print(e)
        except Exception as e:
            print(e)
        else:
            hot_list = city_list.find("div").findAll("span")
            hot_cities_list_en = []  # used to build the URLs
            # collect the hot cities
            for hot in hot_list:
                # hot.a.get_text() would give the Chinese name, which cannot be
                # spliced into a URL but works as a display label; the "uid"
                # attribute is the URL-safe form
                hot_cities_list_en.append(hot.a["uid"])
            all_city_list = city_list.findAll("span")[:-1]
            all_cities_en = []  # used to build the URLs
            # collect every city
            for city in all_city_list:
                all_cities_en.append(city.a["uid"])
            return hot_cities_list_en, all_cities_en

    def spider_nowplaying(self, url):
        """
        Scrape the movies currently playing on one city's page
        """
        html = self.get_html(url)
        if html is None:
            # the fetch failed (often from requesting too fast); retry once
            time.sleep(1)
            html = self.get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        nowplaying_soup = soup.find(id="nowplaying")
        # the page's h1 starts with a fixed prefix; slice it off to keep the city name
        location = "Region: " + soup.find(id="hd").h1.get_text()[5:] + "\n"
        li_list = nowplaying_soup.find("ul", {"class": "lists"}).children
        li_num = 0
        data = '--' * 20 + "\n"
        for li in li_list:
            # These fields are just what this page exposes directly; each entry
            # also carries a link that could be followed to scrape the movie's
            # own detail page.
            try:
                title = "| Title: {}\n".format(li["data-title"])
                director = "| Director: {}\n".format(li["data-director"])
                actors = "| Cast: {}\n".format(li["data-actors"])
                score = "| Douban score: {}\n".format(li["data-score"])
                vote = "| Ratings: {}\n".format(li["data-votecount"])
                data = data + title + director + actors + score + vote + "--" * 20 + "\n"
                li_num += 1
            except (KeyError, TypeError):
                # .children also yields bare text nodes with no attributes; skip them
                pass
        data = location + data + "Found {} movies now playing in total\n\n".format(li_num)
        print(data)
        return data

    def spider_upcoming(self):
        """
        Scrape information about upcoming movies
        """
        url = 'https://movie.douban.com/coming'
        request = urllib.request.Request(url, headers=self.header)
        try:
            response = urllib.request.urlopen(request, timeout=2)
        except urllib.error.HTTPError as e:
            print(e)
        except urllib.error.URLError as e:
            print(e)
        except Exception as e:
            print(e)
        else:
            soup = BeautifulSoup(response.read().decode("utf-8"), 'lxml')
            coming = soup.find(id='content').find("table", {"class": "coming_list"})
            # walk the table row by row, one movie per row
            tbody_list = coming.find("tbody").findAll("tr")
            data = ''
            year = datetime.date.today().year
            for tbody in tbody_list:
                data_list = tbody.findAll('td')
                date = data_list[0].get_text().strip()
                name = data_list[1].get_text().strip()
                film_type = data_list[2].get_text().strip()
                maker_location = data_list[3].get_text().strip()
                wants = data_list[4].get_text().strip()
                # dates in the current year come without one (e.g. "11月08日"),
                # so prepend it; "年" matches the page's Chinese date format
                if '20' not in date:
                    date = str(year) + "年" + date
                date = "| Release date: {}\n".format(date)
                name = "| Title: {}\n".format(name)
                film_type = "| Genre: {}\n".format(film_type)
                maker_location = "| Country/region: {}\n".format(maker_location)
                wants = "| Want-to-see count: {}\n".format(wants)
                data = data + date + name + film_type + maker_location + wants + "--" * 20 + "\n"
            data = data + "Found {} upcoming movies in total".format(len(tbody_list))
            print(data)
            return data

    def get_all_web(self):
        """
        Build the full nowplaying URL for every city
        """
        hot_cities_list_en, all_cities_en = self.get_cities(self.baseUrl)
        hot_city_list = []
        for hot_city in hot_cities_list_en:
            hot_city_list.append("https://movie.douban.com/cinema/nowplaying/{}/".format(hot_city))
        all_city_list = []
        for city in all_cities_en:
            all_city_list.append("https://movie.douban.com/cinema/nowplaying/{}/".format(city))
        return hot_city_list, all_city_list

    @staticmethod
    def write_file(data1='', data2=''):
        with open("douban-data.txt", "w", encoding='utf-8') as file:
            file.write(data1)
            file.write("\n")
            file.write(data2)

if __name__ == '__main__':
    test = DouBan()
    hot_city_list, all_city_list = test.get_all_web()
    data1 = ''
    success = 0
    for city in all_city_list:
        try:
            data1 = data1 + test.spider_nowplaying(city)
        except Exception:
            pass
        else:
            success += 1
        time.sleep(1)  # pause between cities so Douban doesn't start rejecting us
    data2 = test.spider_upcoming()
    data2 = '--' * 20 + '\n' + data2
    test.write_file(data1, data2)
    print("Out of {} regions in total, {} were fetched successfully".format(len(all_city_list), success))