A detailed analysis will be added later; it's a static page, so there isn't much to analyze anyway.
import datetime
import re

import requests
import xlsxwriter
from bs4 import BeautifulSoup
from lxml import etree
def get_URLs(URL, page):
    # Earlier exploration of the site's menu, kept for reference:
    # URLs = []
    # html = get_html(start_url)
    # soup = BeautifulSoup(html, 'lxml')
    # urls = soup.find(id='menu')
    # pattern = re.compile(r'href="(.+)">')  # matching pattern
    # res = re.findall(pattern, str(urls))
    # host = 'http://www.ygdy8.net/'
    # for u in res:
    #     if 'http' not in u:
    #         u = host + u
    #     URLs.append(u)
    # URLs.insert(10, URLs[-3])
    # del URLs[1]
    # url = URLs[:10]
    # html = get_html(url[0])
    # soup = BeautifulSoup(html, 'lxml')
    # page = soup.find(class_='x')
    domain = 'http://www.ygdy8.net'
    start_url = URL
    url = start_url + page + '.html'
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    # Each movie entry on a list page carries the 'ulink' class.
    urls = soup.find_all(class_='ulink')
    pattern = re.compile(r'href="(.+?)">')  # non-greedy: capture each href separately
    res = re.findall(pattern, str(urls))
    # Drop navigation links whose href contains 'index'. The original code
    # popped items from res while iterating over it, which skips elements
    # (the author's note: "two entries dropped per page"); filtering with a
    # list comprehension avoids that bug.
    res = [u for u in res if 'index' not in u]
    # The captured hrefs are relative, so prepend the domain.
    urls = list(map(lambda u: domain + u, res))
    return urls
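For reference, here is a minimal sketch of how get_URLs might be called. The list path below (the site's "latest movies" index) and the 1-based page numbering are assumptions inferred from how url is assembled above, not something confirmed by this snippet; note that page must be passed as a string, since get_URLs concatenates URL + page + '.html'.

# Hypothetical usage sketch; list_url and the page range are assumptions.
if __name__ == '__main__':
    list_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_'  # assumed list path
    for page in range(1, 4):
        detail_urls = get_URLs(list_url, str(page))
        print('page', page, '->', len(detail_urls), 'detail pages')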
def get_html(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
            'ContentType':