# 爬取今日头条新闻,并导入Excel中(主要用了selenium)
from selenium import webdriver;
from bs4 import BeautifulSoup
import time
import xlwt
import os
# Work from the D: drive root so the relative output path used later
# resolves there.
os.chdir('D://')

# Rows collected from the page; filled in by the parsing loop further down.
all_list = []

# Explicit location of the chromedriver binary; this line must not be omitted.
path = "D://TSBrowser//chromedriver.exe"
browser = webdriver.Chrome(executable_path=path)
try:
    browser.get("https://www.toutiao.com/ch/news_hot/")

    # Scroll down 50 times in 500-pixel steps, pausing 1.5 s after each step
    # (presumably so the feed has time to load more items -- TODO confirm).
    # An alternative is to jump straight to the bottom by setting
    # document.documentElement.scrollTop to a very large value.
    js = 'window.scrollBy(0,500)'
    for _ in range(50):
        browser.execute_script(js)
        time.sleep(1.5)

    # Snapshot the rendered HTML; nothing below needs the live browser.
    text1 = str(browser.page_source)
finally:
    # Always shut down the browser and its chromedriver process, even when
    # loading or scrolling fails, so no orphaned process is left behind.
    browser.quit()
# Parse the captured page source and collect one row per article card.
# (An earlier regex-based extraction was replaced by the BeautifulSoup
# parsing below.)
dict1 = {}
soup = BeautifulSoup(text1, 'html.parser')
list1 = soup.select('li>div')
for item in list1:
    try:
        essay = item.find("a", class_="link").string
        media = item.find("a", ga_event="article_name_click").string
        # The comment anchor supplies both the comment text and the href
        # used to build the article link, so look it up only once.
        comment_anchor = item.find("a", ga_event="article_comment_click")
        comment = comment_anchor.string
        # Drop the first 7 characters of the href and prepend the article
        # URL prefix (presumably the href starts with "/group/" -- verify).
        link = "https://www.toutiao.com/a" + comment_anchor.get("href")[7:]
    except (AttributeError, TypeError):
        # find() returned None (AttributeError) or the anchor has no href
        # (TypeError): this <div> is not a complete article card, skip it.
        continue

    # Keep the most recently parsed record available under its old name.
    dict1 = {"essay": essay, "media": media, "comment": comment, "link": link}
    all_list.append([essay, media, comment, link])
    print(essay, media, comment, link, sep="-->")
# Export the collected rows to an .xls workbook with a single sheet.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('sheet1')

# Header row (row 0): title, media, comment count, link.
titles = ["标题", "媒体", "评论数", "链接"]
for column, heading in enumerate(titles):
    worksheet.write(0, column, heading)

# Data rows start at row 1, directly below the header.
for row, record in enumerate(all_list, start=1):
    for column, value in enumerate(record):
        worksheet.write(row, column, value)

workbook.save("今日头条.xls")