Bootstrap

python爬今日头条图片_使用python3+selenium爬取今日头条图片(爬坑)

# -*- coding: utf-8 -*-
"""Scrape image ``src`` URLs from a Toutiao photo gallery with Selenium.

For now only the ``src`` attribute of each image is collected; fetching the
images themselves is left for later (see ``srcGet``).

Usage:
    -f / --file   save the page to a local text file, then parse that file
    -p / --http   fetch each gallery page directly and parse it
    -h / --help   show the option summary
"""
from selenium import webdriver
import getopt
import sys
import time
from lxml import etree
import threading
from urllib import parse
import os


class srcGetParser():
    """Fetch a Toutiao gallery page and extract the image ``src`` from it."""

    def pagesget(self, url):
        """Return the page source of *url*, loaded in headless Firefox.

        Args:
            url: Address of the page to load.

        Returns:
            The page source reported by the driver after loading *url*.
        """
        firefoxOptions = webdriver.FirefoxOptions()
        # Run Firefox without opening a visible window.
        firefoxOptions.add_argument("-headless")
        driver = webdriver.Firefox(options=firefoxOptions)
        try:
            driver.get(url)
            return driver.page_source
        finally:
            # quit() ends the whole WebDriver session (browser and driver),
            # whereas close() only closes the current window. Doing it in
            # ``finally`` also covers the case where get() raises.
            driver.quit()

    def pageget(self):
        """Save the source of a fixed gallery page to ``seleniumgettext.txt``."""
        brower = webdriver.Firefox()
        try:
            brower.get("https://www.toutiao.com/a6736825928690696716/#p=3")
            # The ``with`` block closes the file; no explicit close is needed.
            with open("seleniumgettext.txt", "w", encoding="utf-8") as fp:
                fp.write(brower.page_source)
        finally:
            brower.quit()

    def htmlparser(self, opts, url=None):
        """Parse a page and return the matching image ``src`` values.

        Args:
            opts: Option pairs as returned by ``getopt.getopt``; only the
                first pair's option name is inspected to choose the source.
            url: Page to fetch when the option selects direct fetching.

        Returns:
            The list produced by the XPath query (possibly empty), or ``0``
            when the option is not recognised or the page cannot be parsed.
        """
        mode = opts[0][0]
        if mode in ("-f", "--file"):
            with open("seleniumgettext.txt", "r", encoding="utf-8") as fp:
                text = fp.read()
        # "http" is still accepted because the original check used that
        # spelling; "--http" is what getopt actually reports.
        elif mode in ("-p", "--http", "http"):
            text = self.pagesget(url)
        else:
            return 0
        try:
            html = etree.HTML(text)
            src = html.xpath("/html/body/div/div[2]/div[1]/div/div/div[1]/div/div/ul/li[3]/div/img/@src")
            return src
        except Exception:
            # Keep the established contract: 0 signals "could not parse".
            return 0


def srcGet(src_list):
    """Placeholder that currently does nothing.

    NOTE(review): presumably meant to download the images in *src_list*;
    confirm the intended behavior before implementing.
    """
    pass


def _collect_src(parser, opts, src_list, url=None):
    """Parse one page and append its first image src to *src_list*, if any."""
    src = parser.htmlparser(opts, url)
    if src == 0:
        print("网页解析错误")
    elif not src:
        # The XPath can legitimately match nothing; report it and move on.
        print("NoneError:src is []")
    else:
        src_list.append(src[0])


def main():
    """Parse the command line, collect the image URLs and print them."""
    try:
        # Long option names are given WITHOUT the leading "--".
        opts, _args = getopt.getopt(sys.argv[1:], "fph", ["file", "http", "help"])
    except getopt.GetoptError:
        # Treat an unknown option like missing options: show the hint below.
        opts = []
    if len(opts) == 0:
        print("请检查参数输入:输入-h或者--help获得参数信息")
        sys.exit()
    if opts[0][0] in ("-h", "--help"):
        print("-f or --file 将网页保存至本地再读取")
        print("-p or --http 将直接读取网页信息")
        sys.exit()
    print("网页解析开始!")
    time.sleep(1)
    print("---------loading---------")
    src_list = []
    parser = srcGetParser()
    if opts[0][0] in ("-p", "--http"):
        for page in range(1, 9):
            url = "https://www.toutiao.com/a6736825928690696716/#p=%s" % str(page)
            url = parse.unquote(url)
            _collect_src(parser, opts, src_list, url)
    elif opts[0][0] in ("-f", "--file"):
        # File mode: save the page locally first, then parse the saved copy.
        parser.pageget()
        _collect_src(parser, opts, src_list)
    print("to get the src is :\n")
    print(src_list)
    # Best-effort safety net: kill any leftover Firefox / geckodriver
    # processes. taskkill exists only on Windows. os.system does not raise
    # when the command fails, so no try/except is needed around it.
    if os.name == "nt":
        os.system("taskkill -f -im firefox.exe >> .\\systemlog.txt")
        os.system("taskkill -f -im geckodriver.exe")


if __name__ == "__main__":
    main()

;