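# While writing a crawler, image downloads turned out to be slow, so this module uses multi-threaded (ranged) downloads to speed them up.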
import threading
import requests
class MulThreadDownload(threading.Thread):
    """Worker thread that downloads one inclusive byte range of a URL.

    On success the bytes returned for ``startpos``-``endpos`` are stored in
    ``temp_dict`` under the key ``startpos``. If every attempt fails, nothing
    is stored for this range.

    Args:
        url: Address of the resource to fetch.
        startpos: First byte offset of the range (inclusive).
        endpos: Last byte offset of the range (inclusive).
        temp_dict: Dict shared with the caller that collects the chunks.
        headers: Optional extra request headers; never modified by the worker.
        proxies: Optional ``requests``-style proxies mapping.
    """

    # Number of GET attempts made before the range is given up on.
    MAX_ATTEMPTS = 10

    def __init__(self, url, startpos, endpos, temp_dict, headers, proxies):
        super(MulThreadDownload, self).__init__()
        self.url = url
        self.startpos = startpos
        self.endpos = endpos
        self.temp_dict = temp_dict
        self.headers = headers
        self.proxies = proxies

    def _request_headers(self):
        """Return a fresh header dict carrying this worker's ``Range``.

        A copy is made because the caller's dict is typically shared by all
        workers; writing ``Range`` into it would let threads overwrite each
        other's range and would leak the key back to the caller.
        """
        request_headers = dict(self.headers) if self.headers else {}
        request_headers['Range'] = "bytes=%s-%s" % (self.startpos, self.endpos)
        return request_headers

    @staticmethod
    def _proxies_for_attempt(proxies, attempt):
        """Return the proxies mapping to use for the given attempt index.

        Attempts 0-1 use ``proxies`` unchanged. From attempt 2 on, the
        ``"https"`` proxy URL is re-registered under the ``"http"`` key,
        switching form at attempts 2, 3, 5 and 7 to raise the success rate:
        attempts 2, 5, 6 use the URL with its scheme rewritten to ``http:``;
        attempts 3, 4, 7, 8, 9 use the URL as is. Every variant is derived
        from the original mapping, which is never modified. When there is no
        ``"https"`` entry the mapping is returned unchanged.
        """
        https_proxy = proxies.get("https") if proxies else None
        if not https_proxy:
            return proxies
        if attempt in (2, 5, 6):
            if https_proxy.startswith("https:"):
                return {'http': 'http:{}'.format(https_proxy[len("https:"):])}
            return {'http': https_proxy}
        if attempt in (3, 4, 7, 8, 9):
            return {'http': https_proxy}
        return proxies

    def download(self, proxies):
        """Fetch the byte range, retrying up to ``MAX_ATTEMPTS`` times.

        Args:
            proxies: Proxies mapping used as the basis for every attempt
                (may be ``None``).
        """
        request_headers = self._request_headers()
        for attempt in range(self.MAX_ATTEMPTS):
            attempt_proxies = self._proxies_for_attempt(proxies, attempt)
            try:
                res = requests.get(self.url, headers=request_headers,
                                   proxies=attempt_proxies, timeout=3)
            except Exception as e:
                # Best-effort worker: report any failure and try again
                # instead of letting the thread die.
                print(f'{self.url} down load error {str(e)}')
                continue
            if res.content:
                self.temp_dict[self.startpos] = res.content
                break

    def run(self):
        """Thread entry point: download using the proxies given at creation."""
        self.download(self.proxies)
def _split_byte_ranges(filesize, parts):
    """Split ``0..filesize-1`` into consecutive inclusive ``(start, end)`` ranges.

    Each range spans ``filesize // parts`` bytes (at least 1, so the split
    always terminates, even for files smaller than ``parts`` bytes). A tail
    shorter than one step is merged into the last range.
    """
    step = max(1, filesize // max(1, parts))
    last = filesize - 1
    ranges = []
    start = 0
    while start <= last:
        end = start + step - 1
        if end > last or last - end < step:
            end = last
        ranges.append((start, end))
        start = end + 1
    return ranges


def download_img_multi_thread(url, headers, proxies, threadnum=5):
    """Download ``url`` in parallel byte ranges and return the full content.

    The size is read from the ``Content-Length`` header of a HEAD request;
    one ``MulThreadDownload`` thread is started per byte range and the
    chunks are concatenated in offset order once all threads have finished.

    Args:
        url: Address of the resource to download.
        headers: Optional request headers (the dict is not modified).
        proxies: Optional ``requests``-style proxies mapping.
        threadnum: Number of parts the file is divided into (default 5).

    Returns:
        The downloaded bytes, or ``None`` when the size is unknown or zero
        or when the assembled data does not match the reported size.
    """
    # Get the file size from a HEAD request.
    head_response = requests.head(url, headers=headers or None, proxies=proxies)
    try:
        filesize = int(head_response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        # Missing or malformed Content-Length: a ranged download is impossible.
        filesize = 0
    if filesize <= 0:
        return None

    # E.g. an 11-byte file covers offsets 0-10; the last range ends at 10.
    chunks = dict()
    workers = []
    for start, end in _split_byte_ranges(filesize, threadnum):
        # Each worker gets its own header dict so that no thread can clobber
        # another thread's Range header or the caller's dict.
        worker_headers = dict(headers) if headers else headers
        worker = MulThreadDownload(url, start, end, chunks, worker_headers, proxies)
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    # All threads are done: stitch the chunks together in offset order.
    data = b''.join(chunks[offset] for offset in sorted(chunks))
    if len(data) == filesize:  # verify the file size
        return data
    print(f'file download failed temp_b {len(data)} filesize {filesize}')
    return None