1、过程
- 使用selenium打开网站(网页异步加载),获取m3u8地址,视频名称(网页标题)
- 获取m3u8文件,解析ts文件地址,ts文件加密密钥key,偏移量iv
- 使用多线程爬取ts文件,并重命名保存到本地临时文件夹
- 编写脚本调用openssl程序解密ts文件,ts文件合并,删除临时文件
2、具体实现
1、获取m3u8地址
使用selenium打开网站,使用XPATH解析m3u8地址,视频名称(文件命名处理)
def get_m3u8_url(url):
    """Open the video page in Edge and extract the HLS playlist URL and title.

    Args:
        url: Address of the video page.

    Returns:
        A ``(m3u8_url, video_name)`` tuple. ``video_name`` is the text of the
        page's title heading with characters that are not allowed in Windows
        file names replaced.

    Raises:
        IndexError: If the player script, the ``hlsUrl`` assignment or the
            title heading cannot be found on the page.
    """
    options = webdriver.EdgeOptions()
    options.add_experimental_option("detach", True)
    # Create the browser object
    driver = webdriver.Edge(options=options)
    try:
        driver.get(url)
        # The page is loaded asynchronously; wait before querying the DOM.
        time.sleep(5)
        # Locate the <script> element that assigns hlsUrl
        scripts = driver.find_elements(By.XPATH, '//*[@id="site-content"]/div/div/div[1]/section[1]/script[2]')
        if not scripts:
            raise IndexError('player script element not found')
        script_text = scripts[0].get_attribute("innerHTML")
        # Non-greedy so that a later "';" on the same line is not swallowed.
        matches = re.findall(r"hlsUrl = '(.*?)';", script_text)
        if not matches or not matches[0]:
            raise IndexError('hlsUrl not found in player script')
        headings = driver.find_elements(By.XPATH, '//*[@id="site-content"]/div/div/div[1]/section[2]/div[1]/div[1]/h4')
        if not headings:
            raise IndexError('title heading not found')
        title = headings[0].text
    finally:
        # Always release the browser, including when parsing fails.
        driver.quit()
    # Replace characters that are illegal in Windows file names (\ / ? : " * < > |)
    # and spaces, so the name can be used in the generated .bat files.
    # NOTE(review): the replacement table in the original source was garbled
    # (not valid Python); this mapping is a reconstruction - confirm the
    # substitute characters are the ones wanted.
    replacements = str.maketrans({
        '\\': '＼',
        '/': '／',
        '?': '？',
        '"': '＂',
        '*': '＊',
        '<': '＜',
        '>': '＞',
        '|': '︱',
        ':': '_',
        ' ': '_',
    })
    video_name = title.translate(replacements)
    return matches[0], video_name
2、获取ts列表,key,iv
获取m3u8文件
def m3u8(url):
    """Download the playlist at ``url`` and save it as ``m3u8.txt``.

    Args:
        url: Address of the m3u8 playlist.

    Raises:
        requests.exceptions.RequestException: If the request times out, fails,
            or the server answers with an HTTP error status.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
        'connection': 'close'
    }
    # Same timeout as the other downloads in this file; without one a stalled
    # connection would block forever.
    r = requests.get(url, headers=headers, timeout=15)
    # Do not store an HTTP error page as if it were the playlist.
    r.raise_for_status()
    with open('m3u8.txt', 'wb') as f:
        f.write(r.content)
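解析ts文件地址:download_url 为 m3u8 地址去掉最后一段(最后一个"/"之后的文件名)后得到的前缀,ts 文件地址形如 ****.ts,因此正则表达式为 '.*ts'。注意:该表达式同样会匹配到 m3u8 中存放 key 地址的那一行(key 文件也以 .ts 结尾),所以返回列表的第 0 项是 key 行,真正的 ts 分片从下标 1 开始。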
def prase_m3u8(download_url):
    """Build the list of download URLs from the saved ``m3u8.txt``.

    Every match of the pattern ``.*ts`` in the playlist (each match is the
    part of a line up to its last ``ts``) is appended to ``download_url``.
    The pattern is not limited to segment lines: a key line whose URI ends in
    ``ts`` is matched as well, and ``get_keyandiv`` reads the key file name
    from the first match.

    Args:
        download_url: Prefix that is put in front of every match.

    Returns:
        A list of ``download_url + match`` strings in playlist order.
    """
    with open('m3u8.txt', 'r', encoding='utf-8') as f:
        playlist = f.read()
    return [download_url + entry for entry in re.findall(r'.*ts', playlist)]
ts文件加密密钥key,偏移量iv获取
m3u8文件存储key的地址和iv的值
key地址中存储key文件,经过测试,以16进制打开key文件即为ts文件加密密钥
# Fetch the decryption key and the IV
def get_keyandiv(url, key, iv):
    """Return the AES key and IV for the playlist saved in ``m3u8.txt``.

    The IV is the text following the first ``0x`` in the playlist. The key
    file name is taken from the first line that matches ``.*ts`` (the part
    after its double quote); that file is downloaded, saved as ``key.ts``
    and its bytes are returned as a hex string.

    Args:
        url: Address of the m3u8 playlist; the key file is resolved relative
            to it.
        key: Ignored; kept so existing callers keep working.
        iv: Ignored; kept so existing callers keep working.

    Returns:
        A ``(key, iv)`` tuple of hex strings (no ``0x`` prefix).

    Raises:
        IndexError: If the playlist contains no ``0x`` value or no key line.
        requests.exceptions.RequestException: If the key download fails.
    """
    from urllib.parse import urljoin

    # Read the IV and the key file name from the playlist
    with open('m3u8.txt', 'r', encoding='utf-8') as f:
        playlist = f.read()
    iv = re.findall(r'0x.*', playlist)[0].split('0x')[1]
    key_line = re.findall(r'.*ts', playlist)[0]
    # urljoin gives the same result as "playlist directory + name" for a
    # relative name and also copes with an absolute key URI.
    key_url = urljoin(url, key_line.rsplit('"')[1])

    # Download key.ts
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    }
    try:
        r = requests.get(key_url, headers=headers, timeout=15)
        r.raise_for_status()
    except requests.exceptions.Timeout:
        print('-----------------TimeoutError----------------key.ts下载失败')
        # Re-raise: continuing would read a missing or stale key.ts.
        raise
    except requests.exceptions.RequestException:
        print('---------------------OtherError----------------------key.ts下载失败')
        raise
    with open('key.ts', 'wb') as f:
        f.write(r.content)
    print('key.ts已下载.')

    # The raw bytes of the key file, written in hex, are the openssl -K value.
    key = r.content.hex()
    return key, iv
3、多线程爬取ts文件
下载函数
@retry(stop_max_attempt_number=3, wait_fixed=20000)
def downloads(j, url):
    """Download one ts segment and store it as ``temp\\<j>.ts``.

    Failures are reported and then re-raised so that the ``@retry``
    decorator sees them and can run the download again.

    Args:
        j: Number used for the output file name.
        url: Address of the ts segment.

    Raises:
        requests.exceptions.RequestException: If the request times out, fails
            or returns an HTTP error status.
        OSError: If the segment cannot be written to disk.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}
    try:
        r = requests.get(url, headers=headers, timeout=15)
        # Do not save an HTTP error page as segment data.
        r.raise_for_status()
    except requests.exceptions.Timeout:
        # str(j): j is an int, concatenating it directly raised TypeError.
        print('-----------------TimeoutError----------------' + '\n' + str(j) + '下载失败')
        raise
    except requests.exceptions.RequestException:
        print('---------------------OtherError----------------------' + '\n' + str(j) + '下载失败')
        raise
    with open('OpenSSL-Win64\\bin\\temp\\' + str(j) + '.ts', 'wb') as f:
        f.write(r.content)
    print(url)
    print(str(j) + '已下载.')
多线程实现函数
使用锁确保同一个 ts 地址不会被多个线程重复领取
class myThread(threading.Thread):
    """Worker thread that downloads ts segments from the shared URL list.

    The worker operates on module-level state: ``list`` (the URLs), ``l``
    (its length), ``x`` (cursor into the list), ``j`` (number of the segment
    being handed out), ``counter`` (initialised to ``l`` by the script) and
    ``threadLock`` (guards all of them).
    """

    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        """Claim URLs one at a time under the lock and download them."""
        global counter
        global j
        global x
        last_index = l - 1
        while True:
            with threadLock:
                # Checked while holding the lock, so two threads can never
                # both claim the final entry and run past the end of the list.
                if x >= last_index:
                    break
                str_url = list_geturl(list, l)
                j = j + 1
                # Keep a private copy: another thread may change j as soon
                # as the lock is released.
                index = j
            try:
                downloads(index, str_url)
            except Exception as exc:
                # Keep the worker alive for the remaining segments.
                print(f'{index}.ts 下载失败: {exc!r}')
            finally:
                # Count the segment as handled only once the download attempt
                # is over, so counter == 1 means no download is still running.
                with threadLock:
                    counter = counter - 1
def list_geturl(list, l):
    """Advance the shared cursor ``x`` and return the entry it points at.

    The cursor is advanced before reading, so the entry at the cursor's
    starting position is never returned. Once the cursor is on the last
    entry it stays there and that entry is returned again.

    Args:
        list: Sequence of URLs.
        l: Length of ``list``.

    Returns:
        The entry at the (possibly advanced) cursor position.
    """
    global x
    # "x < l" allowed x to become l, which is one past the last valid index.
    if x < l - 1:
        x = x + 1
    return list[x]
def start_download(thread_count=5):
    """Create the download worker threads and start them.

    The threads are not joined here; the function returns as soon as they
    have been started.

    Args:
        thread_count: Number of worker threads to start (default 5).
    """
    # Create the threads: IDs 1..thread_count, named "Thread-<id>"
    workers = [myThread(n, f"Thread-{n}", n) for n in range(1, thread_count + 1)]
    # Start them only after all have been created
    for worker in workers:
        worker.start()
4、解密ts文件,文件合并
编写ts文件解密bat脚本
# Generate the .bat script that decrypts the ts segments
def create_deAES_bat(key, iv):
    """Write ``de_aes.bat`` with one openssl decrypt command per segment.

    For every ``i`` in ``1 .. l-1`` (``l`` is the module-level entry count)
    the script decrypts ``temp/<i>.ts`` into ``video/<i-1>.ts``, where the
    output number is zero-padded to at least four digits.

    Args:
        key: AES key as a hex string (openssl ``-K``).
        iv: Initialisation vector as a hex string (openssl ``-iv``).
    """
    with open("OpenSSL-Win64\\bin\\video\\de_aes.bat", "w", encoding='utf-8') as f:
        f.write('cd OpenSSL-Win64/bin\n')
        for i in range(1, l):
            # Zero padding keeps the outputs in order for "copy /b *.ts".
            out_name = f'{i - 1:04d}'
            f.write(f'openssl aes-128-cbc -d -in temp/{i}.ts -out video/{out_name}.ts -K {key} -iv {iv}\n')
编写ts文件合并bat脚本
# Generate the .bat script that merges the ts segments
def create_bat(video_name):
    """Write ``compose.bat``, which joins all ts files into ``<video_name>.mp4``.

    The script changes into the video directory, switches the console to
    code page 65001, concatenates ``*.ts`` into the mp4 and moves the result
    three directory levels up.

    Args:
        video_name: Output file name without extension.
    """
    with open("OpenSSL-Win64\\bin\\video\\compose.bat", "w", encoding='utf-8') as f:
        f.write('cd OpenSSL-Win64/bin/video\n')
        f.write('chcp 65001\n')
        # Quoted so that names containing spaces or & are treated as one argument.
        f.write(f'copy /b *.ts "{video_name}.mp4"\n')
        f.write(f'move "{video_name}.mp4" ../../../\n')
# Wait for the download threads to finish, then run the .bat scripts
def thread_break():
    """Wait until ``counter`` drops to 1, then decrypt, merge and clean up.

    Runs ``de_aes.bat`` and ``compose.bat`` and afterwards deletes the
    ``temp`` and ``video`` working directories.

    NOTE(review): this assumes ``counter`` only reaches 1 once every download
    has finished - confirm against the worker thread implementation.
    """
    # "<= 1" instead of "== 1" so a counter that skips past 1 cannot hang
    # the loop; sleeping avoids spinning a CPU core while waiting.
    while counter > 1:
        time.sleep(0.5)
    print("---------------线程已执行完毕---------------")
    subprocess.call(['OpenSSL-Win64\\bin\\video\\de_aes.bat'])
    subprocess.call(['OpenSSL-Win64\\bin\\video\\compose.bat'])
    shutil.rmtree('OpenSSL-Win64\\bin\\temp')  # remove the directory
    shutil.rmtree('OpenSSL-Win64\\bin\\video')  # remove the directory
    print('成功')
整体代码
import os
import shutil
import subprocess
import requests
import re
from Crypto.Cipher import AES
import threading
import time
from retrying import retry
import binascii
from selenium import webdriver
from selenium.webdriver.common.by import By
# The attribute name was misspelled ("DEDAULT_RETRIES"), which only created an
# unused attribute.
# NOTE(review): confirm that assigning this module constant has the intended
# effect; the documented way to get retries is to mount an
# HTTPAdapter(max_retries=...) on a requests.Session.
requests.adapters.DEFAULT_RETRIES = 5
# Request headers for the target site. The functions visible in this file
# build their own local ``headers`` dicts and do not reference this one.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    'referer': 'https://jable.tv/'
}
# Reset the working directories: remove them if present, then create them
def init_dir_path():
    """Recreate the ``temp`` and ``video`` directories under ``OpenSSL-Win64\\bin``.

    An existing directory is deleted together with its contents before an
    empty one is created in its place.
    """
    for folder in ('OpenSSL-Win64\\bin\\temp', 'OpenSSL-Win64\\bin\\video'):
        if os.path.exists(folder):
            shutil.rmtree(folder)  # remove the directory
        if not os.path.exists(folder):
            os.mkdir(folder)
def get_m3u8_url(url):
    """Open the video page in Edge and extract the HLS playlist URL and title.

    Args:
        url: Address of the video page.

    Returns:
        A ``(m3u8_url, video_name)`` tuple. ``video_name`` is the text of the
        page's title heading with characters that are not allowed in Windows
        file names replaced.

    Raises:
        IndexError: If the player script, the ``hlsUrl`` assignment or the
            title heading cannot be found on the page.
    """
    options = webdriver.EdgeOptions()
    options.add_experimental_option("detach", True)
    # Create the browser object
    driver = webdriver.Edge(options=options)
    try:
        driver.get(url)
        # The page is loaded asynchronously; wait before querying the DOM.
        time.sleep(5)
        # Locate the <script> element that assigns hlsUrl
        scripts = driver.find_elements(By.XPATH, '//*[@id="site-content"]/div/div/div[1]/section[1]/script[2]')
        if not scripts:
            raise IndexError('player script element not found')
        script_text = scripts[0].get_attribute("innerHTML")
        # Non-greedy so that a later "';" on the same line is not swallowed.
        matches = re.findall(r"hlsUrl = '(.*?)';", script_text)
        if not matches or not matches[0]:
            raise IndexError('hlsUrl not found in player script')
        headings = driver.find_elements(By.XPATH, '//*[@id="site-content"]/div/div/div[1]/section[2]/div[1]/div[1]/h4')
        if not headings:
            raise IndexError('title heading not found')
        title = headings[0].text
    finally:
        # Always release the browser, including when parsing fails.
        driver.quit()
    # Replace characters that are illegal in Windows file names (\ / ? : " * < > |)
    # and spaces, so the name can be used in the generated .bat files.
    # NOTE(review): the replacement table in the original source was garbled
    # (not valid Python); this mapping is a reconstruction - confirm the
    # substitute characters are the ones wanted.
    replacements = str.maketrans({
        '\\': '＼',
        '/': '／',
        '?': '？',
        '"': '＂',
        '*': '＊',
        '<': '＜',
        '>': '＞',
        '|': '︱',
        ':': '_',
        ' ': '_',
    })
    video_name = title.translate(replacements)
    return matches[0], video_name
def m3u8(url):
    """Download the playlist at ``url`` and save it as ``m3u8.txt``.

    Args:
        url: Address of the m3u8 playlist.

    Raises:
        requests.exceptions.RequestException: If the request times out, fails,
            or the server answers with an HTTP error status.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
        'connection': 'close'
    }
    # Same timeout as the other downloads in this file; without one a stalled
    # connection would block forever.
    r = requests.get(url, headers=headers, timeout=15)
    # Do not store an HTTP error page as if it were the playlist.
    r.raise_for_status()
    with open('m3u8.txt', 'wb') as f:
        f.write(r.content)
def prase_m3u8(download_url):
    """Build the list of download URLs from the saved ``m3u8.txt``.

    Every match of the pattern ``.*ts`` in the playlist (each match is the
    part of a line up to its last ``ts``) is appended to ``download_url``.
    The pattern is not limited to segment lines: a key line whose URI ends in
    ``ts`` is matched as well, and ``get_keyandiv`` reads the key file name
    from the first match.

    Args:
        download_url: Prefix that is put in front of every match.

    Returns:
        A list of ``download_url + match`` strings in playlist order.
    """
    with open('m3u8.txt', 'r', encoding='utf-8') as f:
        playlist = f.read()
    return [download_url + entry for entry in re.findall(r'.*ts', playlist)]
@retry(stop_max_attempt_number=3, wait_fixed=20000)
def downloads(j, url):
    """Download one ts segment and store it as ``temp\\<j>.ts``.

    Failures are reported and then re-raised so that the ``@retry``
    decorator sees them and can run the download again.

    Args:
        j: Number used for the output file name.
        url: Address of the ts segment.

    Raises:
        requests.exceptions.RequestException: If the request times out, fails
            or returns an HTTP error status.
        OSError: If the segment cannot be written to disk.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}
    try:
        r = requests.get(url, headers=headers, timeout=15)
        # Do not save an HTTP error page as segment data.
        r.raise_for_status()
    except requests.exceptions.Timeout:
        # str(j): j is an int, concatenating it directly raised TypeError.
        print('-----------------TimeoutError----------------' + '\n' + str(j) + '下载失败')
        raise
    except requests.exceptions.RequestException:
        print('---------------------OtherError----------------------' + '\n' + str(j) + '下载失败')
        raise
    with open('OpenSSL-Win64\\bin\\temp\\' + str(j) + '.ts', 'wb') as f:
        f.write(r.content)
    print(url)
    print(str(j) + '已下载.')
class myThread(threading.Thread):
    """Worker thread that downloads ts segments from the shared URL list.

    The worker operates on module-level state: ``list`` (the URLs), ``l``
    (its length), ``x`` (cursor into the list), ``j`` (number of the segment
    being handed out), ``counter`` (initialised to ``l`` by the script) and
    ``threadLock`` (guards all of them).
    """

    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        """Claim URLs one at a time under the lock and download them."""
        global counter
        global j
        global x
        last_index = l - 1
        while True:
            with threadLock:
                # Checked while holding the lock, so two threads can never
                # both claim the final entry and run past the end of the list.
                if x >= last_index:
                    break
                str_url = list_geturl(list, l)
                j = j + 1
                # Keep a private copy: another thread may change j as soon
                # as the lock is released.
                index = j
            try:
                downloads(index, str_url)
            except Exception as exc:
                # Keep the worker alive for the remaining segments.
                print(f'{index}.ts 下载失败: {exc!r}')
            finally:
                # Count the segment as handled only once the download attempt
                # is over, so counter == 1 means no download is still running.
                with threadLock:
                    counter = counter - 1
def list_geturl(list, l):
    """Advance the shared cursor ``x`` and return the entry it points at.

    The cursor is advanced before reading, so the entry at the cursor's
    starting position is never returned. Once the cursor is on the last
    entry it stays there and that entry is returned again.

    Args:
        list: Sequence of URLs.
        l: Length of ``list``.

    Returns:
        The entry at the (possibly advanced) cursor position.
    """
    global x
    # "x < l" allowed x to become l, which is one past the last valid index.
    if x < l - 1:
        x = x + 1
    return list[x]
def start_download(thread_count=5):
    """Create the download worker threads and start them.

    The threads are not joined here; the function returns as soon as they
    have been started.

    Args:
        thread_count: Number of worker threads to start (default 5).
    """
    # Create the threads: IDs 1..thread_count, named "Thread-<id>"
    workers = [myThread(n, f"Thread-{n}", n) for n in range(1, thread_count + 1)]
    # Start them only after all have been created
    for worker in workers:
        worker.start()
# Fetch the decryption key and the IV
def get_keyandiv(url, key, iv):
    """Return the AES key and IV for the playlist saved in ``m3u8.txt``.

    The IV is the text following the first ``0x`` in the playlist. The key
    file name is taken from the first line that matches ``.*ts`` (the part
    after its double quote); that file is downloaded, saved as ``key.ts``
    and its bytes are returned as a hex string.

    Args:
        url: Address of the m3u8 playlist; the key file is resolved relative
            to it.
        key: Ignored; kept so existing callers keep working.
        iv: Ignored; kept so existing callers keep working.

    Returns:
        A ``(key, iv)`` tuple of hex strings (no ``0x`` prefix).

    Raises:
        IndexError: If the playlist contains no ``0x`` value or no key line.
        requests.exceptions.RequestException: If the key download fails.
    """
    from urllib.parse import urljoin

    # Read the IV and the key file name from the playlist
    with open('m3u8.txt', 'r', encoding='utf-8') as f:
        playlist = f.read()
    iv = re.findall(r'0x.*', playlist)[0].split('0x')[1]
    key_line = re.findall(r'.*ts', playlist)[0]
    # urljoin gives the same result as "playlist directory + name" for a
    # relative name and also copes with an absolute key URI.
    key_url = urljoin(url, key_line.rsplit('"')[1])

    # Download key.ts
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    }
    try:
        r = requests.get(key_url, headers=headers, timeout=15)
        r.raise_for_status()
    except requests.exceptions.Timeout:
        print('-----------------TimeoutError----------------key.ts下载失败')
        # Re-raise: continuing would read a missing or stale key.ts.
        raise
    except requests.exceptions.RequestException:
        print('---------------------OtherError----------------------key.ts下载失败')
        raise
    with open('key.ts', 'wb') as f:
        f.write(r.content)
    print('key.ts已下载.')

    # The raw bytes of the key file, written in hex, are the openssl -K value.
    key = r.content.hex()
    return key, iv
# Generate the .bat script that merges the ts segments
def create_bat(video_name):
    """Write ``compose.bat``, which joins all ts files into ``<video_name>.mp4``.

    The script changes into the video directory, switches the console to
    code page 65001, concatenates ``*.ts`` into the mp4 and moves the result
    three directory levels up.

    Args:
        video_name: Output file name without extension.
    """
    with open("OpenSSL-Win64\\bin\\video\\compose.bat", "w", encoding='utf-8') as f:
        f.write('cd OpenSSL-Win64/bin/video\n')
        f.write('chcp 65001\n')
        # Quoted so that names containing spaces or & are treated as one argument.
        f.write(f'copy /b *.ts "{video_name}.mp4"\n')
        f.write(f'move "{video_name}.mp4" ../../../\n')
# Generate the .bat script that decrypts the ts segments
def create_deAES_bat(key, iv):
    """Write ``de_aes.bat`` with one openssl decrypt command per segment.

    For every ``i`` in ``1 .. l-1`` (``l`` is the module-level entry count)
    the script decrypts ``temp/<i>.ts`` into ``video/<i-1>.ts``, where the
    output number is zero-padded to at least four digits.

    Args:
        key: AES key as a hex string (openssl ``-K``).
        iv: Initialisation vector as a hex string (openssl ``-iv``).
    """
    with open("OpenSSL-Win64\\bin\\video\\de_aes.bat", "w", encoding='utf-8') as f:
        f.write('cd OpenSSL-Win64/bin\n')
        for i in range(1, l):
            # Zero padding keeps the outputs in order for "copy /b *.ts".
            out_name = f'{i - 1:04d}'
            f.write(f'openssl aes-128-cbc -d -in temp/{i}.ts -out video/{out_name}.ts -K {key} -iv {iv}\n')
# Wait for the download threads to finish, then run the .bat scripts
def thread_break():
    """Wait until ``counter`` drops to 1, then decrypt, merge and clean up.

    Runs ``de_aes.bat`` and ``compose.bat`` and afterwards deletes the
    ``temp`` and ``video`` working directories.

    NOTE(review): this assumes ``counter`` only reaches 1 once every download
    has finished - confirm against the worker thread implementation.
    """
    # "<= 1" instead of "== 1" so a counter that skips past 1 cannot hang
    # the loop; sleeping avoids spinning a CPU core while waiting.
    while counter > 1:
        time.sleep(0.5)
    print("---------------线程已执行完毕---------------")
    subprocess.call(['OpenSSL-Win64\\bin\\video\\de_aes.bat'])
    subprocess.call(['OpenSSL-Win64\\bin\\video\\compose.bat'])
    shutil.rmtree('OpenSSL-Win64\\bin\\temp')  # remove the directory
    shutil.rmtree('OpenSSL-Win64\\bin\\video')  # remove the directory
    print('成功')
if __name__ == '__main__':
    # Prepare empty temp/video working directories
    init_dir_path()
    url = "https://xxxxxx"
    m3u8_url, video_name = get_m3u8_url(url)
    print(m3u8_url)
    # Save the playlist to m3u8.txt
    m3u8(m3u8_url)
    # Playlist URL without its last path segment; prefix for every entry
    m3u8_url_pra = m3u8_url.rsplit('/', 1)[0] + '/'
    # Parse the saved playlist into the shared URL list
    list = prase_m3u8(m3u8_url_pra)
    # Module-level state shared with the worker threads
    j = 0  # number of ts files handed out so far
    list1 = []  # files not downloaded yet
    x = 0  # cursor into list
    l = len(list)
    counter = l
    threadLock = threading.Lock()
    threads = []
    # Start the downloads
    start_download()
    key = ''
    iv = ''
    key, iv = get_keyandiv(m3u8_url, key, iv)
    print(video_name)
    create_bat(video_name)
    create_deAES_bat(key, iv)
    thread_break()
    print('------------------------------------------------------------------------------')
对于此网站,只需在 main 中把 url 设置为视频页地址,即可爬取视频并保存在当前目录。运行前需要将 openssl 文件(OpenSSL-Win64 目录)复制到该目录下。如果 ts 文件未加密,可以不执行解密 bat 文件,只需相应修改合并 bat 文件即可。