用大模型提取《cuda-c-programming-guide》的摘要并输出中文
想快速了解 CUDA C++ Programming Guide 中的内容。先在浏览器中打开网页并保存为 mhtml 文件,再用以下脚本提取正文内容,并调用 qwen 大模型生成摘要。当然,还可以对生成的摘要再提取一次摘要,得到更精炼的版本。
1.代码
import email
from bs4 import BeautifulSoup
def extract_content(element):
    """Recursively collect heading and paragraph text from *element*.

    Headings (h1-h6) with non-blank text are returned prefixed with
    '####' (used later as the chunk delimiter); paragraphs and blank
    headings are kept verbatim. Nested <section> elements are walked
    recursively.

    :param element: a BeautifulSoup element to traverse
    :return: list of extracted text strings, in document order
    """
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    collected = []
    for node in element.children:
        tag = node.name
        if tag == 'section':
            # Sections nest arbitrarily; flatten them in place.
            collected.extend(extract_content(node))
        elif tag in heading_tags + ('p',):
            text = node.get_text(strip=False)
            if tag in heading_tags and text.strip():
                collected.append(f"####{text.strip()}")
            else:
                collected.append(f"{text}")
    return collected
def extract_article_body_from_mhtml(file_path):
    """Parse an MHTML snapshot and return the article body text.

    Walks the MIME parts for the first ``text/html`` payload, then
    extracts the ``<div itemprop="articleBody">`` content via
    ``extract_content()``.

    :param file_path: path to the .mhtml file
    :return: newline-joined extracted text, or ``None`` when no HTML
             part or no articleBody div is present
    """
    with open(file_path, 'rb') as file:
        msg = email.message_from_binary_file(file)
    html_content = None
    for part in msg.walk():
        if part.get_content_type() == 'text/html':
            # decode=True undoes the transfer encoding (quoted-printable/base64).
            html_content = part.get_payload(decode=True)
            break
    if html_content:
        soup = BeautifulSoup(html_content, 'lxml')
        article_body = soup.find('div', itemprop='articleBody')
        if article_body:
            return "\n".join(extract_content(article_body))
    return None
# Path to the saved MHTML snapshot of the CUDA C++ Programming Guide.
file_path = 'Guide.mhtml'
article_body_content = extract_article_body_from_mhtml(file_path)
def split_text_to_chunks(text, max_chunk_size=8192, delimiter='####'):
    """Split *text* into chunks no larger than *max_chunk_size*.

    The text is divided at *delimiter* boundaries so that no section is
    cut in half; every section keeps its leading delimiter. Blank
    sections are dropped. Each returned chunk is stripped and ends with
    a newline.

    :param text: the long text to split
    :param max_chunk_size: soft upper bound on the size of each chunk
    :param delimiter: marker that starts each section
    :return: list of chunk strings
    """
    # Re-attach the delimiter to every non-blank section.
    pieces = [delimiter + seg for seg in text.split(delimiter) if seg.strip()]
    # Fixed allowance reserved on top of the raw text length.
    overhead = 2 * len(delimiter) + 2
    chunks = []
    buffer = ""
    for piece in pieces:
        if len(buffer) + len(piece) + overhead <= max_chunk_size:
            buffer += piece
            continue
        # Adding this section would overflow the chunk: flush and restart.
        if buffer:
            chunks.append(buffer.strip() + "\n")
        buffer = piece
    if buffer:
        chunks.append(buffer.strip() + "\n")
    return chunks
def llm_summary(question):
    """Ask the qwen-max model for a Chinese summary of *question*.

    :param question: the text to summarize
    :return: the model's reply content, or "" on a non-OK API status
    """
    import os
    from http import HTTPStatus
    import dashscope
    from dashscope import Generation

    # NOTE(review): never hard-code real API keys in source; read from
    # the environment, keeping the original placeholder as a fallback.
    dashscope.api_key = os.environ.get("DASHSCOPE_API_KEY", "sk-你的")
    system_prompt = "你是一个人工助手,帮助用户提取内容摘要,输出语言为中文"
    messages = [{'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': question}]
    response = Generation.call(model="qwen-max", messages=messages, result_format='message')
    if response.status_code == HTTPStatus.OK:
        # The original also appended the reply to `messages`, but the
        # history was never reused — dead code, removed.
        return response.output.choices[0]['message']['content']
    return ""
import time

# Summarize each chunk with the LLM and append the results to output.txt.
# `with` guarantees the file is closed even if an API call raises
# (the original leaked the handle); explicit UTF-8 keeps the Chinese
# output portable across platforms.
chunks = split_text_to_chunks(article_body_content)
total = len(chunks)
with open("output.txt", "a+", encoding="utf-8") as f:
    for idx, chunk in enumerate(chunks):
        summary = llm_summary(chunk)
        print(f" --------- {idx}/{total} @{idx/total:.3f} --------- ")
        print(summary)
        f.write(f"####{summary}")
        f.flush()  # persist each summary immediately in case of a crash
        time.sleep(5)  # crude rate limit between API calls