Bootstrap

用大模型提取《cuda-c-programming-guide》的摘要并输出中文

想快速知道CUDA C++ Programming Guide 中的内容。打开网页,保存成mhtml内容,用以下脚本提取内容,调用qwen大模型生成摘要。当然,还可以对摘要再提一次摘要。

1.代码

import email
from bs4 import BeautifulSoup

def extract_content(element):
    """Recursively collect heading and paragraph text from a parsed HTML node.

    Headings (h1-h6) with non-empty text are stripped and prefixed with
    '####' so a later stage can split the document on that marker;
    paragraph text (and headings that are only whitespace) is kept
    verbatim. <section> children are descended into recursively; every
    other tag is ignored.
    """
    heading_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
    collected = []
    for node in element.children:
        if node.name == 'p' or node.name in heading_tags:
            text = node.get_text(strip=False)
            if node.name in heading_tags and text.strip():
                collected.append(f"####{text.strip()}")
            else:
                collected.append(text)
        elif node.name == 'section':
            collected.extend(extract_content(node))
    return collected
    
def extract_article_body_from_mhtml(file_path):
    """Extract the article text from a saved MHTML page.

    Parses the MHTML container with the stdlib email parser, takes the
    first text/html part, and pulls the <div itemprop="articleBody">
    content via extract_content.

    :param file_path: path to the .mhtml snapshot
    :return: extracted text joined with newlines, or None when no
             text/html part or no articleBody div is found
    """
    with open(file_path, 'rb') as file:
        msg = email.message_from_binary_file(file)

    # MHTML is a MIME multipart archive; the page markup is the first
    # text/html part (decode=True undoes the transfer encoding).
    html_content = None
    for part in msg.walk():
        if part.get_content_type() == 'text/html':
            html_content = part.get_payload(decode=True)
            break

    if html_content:
        soup = BeautifulSoup(html_content, 'lxml')
        article_body = soup.find('div', itemprop='articleBody')
        if article_body:
            return "\n".join(extract_content(article_body))
    return None

# Path to the saved MHTML snapshot of the guide's web page.
file_path = 'Guide.mhtml'
# None when the file has no text/html part or no articleBody div.
article_body_content = extract_article_body_from_mhtml(file_path)

def split_text_to_chunks(text, max_chunk_size=8192, delimiter='####'):
    """Split *text* into chunks of at most roughly max_chunk_size characters.

    The text is first cut on *delimiter* so a chunk never splits a section
    in half; each section keeps its leading delimiter. NOTE: a single
    section longer than max_chunk_size still becomes one oversized chunk
    of its own — sections are never cut internally.

    :param text: long text to split
    :param max_chunk_size: soft upper bound on each chunk's length
    :param delimiter: marker that starts every section
    :return: list of chunk strings, each stripped and ending with a newline
    """
    # Drop whitespace-only pieces, then restore the delimiter each
    # section was split on.
    sections = (delimiter + part for part in text.split(delimiter) if part.strip())

    chunks = []
    current = ""
    for section in sections:
        # Small safety margin (two delimiters + 2) kept from the original
        # sizing rule.
        fits = len(current) + len(section) + 2 * len(delimiter) + 2 <= max_chunk_size
        if fits:
            current += section
        else:
            # Flush the block built so far and start a new one.
            if current:
                chunks.append(current.strip() + "\n")
            current = section

    # Flush the trailing block, if any.
    if current:
        chunks.append(current.strip() + "\n")

    return chunks

def llm_summary(question):
    """Ask the qwen-max model for a Chinese summary of *question*.

    Sends a single-turn chat completion through DashScope and returns the
    assistant's reply.

    :param question: text to summarize
    :return: model output string, or "" on any non-OK response status
    """
    import os
    from http import HTTPStatus
    import dashscope
    from dashscope import Generation

    # Read the key from the environment instead of hard-coding a secret
    # in source (the original embedded a placeholder key literal).
    dashscope.api_key = os.environ.get("DASHSCOPE_API_KEY", "")

    system_prompt = "你是一个人工助手,帮助用户提取内容摘要,输出语言为中文"

    messages = [{'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': question}]

    response = Generation.call(model="qwen-max", messages=messages, result_format='message')
    if response.status_code == HTTPStatus.OK:
        # Original also appended the reply back onto `messages`, but that
        # local list was discarded — dead code removed.
        return response.output.choices[0]['message']['content']
    return ""

import time

# Fail fast with a clear message instead of an opaque TypeError inside
# split_text_to_chunks when extraction produced nothing.
if article_body_content is None:
    raise SystemExit("No article body extracted; check the .mhtml input file.")

chunks = split_text_to_chunks(article_body_content)
total = len(chunks)

# `with` guarantees the handle is closed; append mode lets an interrupted
# run resume without losing earlier output; explicit UTF-8 because the
# summaries are Chinese text.
with open("output.txt", "a+", encoding="utf-8") as out:
    for idx, chunk in enumerate(chunks):
        summary = llm_summary(chunk)
        print(f" --------- {idx}/{total} @{idx/total:.3f} --------- ")
        print(summary)
        # '####' marker kept so the output can be re-chunked and
        # re-summarized with the same pipeline.
        out.write(f"####{summary}")
        out.flush()
        time.sleep(5)  # throttle to stay under the API rate limit
