Bootstrap

(RAG系列)FastGPT批量添加索引

(RAG系列)FastGPT批量添加索引

引言

索引制作:

  • 通过模型对分块内容进行概括

  • 根据文本内容划分特点,例如,文档有明显的大小标题,把标题作为索引

版本

fastgpt v4.8.10

使用说明

根据知识库文档的分块内容提炼对应的索引,制作索引文件(xlsx 文件)。索引文件需包含名为 index 的列,文件名取知识库文件名中第一个“.”之前的部分再加 .xlsx。注意:索引必须与分块一一对应(第 N 行对应第 N 个分块);不需要添加索引的分块,将其索引设置为 no。另外,该操作会直接覆盖已存在的索引,因此只适用于尚未添加索引的场景。

脚本代码

import requests
import json
import pandas as pd
import os

def _index_xlsx_name(collection_name: str) -> str:
    """Return the index workbook file name expected for a collection.

    Everything from the first '.' onward is replaced by '.xlsx'. A name
    without any '.' simply gets '.xlsx' appended (the previous slicing logic
    replaced every occurrence of the name's last character in that case).
    """
    return collection_name.partition('.')[0] + '.xlsx'


def _build_indexes(cell) -> list:
    """Convert one cell of the 'index' column into the update payload's index list.

    The cell holds one index text per line. The literal value 'no' means the
    chunk gets no custom index. Blank lines are dropped and duplicates are
    removed while keeping the order in which they were written.

    Raises:
        TypeError: if the cell is not a string (for example an empty
            spreadsheet cell, which pandas yields as NaN).
    """
    if not isinstance(cell, str):
        raise TypeError(f'index cell must be a string, got {cell!r}')
    if cell == 'no':
        return []
    unique_lines = dict.fromkeys(cell.split('\n'))
    return [{'text': line} for line in unique_lines if line != '']


def _update_collection_chunks(headers: dict, get_chunk_url: str, update_index_url: str,
                              subset: dict, index_cells: list, xlsx_name: str) -> None:
    """Page through every chunk of one collection and push its indexes.

    A chunk whose index row cannot be resolved (row missing from the workbook,
    non-string cell, missing field in the chunk record) is reported and
    skipped; no update request is sent for it.
    """
    num = 0
    chunk_pagenum = 1
    while True:
        chunk_query = {
            "pageNum": chunk_pagenum,
            "pageSize": 24,
            "collectionId": subset['_id'],
            "searchText": ""
        }
        chunk_response = requests.post(url=get_chunk_url, headers=headers, json=chunk_query).json()
        chunks = chunk_response['data']['data']
        # An empty page means the collection has been fully traversed.
        if not chunks:
            break
        for chunk in chunks:
            try:
                print("num" + str(num))
                print("chunk" + str(chunk['chunkIndex']))
                # Flag chunks whose server-side position differs from the running counter.
                if chunk['chunkIndex'] != num:
                    print("----------------" + str(chunk['chunkIndex']) + "------------------------")
                update_data = {
                    "dataId": chunk['_id'],
                    "q": chunk['q'],
                    "a": chunk['a'],
                    "indexes": _build_indexes(index_cells[chunk['chunkIndex']])
                }
            except (KeyError, IndexError, TypeError):
                print("********************" + xlsx_name + "有报错***************************")
                # Skip the request: sending it here would reuse the payload
                # built for the previous chunk (or fail on the first chunk).
                num += 1
                continue

            update_response = requests.post(url=update_index_url, headers=headers, json=update_data).json()
            if update_response['code'] != 200:
                print(update_response)
                print(f'集合名称:{subset["name"]}\n集合ID:{subset["_id"]}\nchunkID:{chunk["_id"]}\nchunk页码:{chunk_pagenum}')
            num += 1
        chunk_pagenum += 1


def batch_add_index(headers: dict, datasetid: str, get_collection_url: str, get_chunk_url: str, update_index_url: str,
                     parentid=None, index_path=None):
    """Overwrite the indexes of dataset chunks with values read from xlsx workbooks.

    Walks the collections of a dataset page by page, recursing into folders.
    For each file collection that has a matching workbook in ``index_path``,
    the workbook's ``index`` column is read and row ``chunkIndex`` is sent as
    the new index list of the corresponding chunk.

    Args:
        headers: HTTP headers sent with every request.
        datasetid: ID of the dataset to process.
        get_collection_url: Endpoint that lists collections.
        get_chunk_url: Endpoint that lists the chunks of a collection.
        update_index_url: Endpoint that updates one chunk.
        parentid: ID of the folder to list; passed through as ``parentId``.
        index_path: Directory that contains only ``.xlsx`` index workbooks.

    Returns:
        None. Problems are reported with ``print``. An invalid ``index_path``
        or a workbook without an ``index`` column stops the current call.
    """
    invalid_path_msg = 'index_path必须是文件夹路径,并且文件夹中需要是.xlsx后缀的文件'

    if index_path is None:
        # Without workbooks there is nothing to write; previously this case
        # crashed on the first chunk with an undefined variable.
        print('index_path未设置,没有可用的索引文件,未做任何修改')
        return
    try:
        index_filenames = set(os.listdir(index_path))
        only_xlsx = all(filename.endswith('.xlsx') for filename in index_filenames)
    except (OSError, TypeError, ValueError):
        print(invalid_path_msg)
        return
    if not only_xlsx:
        print(invalid_path_msg)
        return

    # Loop over every page of the collection listing.
    collection_pagenum = 1
    while True:
        collection_query = {
            "pageNum": collection_pagenum,
            "pageSize": 20,
            "datasetId": datasetid,
            "parentId": parentid,
            "searchText": ""
        }
        collection_response = requests.post(url=get_collection_url, headers=headers, json=collection_query).json()
        collections = collection_response['data']['data']
        # An empty page means this folder/dataset has been fully traversed.
        if not collections:
            break

        for subset in collections:
            # Skip the collection named "手动录入" (manual entry).
            if subset['name'] == '手动录入':
                continue
            if subset['type'] == 'folder':
                # Recurse into the folder.
                batch_add_index(headers, datasetid, get_collection_url, get_chunk_url, update_index_url,
                                parentid=subset['_id'], index_path=index_path)
                continue
            # Link collections are not processed.
            if subset['type'] == 'link':
                continue

            # Anything else is treated as a file; it needs a matching workbook.
            xlsx_name = _index_xlsx_name(subset['name'])
            if xlsx_name not in index_filenames:
                continue
            print(xlsx_name + " start")

            df_index = pd.read_excel(os.path.join(index_path, xlsx_name))
            if 'index' not in df_index.columns:
                print('xlsx文件中第一列第一行第一个单元格应是单词index')
                return
            index_cells = df_index['index'].to_list()

            _update_collection_chunks(headers, get_chunk_url, update_index_url,
                                      subset, index_cells, xlsx_name)
            print(xlsx_name + " over")
        collection_pagenum += 1


if __name__ == '__main__':
    # Script entry point: fill in the placeholder values below, then run.
    batch_add_index(
        # Account -> API key: put the key into the Authorization header.
        headers={
            'Authorization': 'Bearer ',
            'Content-Type': 'application/json',
        },
        # Dataset ID: open the dataset and read it from the browser URL.
        datasetid='',
        get_collection_url='http://xxxx:3000/api/core/dataset/collection/list',
        get_chunk_url='http://xxxx:3000/api/core/dataset/data/list',
        update_index_url='http://xxxx:3000/api/core/dataset/data/update',
        # Folder ID (comment this line out if there is no folder): open the
        # dataset and read it from the browser URL.
        parentid='',
        # Directory holding the index workbooks (each with an "index" column).
        index_path=r'D:\mnt\data\111',
    )

;