1. Set up the local model
The stack has three components:
1. ChatGLM3-6B code: https://github.com/THUDM/ChatGLM3
2. FastGPT: https://github.com/labring/FastGPT
3. Vector (embedding) model m3e, deployable via Docker: https://huggingface.co/moka-ai/m3e-base
First install Anaconda3; it can be downloaded from the Tsinghua Open Source Mirror (https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/).
Create an environment named chatglm3-demo:
conda create -n chatglm3-demo python=3.10
conda activate chatglm3-demo
ChatGLM3-6B requires Python 3.10 or later.
Change into the demo directory to test the model:
cd /ChatGLM3-main/composite_demo
Install requirements.txt first; torch can be removed from the file and installed separately:
pip install -r requirements.txt
Code Interpreter also needs a Jupyter kernel. Create a system environment variable named IPYKERNEL with the value chatglm3-demo, then register the kernel:
ipython kernel install --name chatglm3-demo --user
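On Windows, the system variable can also be set from a command prompt, for example (creating it through the System Properties dialog works equally well):
setx IPYKERNEL chatglm3-demo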
Install matching versions of PyTorch and torchvision:
torch-2.0.0+cu117-cp310-cp310-win_amd64.whl
torchvision-0.15.0+cu117-cp310-cp310-win_amd64.whl
Install from the directory where the wheel files were downloaded:
pip install torch-2.0.0+cu117-cp310-cp310-win_amd64.whl
pip install torchvision-0.15.0+cu117-cp310-cp310-win_amd64.whl
Matching versions can be found at https://download.pytorch.org/whl/
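Alternatively, assuming the CUDA 11.7 builds above are the ones you need, pip can fetch matching wheels directly from that index:
pip install torch==2.0.0 torchvision==0.15.0 --index-url https://download.pytorch.org/whl/cu117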
Verify that PyTorch can use the GPU:
import torch
import torchvision
from transformers import __version__ as transformers_version

print("PyTorch version:", torch.__version__)
print("Transformers version:", transformers_version)
print("TorchVision version:", torchvision.__version__)

# Check whether a GPU is available
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU TRUE")
else:
    print("GPU FALSE")

# Checks for other libraries can be added here
Open composite_demo/client.py and change the model path, roughly as sketched below.
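In the version of client.py used here, the model location is typically read from an environment variable with a fallback default; point the default at your local weights directory (the D:\ path below is only a placeholder example):
import os

MODEL_PATH = os.environ.get('MODEL_PATH', r'D:\models\chatglm3-6b')  # example local path
TOKENIZER_PATH = os.environ.get('TOKENIZER_PATH', MODEL_PATH)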
Run the command to test, installing any modules reported as missing:
streamlit run main.py
If VRAM is below 12 GB and responses are too slow, switch to a quantized model (see the sketch after this paragraph) and conversation becomes usable.
Once you can chat normally, the code and the environment are both working.
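Quantizing here means loading the weights through ChatGLM3's built-in quantize() before moving them to the GPU, the same pattern the API server code below uses. A minimal sketch of an int4 load:
from transformers import AutoModel

MODEL_PATH = "THUDM/chatglm3-6b"  # or your local weights directory

# int4 quantization lets the 6B model run in well under 10 GB of VRAM, at some cost in quality
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).half().quantize(4).cuda()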
Switch to D:\...\ChatGLM3-main\openai_api_demo.
Modify openai_api.py to point at the chatglm3 model location. If you have other models, keep them all in one directory.
Test with Postman using this request body:
{
  "model": "string",
  "messages": [
    {
      "role": "user",
      "content": "你好",
      "name": "string",
      "function_call": {
        "name": "string",
        "arguments": "string"
      }
    }
  ],
  "temperature": 0.8,
  "top_p": 0.8,
  "max_tokens": 0,
  "stream": false,
  "functions": {},
  "repetition_penalty": 1.1
}
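If you prefer the command line to Postman, the same request can be sent with curl (assuming the server's default port 8000):
curl http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "chatglm3-6b", "messages": [{"role": "user", "content": "你好"}], "temperature": 0.8, "stream": false}'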
Once that succeeds, run:
python openai_api.py
The following can be used as a drop-in replacement for openai_api.py:
# coding=utf-8
# Implements API for ChatGLM3-6B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python openai_api.py
# Visit http://localhost:8000/docs for documents.
# In the OpenAI API, max_tokens is equivalent to HuggingFace's max_new_tokens, not max_length.
# For example, setting max_tokens = 8192 for the 6B model will raise an error, because after
# subtracting the history and the prompt, the model cannot output that many tokens.

import os
import time
import json
from contextlib import asynccontextmanager
from typing import List, Literal, Optional, Union

import torch
from torch.cuda import get_device_properties
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, AutoModel

from utils import process_response, generate_chatglm3, generate_stream_chatglm3

MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM_chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
    yield
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []


class FunctionCallResponse(BaseModel):
    name: Optional[str] = None
    arguments: Optional[str] = None


class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system", "function"]
    content: str = None
    name: Optional[str] = None
    function_call: Optional[FunctionCallResponse] = None


class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None
    function_call: Optional[FunctionCallResponse] = None


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.8
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
    functions: Optional[Union[dict, List[dict]]] = None
    # Additional parameters
    repetition_penalty: Optional[float] = 1.1


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length", "function_call"]


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length", "function_call"]]


class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0


class ChatCompletionResponse(BaseModel):
    model: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    usage: Optional[UsageInfo] = None


@app.get("/v1/models", response_model=ModelList)
async def list_models():
    model_card = ModelCard(id="chatglm3-6b")
    return ModelList(data=[model_card])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    global model, tokenizer

    if len(request.messages) < 1 or request.messages[-1].role == "assistant":
        raise HTTPException(status_code=400, detail="Invalid request")

    gen_params = dict(
        messages=request.messages,
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens or 1024,
        echo=False,
        stream=request.stream,
        repetition_penalty=request.repetition_penalty,
        functions=request.functions,
    )

    logger.debug(f"==== request ====\n{gen_params}")

    if request.stream:
        # Use the stream mode to read the first few characters; if it is not a function call,
        # stream the output directly.
        predict_stream_generator = predict_stream(request.model, gen_params)
        output = next(predict_stream_generator)
        if not contains_custom_function(output):
            return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")

        # Obtain the full result in one shot and determine whether tools need to be called.
        logger.debug(f"First result output:\n{output}")

        function_call = None
        if output and request.functions:
            try:
                function_call = process_response(output, use_tool=True)
            except Exception:
                logger.warning("Failed to parse tool call")

        # Call the function
        if isinstance(function_call, dict):
            function_call = FunctionCallResponse(**function_call)

            """
            In this demo, we did not register any tools.
            You can use the tools that have been implemented in our `tool_using` and implement your own streaming tool implementation here.
            Similar to the following method:
                function_args = json.loads(function_call.arguments)
                tool_response = dispatch_tool(tool_name: str, tool_params: dict)
            """
            tool_response = ""

            if not gen_params.get("messages"):
                gen_params["messages"] = []

            gen_params["messages"].append(ChatMessage(
                role="assistant",
                content=output,
            ))
            gen_params["messages"].append(ChatMessage(
                role="function",
                name=function_call.name,
                content=tool_response,
            ))

            # Streaming output of results after function calls
            generate = predict(request.model, gen_params)
            return EventSourceResponse(generate, media_type="text/event-stream")
        else:
            # Handled to avoid exceptions in the above parsing function process.
            generate = parse_output_text(request.model, output)
            return EventSourceResponse(generate, media_type="text/event-stream")

    # Here is the handling of stream = False
    response = generate_chatglm3(model, tokenizer, gen_params)

    # Remove the first newline character
    if response["text"].startswith("\n"):
        response["text"] = response["text"][1:]
    response["text"] = response["text"].strip()

    usage = UsageInfo()
    function_call, finish_reason = None, "stop"
    if request.functions:
        try:
            function_call = process_response(response["text"], use_tool=True)
        except Exception:
            logger.warning("Failed to parse tool call, maybe the response is not a tool call or has been answered.")

    if isinstance(function_call, dict):
        finish_reason = "function_call"
        function_call = FunctionCallResponse(**function_call)

    message = ChatMessage(
        role="assistant",
        content=response["text"],
        function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
    )

    logger.debug(f"==== message ====\n{message}")

    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=message,
        finish_reason=finish_reason,
    )

    task_usage = UsageInfo.model_validate(response["usage"])
    for usage_key, usage_value in task_usage.model_dump().items():
        setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)

    return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion", usage=usage)


async def predict(model_id: str, params: dict):
    global model, tokenizer

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    previous_text = ""
    for new_response in generate_stream_chatglm3(model, tokenizer, params):
        decoded_unicode = new_response["text"]
        delta_text = decoded_unicode[len(previous_text):]
        previous_text = decoded_unicode

        finish_reason = new_response["finish_reason"]
        if len(delta_text) == 0 and finish_reason != "function_call":
            continue

        function_call = None
        if finish_reason == "function_call":
            try:
                function_call = process_response(decoded_unicode, use_tool=True)
            except Exception:
                logger.warning(
                    "Failed to parse tool call, maybe the response is not a tool call or has been answered.")

        if isinstance(function_call, dict):
            function_call = FunctionCallResponse(**function_call)

        delta = DeltaMessage(
            content=delta_text,
            role="assistant",
            function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
        )

        choice_data = ChatCompletionResponseStreamChoice(
            index=0,
            delta=delta,
            finish_reason=finish_reason
        )
        chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
        yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
    yield '[DONE]'


def predict_stream(model_id, gen_params):
    """
    The function call is compatible with stream mode output.
    The first seven characters are determined.
    If not a function call, the stream output is directly generated.
    Otherwise, the complete character content of the function call is returned.
    :param model_id:
    :param gen_params:
    :return:
    """
    output = ""
    is_function_call = False
    has_send_first_chunk = False
    for new_response in generate_stream_chatglm3(model, tokenizer, gen_params):
        decoded_unicode = new_response["text"]
        delta_text = decoded_unicode[len(output):]
        output = decoded_unicode

        # When it is not a function call and the character length is > 7,
        # try to judge whether it is a function call according to the special function prefix
        if not is_function_call and len(output) > 7:
            # Determine whether a function is called
            is_function_call = contains_custom_function(output)
            if is_function_call:
                continue

            # Non-function call, direct stream output
            finish_reason = new_response["finish_reason"]

            # Send an empty string first to avoid truncation by subsequent next() operations.
            if not has_send_first_chunk:
                message = DeltaMessage(
                    content="",
                    role="assistant",
                    function_call=None,
                )
                choice_data = ChatCompletionResponseStreamChoice(
                    index=0,
                    delta=message,
                    finish_reason=finish_reason
                )
                chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
                yield "{}".format(chunk.model_dump_json(exclude_unset=True))

            send_msg = delta_text if has_send_first_chunk else output
            has_send_first_chunk = True
            message = DeltaMessage(
                content=send_msg,
                role="assistant",
                function_call=None,
            )
            choice_data = ChatCompletionResponseStreamChoice(
                index=0,
                delta=message,
                finish_reason=finish_reason
            )
            chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
            yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    if is_function_call:
        yield output
    else:
        yield '[DONE]'


async def parse_output_text(model_id: str, value: str):
    """
    Directly output the text content of value
    :param model_id:
    :param value:
    :return:
    """
    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant", content=value),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
    yield '[DONE]'


def contains_custom_function(value: str) -> bool:
    """
    Determine whether 'function_call' according to a special function prefix.
    For example, the functions defined in "tool_using/tool_register.py" are all "get_xxx" and start with "get_"
    [Note] This is not a rigorous judgment method, only for reference.
    :param value:
    :return:
    """
    return value and 'get_' in value


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
    if torch.cuda.is_available():
        total_vram_in_gb = get_device_properties(0).total_memory / 1073741824
        print(f'\033[32mVRAM size: {total_vram_in_gb:.2f} GB\033[0m')
        with torch.cuda.device(f'cuda:{0}'):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        # Pick a precision that fits the available VRAM
        if total_vram_in_gb > 13:
            model = model.half().cuda()
            print('\033[32mRunning on GPU with fp16 precision\033[0m')
        elif total_vram_in_gb > 10:
            model = model.half().quantize(8).cuda()
            print('\033[32mRunning on GPU with int8 quantization\033[0m')
        elif total_vram_in_gb > 4.5:
            model = model.half().quantize(4).cuda()
            print('\033[32mRunning on GPU with int4 quantization\033[0m')
        else:
            model = model.float()
            print('\033[32mRunning on CPU\033[0m')
    else:
        model = model.float()
        print('\033[32mRunning on CPU\033[0m')
    model = model.eval()

    # bilibili@十字鱼 https://space.bilibili.com/893892 (thanks to 秋葉aaaki and 大江户战士 for reference)
    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
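Once the server is running, it can also be exercised with the official openai Python client (v1.x) pointed at the local endpoint. A minimal sketch, assuming the default port; the api_key value is arbitrary because this demo server does not check it:
from openai import OpenAI

# Point the client at the local ChatGLM3 server instead of api.openai.com
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="chatglm3-6b",
    messages=[{"role": "user", "content": "你好"}],
    temperature=0.8,
)
print(resp.choices[0].message.content)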
2. Deploy One-API
One-API serves as the gateway node for routing calls to the various models. Its technical documentation recommends Docker deployment; Ubuntu 20.04 works well. On Windows, enable virtualization: this setup uses VirtualBox with VT-x/AMD-V turned on, which also requires enabling virtualization in the BIOS (on some boards it sits under the security settings). Add network port-forwarding rules for 3000, 13000, and so on, adding more rules as needed.
Open Ubuntu 20.04; the initial software update may take some time. Install Code and Terminator for the steps that follow. First sort out permissions: the checks on docker and the docker daemon are permission-sensitive. Add your username to the docker group (log out and back in for the change to take effect), and use administrator privileges where needed:
sudo usermod -aG docker <username>
Open Code, create a new terminal, and pull the one-api image, mapped to port 13000:
docker run --name one-api -d --restart always -p 13000:3000 -e TZ=Asia/Shanghai -v /home/ubuntu/data/one-api:/data justsong/one-api
Open localhost:13000 and log in as root (default password 123456).
Add a channel for chatglm3 with Base URL http://localhost:8000.
Then add a channel for m3e with Base URL http://localhost:6200.
Create a new token and submit it. Copy the first item under the arrow and paste it into a text file; it looks like this: https://chat.oneapi.pro/#/?settings={"key":"sk-fAAfFClsyVXxvAgp57Ab758260124a958aF00a2d49CcB625","url":"http://localhost:3000"}. The channel and token can be checked end to end with the curl call below.
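A sketch of that check against One-API's OpenAI-compatible endpoint; substitute your own key for the placeholder:
curl http://localhost:13000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer sk-your-oneapi-token' \
  -d '{"model": "chatglm3", "messages": [{"role": "user", "content": "你好"}]}'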
Deploy the m3e model with Docker; by default it runs on CPU:
docker run -d -p 6200:6008 --name=m3e-large-api registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/m3e-large-api:latest
To run on GPU:
docker run -d -p 6200:6008 --gpus all --name=m3e-large-api registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/m3e-large-api:latest
Original image:
docker run -d -p 6200:6008 --name=m3e-large-api stawky/m3e-large-api:latest
Once it is running, test it; a response containing a set of embedding vectors indicates a successful deployment:
curl --location --request POST 'http://localhost:6200/v1/embeddings' \
--header 'Authorization: Bearer sk-aaabbbcccdddeeefffggghhhiiijjjkkk' \
--header 'Content-Type: application/json' \
--data-raw '{
  "model": "m3e",
  "input": ["laf是什么"]
}'
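A successful call returns OpenAI-style embedding data, roughly of this shape (vector values truncated here):
{
  "object": "list",
  "data": [
    { "object": "embedding", "embedding": [0.0123, -0.0456], "index": 0 }
  ],
  "model": "m3e",
  "usage": { "prompt_tokens": 2, "total_tokens": 2 }
}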
3. Deploy FastGPT
FastGPT is also deployed on Linux; Ubuntu 20.04 is used here as well. Open Code and create a new terminal.
Download the docker-compose file:
curl -O https://raw.githubusercontent.com/labring/FastGPT/main/files/deploy/fastgpt/docker-compose.yml
Download the config file:
curl -O https://raw.githubusercontent.com/labring/FastGPT/main/projects/app/data/config.json
Pull the images: docker-compose pull
Start the containers in the background: docker-compose up -d
Since FastGPT 4.6.8, the mongo replica set must be initialized manually:
# Check that the mongo container is running
docker ps
# Enter the container
docker exec -it mongo bash
# Connect to the database
mongo -u myname -p mypassword --authenticationDatabase admin
# Initialize the replica set. For external access, mongo:27017 can be changed to ip:27017,
# but the FastGPT connection string must then be changed to match
# (MONGODB_URI=mongodb://myname:mypassword@mongo:27017/fastgpt?authSource=admin
#  => MONGODB_URI=mongodb://myname:mypassword@ip:27017/fastgpt?authSource=admin)
rs.initiate({
  _id: "rs0",
  members: [
    { _id: 0, host: "mongo:27017" }
  ]
})
# Check the status. If it reports the rs0 status, the replica set is running
rs.status()
In the docker-compose file, set OPENAI_BASE_URL to http://localhost:13000/v1, which connects to One-API's port; change localhost to the host machine's actual address, since localhost inside the FastGPT container does not reach the host.
Also set CHAT_API_KEY to the key copied from the One-API token, as sketched below.
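The relevant environment entries in docker-compose.yml then look roughly like this (the key is the example value from above; use your own, and replace the IP with your host's address):
environment:
  - OPENAI_BASE_URL=http://192.168.1.10:13000/v1
  - CHAT_API_KEY=sk-fAAfFClsyVXxvAgp57Ab758260124a958aF00a2d49CcB625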
For the config file, replace its contents with the following:
{
  "systemEnv": {
    "openapiPrefix": "fastgpt",
    "vectorMaxProcess": 15,
    "qaMaxProcess": 15,
    "pgHNSWEfSearch": 100
  },
  "llmModels": [
    {
      "model": "chatglm3",
      "name": "chatglm3",
      "maxContext": 4000,
      "maxResponse": 4000,
      "quoteMaxToken": 2000,
      "maxTemperature": 1,
      "vision": false,
      "defaultSystemChatPrompt": ""
    },
    {
      "model": "gpt-3.5-turbo-1106",
      "name": "gpt-3.5-turbo",
      "maxContext": 16000,
      "maxResponse": 4000,
      "quoteMaxToken": 13000,
      "maxTemperature": 1.2,
      "inputPrice": 0,
      "outputPrice": 0,
      "censor": false,
      "vision": false,
      "datasetProcess": false,
      "toolChoice": true,
      "functionCall": false,
      "customCQPrompt": "",
      "customExtractPrompt": "",
      "defaultSystemChatPrompt": "",
      "defaultConfig": {}
    },
    {
      "model": "gpt-3.5-turbo-16k",
      "name": "gpt-3.5-turbo-16k",
      "maxContext": 16000,
      "maxResponse": 16000,
      "quoteMaxToken": 13000,
      "maxTemperature": 1.2,
      "inputPrice": 0,
      "outputPrice": 0,
      "censor": false,
      "vision": false,
      "datasetProcess": true,
      "toolChoice": true,
      "functionCall": false,
      "customCQPrompt": "",
      "customExtractPrompt": "",
      "defaultSystemChatPrompt": "",
      "defaultConfig": {}
    },
    {
      "model": "gpt-4-0125-preview",
      "name": "gpt-4-turbo",
      "maxContext": 125000,
      "maxResponse": 4000,
      "quoteMaxToken": 100000,
      "maxTemperature": 1.2,
      "inputPrice": 0,
      "outputPrice": 0,
      "censor": false,
      "vision": false,
      "datasetProcess": false,
      "toolChoice": true,
      "functionCall": false,
      "customCQPrompt": "",
      "customExtractPrompt": "",
      "defaultSystemChatPrompt": "",
      "defaultConfig": {}
    },
    {
      "model": "gpt-4-vision-preview",
      "name": "gpt-4-vision",
      "maxContext": 128000,
      "maxResponse": 4000,
      "quoteMaxToken": 100000,
      "maxTemperature": 1.2,
      "inputPrice": 0,
      "outputPrice": 0,
      "censor": false,
      "vision": false,
      "datasetProcess": false,
      "toolChoice": true,
      "functionCall": false,
      "customCQPrompt": "",
      "customExtractPrompt": "",
      "defaultSystemChatPrompt": "",
      "defaultConfig": {}
    }
  ],
  "vectorModels": [
    {
      "model": "m3e",
      "name": "m3e",
      "price": 0.1,
      "defaultToken": 500,
      "maxToken": 1800
    },
    {
      "model": "text-embedding-ada-002",
      "name": "Embedding-2",
      "inputPrice": 0,
      "outputPrice": 0,
      "defaultToken": 700,
      "maxToken": 3000,
      "weight": 100,
      "defaultConfig": {}
    }
  ],
  "reRankModels": [],
  "audioSpeechModels": [
    {
      "model": "tts-1",
      "name": "OpenAI TTS1",
      "inputPrice": 0,
      "outputPrice": 0,
      "voices": [
        { "label": "Alloy", "value": "alloy", "bufferId": "openai-Alloy" },
        { "label": "Echo", "value": "echo", "bufferId": "openai-Echo" },
        { "label": "Fable", "value": "fable", "bufferId": "openai-Fable" },
        { "label": "Onyx", "value": "onyx", "bufferId": "openai-Onyx" },
        { "label": "Nova", "value": "nova", "bufferId": "openai-Nova" },
        { "label": "Shimmer", "value": "shimmer", "bufferId": "openai-Shimmer" }
      ]
    }
  ],
  "whisperModel": {
    "model": "whisper-1",
    "name": "Whisper1",
    "inputPrice": 0,
    "outputPrice": 0
  }
}
If every step succeeded, visit port 3000 (replacing localhost with the host machine's address) to reach the FastGPT page. The local chatglm3 API must be running for chat to work.