Trying out Yi-34B-Chat-4bits
Dependencies
torch 2.1.2
vllm 0.2.6
xformers 0.0.23.post1
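A minimal sketch for pinning these exact versions (assuming a CUDA build of torch that matches your machine; installing vllm==0.2.6 should pull compatible torch and xformers on its own):
pip install torch==2.1.2 xformers==0.0.23.post1 vllm==0.2.6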
Download
First, install the ModelScope Python library:
pip install modelscope
Then download the model from ModelScope:
from modelscope.hub.snapshot_download import snapshot_download

# ignore_file_pattern='.bin' skips the *.bin files to avoid downloading redundant weight files.
model_dir = snapshot_download('01ai/Yi-34B-Chat-4bits', cache_dir='LLMs', revision='master', ignore_file_pattern='.bin')
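snapshot_download returns the local checkpoint directory; with cache_dir='LLMs' as above it should resolve to LLMs/01ai/Yi-34B-Chat-4bits, which is the path passed to --model below:
print(model_dir)  # e.g. LLMs/01ai/Yi-34B-Chat-4bits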
Install the vLLM library:
pip install vllm
Run
Start the OpenAI-compatible API server. The --model path is the local directory downloaded above, and -q awq tells vLLM the weights are AWQ-quantized:
python -m vllm.entrypoints.openai.api_server \
    --model LLMs/01ai/Yi-34B-Chat-4bits \
    --served-model-name 01ai/Yi-34B-Chat-4bits \
    --trust-remote-code \
    --max-model-len 2048 -q awq
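Once the server is up, you can confirm the model registered by querying the OpenAI-compatible /v1/models endpoint:
curl http://localhost:8000/v1/models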
Test
Send a plain completion request:
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "01ai/Yi-34B-Chat-4bits",
"prompt": "你是谁?",
"max_tokens": 100,
"temperature": 0
}'
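Since this is a chat model, the /v1/chat/completions endpoint applies the chat template server-side; a minimal sketch using the same model name:
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "01ai/Yi-34B-Chat-4bits",
        "messages": [{"role": "user", "content": "你是谁?"}],
        "max_tokens": 100,
        "temperature": 0
    }'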
Benchmark test
benchmark_throughput.py lives in the benchmarks/ directory of the vLLM source repository:
python benchmark_throughput.py \
    --backend vllm \
    --input-len 128 \
    --output-len 512 \
    --model LLMs/01ai/Yi-34B-Chat-4bits \
    --trust-remote-code \
    --max-model-len 2048 -q awq --seed 1100 --num-prompts 100
Gradio test
Version dependencies
# openai==1.6.1
pip install openai -U
pip install gradio==3.41
A simple demo
chat.py
from openai import OpenAI
import gradio as gr

# The vLLM server does not validate the API key, but the client requires one.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

def predict(message, history):
    # Rebuild Gradio's (user, assistant) history in OpenAI message format.
    history_openai_format = [{"role": "system", "content": "你是一个靠谱的 AI 助手,尽量详细的解答用户的提问。"}]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    stream = client.chat.completions.create(
        model='01ai/Yi-34B-Chat-4bits',
        messages=history_openai_format,
        temperature=0,
        stream=True,
        # This model needs stop_token_ids=[7] set manually; otherwise generation never stops.
        extra_body={'repetition_penalty': 1, 'stop_token_ids': [7]}
    )

    partial_message = ""
    for chunk in stream:
        partial_message += (chunk.choices[0].delta.content or "")
        yield partial_message

if __name__ == '__main__':
    gr.ChatInterface(predict).queue().launch()
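Launch with python chat.py; Gradio serves the UI at http://127.0.0.1:7860 by default. To double-check what token id 7 actually is, a quick sketch with transformers (assuming it is installed; for Yi chat models id 7 should decode to the <|im_end|> token):
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('LLMs/01ai/Yi-34B-Chat-4bits', trust_remote_code=True)
print(tok.convert_ids_to_tokens([7]))  # expected: ['<|im_end|>']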
Older openai versions
# openai==0.28.1
pip install openai==0.28.1
pip install gradio==3.41
A simple demo
chat.py
import openai
import gradio as gr

openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"

def predict(message, history):
    # Rebuild Gradio's (user, assistant) history in OpenAI message format.
    history_openai_format = [{"role": "system", "content": "你是一个靠谱的 AI 助手,尽量详细的解答用户的提问。"}]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = openai.ChatCompletion.create(
        model='01ai/Yi-34B-Chat-4bits',
        messages=history_openai_format,
        temperature=0,
        # openai 0.28 forwards extra kwargs in the request body, so the
        # vLLM server still receives the required stop_token_ids.
        stop_token_ids=[7],
        # stream=True,
    )
    yield response['choices'][0]['message']['content']

# Streaming variant
def predict_s(message, history):
    history_openai_format = [{"role": "system", "content": "你是一个靠谱的 AI 助手,尽量详细的解答用户的提问。"}]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    stream = openai.ChatCompletion.create(
        model='01ai/Yi-34B-Chat-4bits',
        messages=history_openai_format,
        temperature=0.8,
        stop_token_ids=[7],
        stream=True,
    )

    partial_message = ""
    for chunk in stream:
        # The final chunk may carry no 'content' key, so fall back to "".
        partial_message += chunk['choices'][0]['delta'].get('content', '')
        yield partial_message

if __name__ == '__main__':
    # Pass predict_s instead of predict for streaming output.
    gr.ChatInterface(predict).queue().launch()