from typing import Optional

from llama_cpp import Llama
from pydantic import BaseModel


class LlamaItem(BaseModel):
    prompt: str
    model_path: str
    temperature: float = 0.2
    top_p: float = 0.95
    top_k: int = 40
    max_tokens: int = 256
    presence_penalty: float = 0
    frequency_penalty: float = 0
    repeat_penalty: float = 1.1
    n_ctx: int = 2000


def stream_chat_llamacpp(item: LlamaItem):
    global llm, last_model_path, n_ctx
    # Load (or reload) the model only when the path or context size changes,
    # so repeated calls reuse the cached Llama instance.
    if llm is None or last_model_path != item.model_path or n_ctx != item.n_ctx:
        llm = Llama(model_path=item.model_path, n_ctx=item.n_ctx)
        last_model_path = item.model_path
        n_ctx = item.n_ctx
    chunks = llm.create_completion(
        prompt=item.prompt,
        temperature=item.temperature,
        top_p=item.top_p,
        top_k=item.top_k,
        max_tokens=item.max_tokens,
        presence_penalty=item.presence_penalty,
        frequency_penalty=item.frequency_penalty,
        repeat_penalty=item.repeat_penalty,
        stream=True,  # yield partial completions instead of one final dict
    )
    for chunk in chunks:
        # Each streamed chunk is a dict; the new text lives in choices[0]["text"].
        cont = chunk["choices"][0]["text"]
        print(cont, end="")
        yield cont.encode()


# Module-level cache shared across calls.
n_ctx = 2000
last_model_path = ""
llm: Optional[Llama] = None
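
# A minimal usage sketch: since the generator yields UTF-8 bytes and the input
# is a Pydantic model, it is presumably meant to back a streaming HTTP endpoint.
# The FastAPI app and "/chat" route below are illustrative assumptions, not
# part of the original snippet.
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()


@app.post("/chat")
def chat(item: LlamaItem):
    # Stream tokens to the client as llama.cpp produces them.
    return StreamingResponse(stream_chat_llamacpp(item), media_type="text/plain")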