curl -fsSL https://ollama.com/install.sh | sh | Linux에 설치 |
brew install ollama | macOS에 설치 |
ollama serve | Ollama 서버 시작 |
ollama pull llama3 | 모델 다운로드 |
ollama list | 다운로드된 모델 목록 |
ollama show llama3 | 모델 정보 표시 |
ollama rm llama3 | 모델 제거 |
ollama cp llama3 my-llama3 | 모델 복사 |
ollama ps | 실행 중인 모델 목록 |
ollama run llama3 | 실행 및 채팅 |
ollama run llama3 "What is Python?" | 단일 프롬프트 |
ollama run codellama "Write a Python function" | 코드 생성 |
echo "Hello" | ollama run llama3 | 파이프 입력 |
curl http://localhost:11434/api/generate -d '{
"model": "llama3",
"prompt": "Why is the sky blue?",
"stream": false
}'

curl http://localhost:11434/api/chat -d '{
"model": "llama3",
"messages": [
{ "role": "system", "content": "You are a helpful assistant." },
{ "role": "user", "content": "Hello!" }
],
"stream": false
}'

curl http://localhost:11434/api/embeddings -d '{
"model": "nomic-embed-text",
"prompt": "Hello world"
}'

curl http://localhost:11434/api/tags

# pip install ollama
import ollama
# Generate
response = ollama.generate(
model='llama3',
prompt='Why is the sky blue?'
)
print(response['response'])
# Chat
response = ollama.chat(
model='llama3',
messages=[
{'role': 'user', 'content': 'Hello!'}
]
)
print(response['message']['content'])

import ollama
# Streaming response
stream = ollama.chat(
model='llama3',
messages=[{'role': 'user', 'content': 'Tell me a story'}],
stream=True
)
for chunk in stream:
print(chunk['message']['content'], end='', flush=True)

import ollama
response = ollama.embeddings(
model='nomic-embed-text',
prompt='Hello world'
)
embedding = response['embedding']
print(f'Dimension: {len(embedding)}')

import ollama
# List models
models = ollama.list()
for model in models['models']:
print(model['name'])
# Pull model
ollama.pull('llama3')
# Show model info
info = ollama.show('llama3')
print(info)

// npm install ollama
import { Ollama } from 'ollama';
const ollama = new Ollama();
// Generate
const response = await ollama.generate({
model: 'llama3',
prompt: 'Why is the sky blue?'
});
console.log(response.response);
// Chat
const chatResponse = await ollama.chat({
model: 'llama3',
messages: [{ role: 'user', content: 'Hello!' }]
});
console.log(chatResponse.message.content);

const stream = await ollama.chat({
model: 'llama3',
messages: [{ role: 'user', content: 'Tell me a story' }],
stream: true
});
for await (const chunk of stream) {
process.stdout.write(chunk.message.content);
}

# Modelfile
FROM llama3
# Set parameters
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER num_ctx 4096
PARAMETER stop "<|end|>"
# Set system prompt
SYSTEM You are a helpful coding assistant.
# Create model
# ollama create mymodel -f Modelfile

# Modelfile
FROM ./my-model.gguf
TEMPLATE """{{ if .System }}<|system|>
{{ .System }}<|end|>
{{ end }}{{ if .Prompt }}<|user|>
{{ .Prompt }}<|end|>
{{ end }}<|assistant|>
"""

ollama.generate(
model='llama3',
prompt='Hello',
options={
'temperature': 0.7, # Creativity (0-2)
'top_p': 0.9, # Nucleus sampling
'top_k': 40, # Top-k sampling
'num_predict': 128, # Max tokens
'num_ctx': 4096, # Context window
'repeat_penalty': 1.1, # Repetition penalty
'seed': 42, # Random seed
}
)

from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage
# LLM
llm = Ollama(model="llama3")
response = llm.invoke("Hello!")
# Chat model
chat = ChatOllama(model="llama3")
response = chat.invoke([HumanMessage(content="Hello!")])
# Embeddings
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")
vector = embeddings.embed_query("Hello world")

# Ollama exposes OpenAI-compatible endpoint
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:11434/v1",
api_key="ollama" # Any string works
)
response = client.chat.completions.create(
model="llama3",
messages=[{"role": "user", "content": "Hello!"}]
)
print(response.choices[0].message.content)

# Model storage location
export OLLAMA_MODELS=/path/to/models
# Server host/port
export OLLAMA_HOST=0.0.0.0:11434
# Keep model loaded
export OLLAMA_KEEP_ALIVE=5m
# GPU settings
export OLLAMA_NUM_GPU=1
export CUDA_VISIBLE_DEVICES=0
# Debug mode
export OLLAMA_DEBUG=1

docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama | Docker로 실행 |
docker run -d --gpus=all -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama | GPU로 실행 |
docker exec -it ollama ollama run llama3 | 컨테이너에서 모델 실행 |