| Command | Description |
| --- | --- |
| pip install transformers | Install transformers |
| pip install transformers[torch] | Install with PyTorch support |
| pip install transformers[tf-cpu] | Install with TensorFlow (CPU) support |
| pip install datasets | Install datasets |
| pip install accelerate | Install accelerate |
| huggingface-cli login | Log in to the Hugging Face Hub |
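
A quick sanity check after installing: import the library and run one tiny pipeline call (this downloads the default sentiment model on first use; version numbers and output values will vary).

# Sanity check: confirm the install and run one pipeline call
import transformers
print(transformers.__version__)

from transformers import pipeline
print(pipeline("sentiment-analysis")("it works"))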

# Sentiment analysis
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier("I love this product!")
# [{'label': 'POSITIVE', 'score': 0.999}]

# Text generation
generator = pipeline("text-generation", model="gpt2")
result = generator("Hello, I am", max_length=30, num_return_sequences=1)

# Question answering
qa = pipeline("question-answering")
result = qa(
    question="What is my name?",
    context="My name is John and I live in NYC.",
)

# Summarization
summarizer = pipeline("summarization")
result = summarizer(long_text, max_length=150, min_length=50)

# Translation
translator = pipeline("translation_en_to_fr")
result = translator("Hello, how are you?")

# Fill-mask (the mask token must match the model: [MASK] for BERT)
fill_mask = pipeline("fill-mask", model="bert-base-uncased")
result = fill_mask("The capital of France is [MASK].")

# Zero-shot classification
classifier = pipeline("zero-shot-classification")
result = classifier(
    "This is a tutorial about Python",
    candidate_labels=["education", "politics", "technology"],
)
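
Pipelines also accept a list of inputs and a device argument. A minimal sketch (device=0 assumes a CUDA GPU is available; use device=-1 or omit it for CPU):

# Batch several inputs through one pipeline on GPU (assumes CUDA device 0)
classifier = pipeline("sentiment-analysis", device=0)
results = classifier(["Great movie!", "Terrible plot.", "It was fine."])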

# Auto classes (architecture inferred from the checkpoint)
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Model-specific classes
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Save to and reload from a local directory
model.save_pretrained("./my_model_dir")
tokenizer.save_pretrained("./my_model_dir")
model = AutoModel.from_pretrained("./my_model_dir")

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize
tokens = tokenizer("Hello, world!")
# {'input_ids': [...], 'attention_mask': [...]}

# Decode
text = tokenizer.decode(tokens["input_ids"])

# Batch encoding with padding and truncation
inputs = tokenizer(
["Hello!", "How are you?"],
padding=True,
truncation=True,
max_length=128,
return_tensors="pt" # or "tf"
) tokenizer.cls_token # [CLS]
tokenizer.sep_token # [SEP]
tokenizer.pad_token # [PAD]
tokenizer.mask_token # [MASK]
tokenizer.unk_token # [UNK]
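
If a checkpoint lacks a special token (GPT-2, for instance, has no pad token), one can be added and the embedding matrix resized to match. A minimal sketch, assuming model is already loaded:

# Add a pad token and resize embeddings so the new token ID is valid
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
model.resize_token_embeddings(len(tokenizer))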

# Inference: sequence classification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
inputs = tokenizer("I love this!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)
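
To make the prediction readable, softmax the logits and map the index through the model config. A short sketch (id2label is only meaningful on a fine-tuned checkpoint; the raw bert-base-uncased classification head is randomly initialized):

# Probabilities and label name for the predicted class
probs = torch.softmax(logits, dim=-1)
label = model.config.id2label[predictions.item()]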

# Text generation with generate()
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
inputs = tokenizer("Hello, I am", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_length=50,
    num_beams=5,
    temperature=0.7,
    do_sample=True,
)
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
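
Combining beam search with sampling (as above) is valid but uncommon; for open-ended text, plain nucleus/top-k sampling is the usual choice. A sketch with commonly used values:

# Nucleus (top-p) sampling, the more typical setup for open-ended generation
outputs = model.generate(
    **inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)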
inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
embeddings = outputs.last_hidden_state
# Shape: [batch_size, seq_length, hidden_size]
# CLS token embedding (sentence representation)
cls_embedding = embeddings[:, 0, :]
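
Besides the CLS vector, mean pooling over non-padding tokens is a common sentence representation; the attention mask keeps padding out of the average:

# Mean pooling over real tokens (mask out padding, then average)
mask = inputs["attention_mask"].unsqueeze(-1)  # [batch, seq_len, 1]
mean_embedding = (embeddings * mask).sum(1) / mask.sum(1)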

# Fine-tuning with Trainer
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# A model with a task head is required for fine-tuning (not the bare AutoModel)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Load dataset
dataset = load_dataset("imdb")

# Tokenize
def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)
trainer.train()
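
Trainer reports metrics during evaluation if given a compute_metrics callback, passed as Trainer(compute_metrics=compute_metrics, ...). A minimal accuracy example (the function name and metric are illustrative):

# Hypothetical accuracy callback for Trainer evaluation
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": (preds == labels).mean()}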

# Manual PyTorch training loop
from torch.utils.data import DataLoader
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
# Each batch must supply input_ids, attention_mask, and labels as tensors
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

model.train()
for epoch in range(3):
    for batch in dataloader:
        optimizer.zero_grad()
        outputs = model(**batch)  # loss is returned when labels are present
        loss = outputs.loss
        loss.backward()
        optimizer.step()
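
Fine-tuning usually adds a learning-rate schedule with warmup; transformers ships a helper for this (the step counts below are illustrative):

# Linear decay with warmup; call scheduler.step() right after optimizer.step()
from transformers import get_scheduler

scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=3 * len(dataloader),
)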

# Loading datasets
from datasets import load_dataset

# Common datasets
dataset = load_dataset("imdb")
dataset = load_dataset("squad")
dataset = load_dataset("glue", "mrpc")
# Access splits
train = dataset["train"]
test = dataset["test"] dataset = load_dataset("csv", data_files="data.csv")
dataset = load_dataset("json", data_files="data.json")
dataset = load_dataset("text", data_files="data.txt") # Map function
dataset = dataset.map(lambda x: tokenizer(x["text"]))
# Filter
dataset = dataset.filter(lambda x: len(x["text"]) > 10)
# Shuffle and select
dataset = dataset.shuffle(seed=42).select(range(1000))
# Train/test split
dataset = dataset.train_test_split(test_size=0.2)
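
Before feeding a tokenized dataset to a PyTorch DataLoader, set its output format so examples come back as tensors restricted to the model's inputs (the column names below assume a tokenized classification dataset):

# Return torch tensors limited to the listed columns
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])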
login(token="your_token")
# Push model
model.push_to_hub("my-model-name")
tokenizer.push_to_hub("my-model-name")
# Or use Trainer
trainer.push_to_hub()
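
To create the repository explicitly first (for example, to make it private), huggingface_hub provides create_repo; the repo name below is illustrative:

# Create a private model repo before pushing
from huggingface_hub import create_repo
create_repo("my-model-name", private=True)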

# Download a single file
from huggingface_hub import hf_hub_download

file_path = hf_hub_download(
    repo_id="bert-base-uncased",
    filename="config.json",
)

# Search the Hub
from huggingface_hub import list_models

models = list_models(
    filter="text-classification",
    sort="downloads",
    direction=-1,
)
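
To mirror an entire repository rather than a single file, snapshot_download fetches it into the local cache and returns the directory path:

# Download a whole repo snapshot and get its local path
from huggingface_hub import snapshot_download
local_dir = snapshot_download(repo_id="bert-base-uncased")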