With both AMD and NVIDIA established as leading options for AI compute, questions have arisen about the differences in software required to run on each. Real-world workloads can run on both types of hardware with little to no code changes, and we're excited to demonstrate this today.
We'll start by training an image classifier on the CIFAR-10 dataset in PyTorch, on both NVIDIA and AMD hardware. The only difference in process for this tutorial is which PyTorch build you install: the CUDA build for NVIDIA GPUs or the ROCm build for AMD GPUs.
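As a rough sketch, assuming a recent Python environment (the exact wheel index depends on the CUDA or ROCm version you're targeting), the two installs might look like:

# NVIDIA: install the CUDA build of PyTorch (example index URL; match it to your CUDA version)
pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu121

# AMD: install the ROCm build of PyTorch (example index URL; match it to your ROCm version)
pip3 install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.2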
Next, navigate to the directory you'd like to set this tutorial up in. From there, create the following Python script:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

device = torch.device('cuda')

# Simple convolutional network for 32x32 CIFAR-10 images
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=10):
        super(SimpleCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2)
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256 * 4 * 4, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

def train_and_evaluate(model, train_loader, test_loader, num_epochs=10, learning_rate=0.01):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    for epoch in range(num_epochs):
        # Training pass
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        # Evaluation pass on the test set after each epoch
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Accuracy: {accuracy:.2f}%')
    return model

def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

# Normalize CIFAR-10 images to the [-1, 1] range
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

model = SimpleCNN()
trained_model = train_and_evaluate(model, train_loader, test_loader)

model_save_path = 'cifar10_cnn_model.pth'
save_model(trained_model, model_save_path)

# Final evaluation on the test set
trained_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = trained_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Final Test Accuracy: {100 * correct / total:.2f}%')
This script downloads the dataset, applies the transforms, then trains and evaluates a CNN that classifies at around 80% accuracy. The trained model is saved to model_save_path, which you can change to whatever you like.
You'll notice that at the top we set our computation device via device = torch.device('cuda'). In the ROCm build of PyTorch, 'cuda' maps to AMD GPUs, so there's no need to make any changes to your scripts.
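If you'd like to confirm which backend your PyTorch build is actually using, a quick check along these lines should work (torch.version.hip is only populated in ROCm builds, torch.version.cuda in CUDA builds):

import torch

print("GPU available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0))
# torch.version.hip is set for ROCm builds; torch.version.cuda for CUDA builds
print("HIP (ROCm) version:", torch.version.hip)
print("CUDA version:", torch.version.cuda)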
Next, create the following inference script:
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image
import requests
from io import BytesIO

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_save_path = 'cifar10_cnn_model.pth'

def load_model(model, path):
    model.load_state_dict(torch.load(path, map_location=device))
    model.eval()
    print(f"Model loaded from {path}")
    return model

class SimpleCNN(nn.Module):
    def __init__(self, num_classes=10):
        super(SimpleCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2)
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256 * 4 * 4, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

def predict_image_from_url(model, image_url):
    transform = transforms.Compose([
        transforms.Resize((32, 32)),  # CIFAR10 images are 32x32
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    # Download the image
    response = requests.get(image_url)
    image = Image.open(BytesIO(response.content)).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        output = model(image)
        _, predicted = torch.max(output, 1)
    classes = ('plane', 'car', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck')
    return classes[predicted.item()]

# Initialize and load the model
model = SimpleCNN()
model = load_model(model, model_save_path)
model = model.to(device)

# Predict from URL
image_url = 'https://images.twinkl.co.uk/tw1n/image/private/t_630/u/ux/frog-2_ver_1.jpg'
predicted_class = predict_image_from_url(model, image_url)
print(f"The image is predicted to be: {predicted_class}")
This script loads the model generated by the previous script, then classifies the image at image_url into one of the 10 CIFAR-10 categories: plane, car, bird, cat, deer, dog, frog, horse, ship, or truck.
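If you'd rather classify a local file than a URL, a hypothetical predict_image_from_file helper (reusing the imports, device, and preprocessing from the script above) could look like:

def predict_image_from_file(model, image_path):
    # Same preprocessing as predict_image_from_url, but the image is read from disk
    transform = transforms.Compose([
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        output = model(image)
        _, predicted = torch.max(output, 1)
    classes = ('plane', 'car', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck')
    return classes[predicted.item()]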
Fine-Tuning an LLM with TRL
For this section of the tutorial, we'll be fine-tuning Facebook's OPT-350m model. We'll begin by setting up the dependencies needed to significantly speed up LLM training.
This section assumes you have the required prerequisites installed for your hardware; if you're using different versions, adjust your commands accordingly.
Notice that, as above, this dependency setup will be the only difference between the two training processes.
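As a rough sketch (package names taken from the imports in the scripts below; pin versions to match your hardware and PyTorch build), the Python-side dependencies can be installed with:

pip3 install transformers datasets trl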
From there, create the following script in the subfolder you'd like to work in.
# imports
from datasets import load_dataset
from trl import SFTTrainer
# get dataset
dataset = load_dataset("imdb", split="train")
# get trainer
trainer = SFTTrainer(
    "facebook/opt-350m",
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
)
# train
trainer.train()
trainer.save_model("imdb_saved")
This script fine-tunes Facebook's OPT-350m model on the IMDB review dataset and saves the model for later inference. To conduct inference, use the following script:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Load the model and tokenizer
model_path = "imdb_saved"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Move the model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
def generate_text(prompt, max_length=150):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2
        )
    # Decode and return the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Test with a positive prompt
positive_prompt = "This movie was amazing! The plot"
positive_response = generate_text(positive_prompt)
print("Positive prompt:")
print(positive_response)
# Test with a negative prompt
negative_prompt = "I hated this film. The acting"
print("\nNegative prompt:")
print(generate_text(negative_prompt))
# Test with a neutral prompt
neutral_prompt = "This movie was okay. It had"
print("\nNeutral prompt:")
print(generate_text(neutral_prompt))
Accelerated Inference for Llama 3.1 (and other HF Models)
For this section of the tutorial, we're going to use vLLM, a framework for accelerated LLM inference and serving.
We're going to serve Llama 3.1 8B Instruct from a Docker container. We'll start by pulling the ROCm vLLM image, then launch the container and start the endpoint from inside it. Note that since the Llama models are gated, we'll have to log in through huggingface-cli to use them.
docker pull rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
docker run -it --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd --device /dev/dri rocm/vllm:rocm6.2_mi300_ubuntu22.04_py3.9_vllm_7c5fd50
huggingface-cli login #paste your token as needed
vllm serve meta-llama/Llama-3.1-8B-Instruct
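Once the server is up, vLLM exposes an OpenAI-compatible API (on port 8000 by default). Assuming the default host and port, a quick sanity check from Python might look like:

import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "messages": [
            {"role": "user", "content": "Summarize the plot of Hamlet in two sentences."}
        ],
        "max_tokens": 128,
    },
)
# Print the assistant's reply from the OpenAI-style response payload
print(response.json()["choices"][0]["message"]["content"])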