PaddlePaddle
This topic describes how to integrate a TensorBay dataset with a PaddlePaddle pipeline, using the DogsVsCats dataset as an example.
The typical way to integrate a TensorBay dataset with PaddlePaddle is to build a “Segment” class derived from paddle.io.Dataset.
from paddle.io import DataLoader, Dataset
from paddle.vision import transforms
from PIL import Image
from tensorbay import GAS
from tensorbay.dataset import Dataset as TensorBayDataset
class DogsVsCatsSegment(Dataset):
    """Expose one segment of the TensorBay "DogsVsCats" dataset as a paddle Dataset.

    Args:
        gas: Client object handed to the TensorBay dataset constructor.
        segment_name: Name of the segment whose samples are served.
        transform: Callable applied to every opened PIL image.
    """

    def __init__(self, gas, segment_name, transform):
        super().__init__()
        self.transform = transform
        self.dataset = TensorBayDataset("DogsVsCats", gas)
        self.segment = self.dataset[segment_name]
        # The catalog supplies the category-name -> index mapping used for labels.
        classification_catalog = self.dataset.catalog.classification
        self.category_to_index = classification_catalog.get_category_to_index()

    def __len__(self):
        """Return the number of samples in the wrapped segment."""
        return len(self.segment)

    def __getitem__(self, idx):
        """Return ``(transformed_image, category_index)`` for the sample at ``idx``."""
        sample = self.segment[idx]
        # Apply the transform while the file is still open, since PIL reads lazily.
        with sample.open() as image_file:
            image_tensor = self.transform(Image.open(image_file))
        category = sample.label.classification.category
        return image_tensor, self.category_to_index[category]
Use the following code to create a PaddlePaddle dataloader and run it:
# Placeholder value: substitute a real TensorBay AccessKey before running.
ACCESS_KEY = "Accesskey-*****"

# Preprocessing pipeline: tensor conversion followed by normalization.
to_tensor = transforms.ToTensor()
normalization = transforms.Normalize(mean=[0.485], std=[0.229])
my_transforms = transforms.Compose([to_tensor, normalization])

# Wrap the "train" segment so the DataLoader can index into it.
train_segment = DogsVsCatsSegment(
    gas=GAS(ACCESS_KEY),
    segment_name="train",
    transform=my_transforms,
)
train_dataloader = DataLoader(
    train_segment,
    batch_size=4,
    shuffle=True,
    num_workers=0,
)

# Walk the batches and print the labels of each one.
for index, (image, label) in enumerate(train_dataloader):
    print(f"{index}: {label}")