imagenet_classification¶
Benchmark the performance of loading images from local file systems and classifying them using a GPU.
This script builds the data loader and instantiates an image classification model on a GPU. The data loader transfers the batched image data to the GPU concurrently, and the foreground thread runs the model on the batches one by one.
To run the benchmark, pass the dataset root directory and the split to the script as follows.
python imagenet_classification.py
--root-dir ~/imagenet/
--split val
Source¶
Source
Click here to see the source.
1# Copyright (c) Meta Platforms, Inc. and affiliates.
2# All rights reserved.
3#
4# This source code is licensed under the BSD-style license found in the
5# LICENSE file in the root directory of this source tree.
6
7"""Benchmark the performance of loading images from local file systems and
8classifying them using a GPU.
9
10This script builds the data loader and instantiates an image
11classification model in a GPU.
12The data loader transfers the batch image data to the GPU concurrently, and
13the foreground thread run the model on data one by one.
14
15.. include:: ../plots/imagenet_classification_chart.txt
16
17To run the benchmark, pass it to the script like the following.
18
19.. code-block::
20
21 python imagenet_classification.py
22 --root-dir ~/imagenet/
23 --split val
24"""
25
26# pyre-ignore-all-errors
27
28import contextlib
29import logging
30import time
31from collections.abc import Awaitable, Callable, Iterator
32from pathlib import Path
33
34import spdl.io
35import spdl.io.utils
36import torch
37from spdl.dataloader import DataLoader
38from spdl.source.imagenet import ImageNet
39from torch import Tensor
40from torch.profiler import profile
41
42_LG = logging.getLogger(__name__)
43
44
45__all__ = [
46 "entrypoint",
47 "benchmark",
48 "get_decode_func",
49 "get_dataloader",
50 "get_model",
51 "ModelBundle",
52 "Classification",
53 "Preprocessing",
54]
55
56
def _parse_args(args):
    """Parse the command line arguments of the benchmark script.

    Args:
        args: The list of argument strings to parse. ``None`` makes
            ``argparse`` fall back to ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace``. When ``--trace`` is given,
        ``max_batches`` is overridden to ``60``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--root-dir", type=Path, required=True)
    # The default of infinity means "no limit on the number of batches".
    parser.add_argument("--max-batches", type=int, default=float("inf"))
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--split", default="val", choices=["train", "val"])
    parser.add_argument("--trace", type=Path)
    parser.add_argument("--buffer-size", type=int, default=16)
    parser.add_argument("--num-threads", type=int, default=16)
    # `--no-*` flags switch off features that are enabled by default.
    parser.add_argument("--no-compile", action="store_false", dest="compile")
    parser.add_argument("--no-bf16", action="store_false", dest="use_bf16")
    parser.add_argument("--use-nvjpeg", action="store_true")
    parsed = parser.parse_args(args)
    if parsed.trace:
        # A traced run always stops after 60 batches, regardless of
        # `--max-batches` (presumably to keep the trace files small).
        parsed.max_batches = 60
    return parsed
79
80
81# Handroll the transforms so as to support `torch.compile`
class Preprocessing(torch.nn.Module):
    """Perform pixel normalization and data type conversion.

    The mean and standard deviation are registered as buffers, so they
    follow the module when it is moved to another device.

    Args:
        mean: The mean value of the dataset. Must be broadcastable with
            the input batch.
        std: The standard deviation of the dataset. Must be broadcastable
            with the input batch.
    """

    def __init__(self, mean: Tensor, std: Tensor) -> None:
        super().__init__()
        self.register_buffer("mean", mean)
        self.register_buffer("std", std)

    def forward(self, x: Tensor) -> Tensor:
        """Normalize the given image batch.

        Args:
            x: The input image batch. Pixel values are expected to be
                in the range of ``[0, 255]``.

        Returns:
            The normalized image batch as a floating point tensor.
        """
        # Rescale to [0, 1] first; `mean` and `std` are expressed in that range.
        scaled = x.float() / 255.0
        return (scaled - self.mean) / self.std
106
107
class Classification(torch.nn.Module):
    """Classification()"""

    def forward(self, x: Tensor, labels: Tensor) -> tuple[Tensor, Tensor]:
        """Count how many samples are classified correctly in top-1 and top-5.

        Args:
            x: A batch of per-class scores. The shape is
                ``(batch_size, num_classes)`` and ``num_classes`` must be
                at least 5.
            labels: A batch of labels. The shape is either
                ``(batch_size, 1)`` or ``(batch_size,)``.

        Returns:
            A tuple of two scalar tensors: the number of samples whose label
            is the top-1 prediction, and the number of samples whose label is
            among the top-5 predictions. These are counts, not ratios.
        """
        # Force a column vector so that the comparison against the
        # ``(batch_size, 5)`` index tensor broadcasts per sample. A 1D label
        # tensor would otherwise broadcast across the batch dimension.
        labels = labels.reshape(-1, 1)

        probs = torch.nn.functional.softmax(x, dim=-1)
        _, top_catid = torch.topk(probs, 5)
        top1 = (top_catid[:, :1] == labels).sum()
        top5 = (top_catid == labels).sum()
        return top1, top5
127
128
class ModelBundle(torch.nn.Module):
    """ModelBundle()

    Bundle the transform, model backbone, and classification head into a
    single module for simple handling.

    Args:
        model: The backbone that maps a preprocessed image batch to
            per-class scores.
        preprocessing: The module applied to the raw image batch first.
        classification: The head that turns scores and labels into the
            top-1 and top-5 hit counts.
        use_bf16: Whether to cast the preprocessed batch to ``bfloat16``
            before passing it to ``model``.
    """

    def __init__(self, model, preprocessing, classification, use_bf16):
        super().__init__()
        self.model = model
        self.preprocessing = preprocessing
        self.classification = classification
        self.use_bf16 = use_bf16

    def forward(self, images: Tensor, labels: Tensor) -> tuple[Tensor, Tensor]:
        """Run preprocessing, the backbone, and the classification head.

        Args:
            images: A batch of images. The shape is
                ``(batch_size, 3, 224, 224)``.
            labels: A batch of labels. The decode functions in this file
                produce the shape ``(batch_size, 1)``.

        Returns:
            Whatever the classification head returns: a tuple of the top-1
            and top-5 hit counts.
        """
        x = self.preprocessing(images)

        if self.use_bf16:
            # The preprocessing produces float32; match the dtype of the
            # backbone, which is converted to bfloat16 in this mode.
            x = x.to(torch.bfloat16)

        scores = self.model(x)
        return self.classification(scores, labels)
161
162
def _expand(vals, batch_size, res):
    """Expand per-channel values into a dense ``(batch_size, C, res, res)`` tensor.

    Args:
        vals: A sequence with one value per channel.
        batch_size: The size of the leading (batch) dimension.
        res: The height and width of the resulting tensor.

    Returns:
        A newly allocated tensor in which every spatial position of channel
        ``c`` holds ``vals[c]``.
    """
    num_channels = len(vals)
    per_channel = torch.tensor(vals).view(1, num_channels, 1, 1)
    # `expand` only creates a view; `clone` materializes it into real memory.
    return per_channel.expand(batch_size, num_channels, res, res).clone()
165
166
def get_model(
    batch_size: int,
    device_index: int,
    compile: bool,
    use_bf16: bool,
    model_type: str = "mobilenetv3_large_100",
) -> ModelBundle:
    """Build computation model, including transform, model, and classification head.

    Args:
        batch_size: The batch size of the input. The normalization constants
            are materialized for exactly this batch size.
        device_index: The index of the target GPU device.
        compile: Whether to compile the model.
        use_bf16: Whether to use bfloat16 for the model.
        model_type: The type of the model. Passed to ``timm.create_model()``.

    Returns:
        The resulting computation model.
    """
    import timm

    device = torch.device(f"cuda:{device_index}")

    backbone = timm.create_model(model_type, pretrained=True)
    backbone = backbone.eval().to(device=device)

    if use_bf16:
        backbone = backbone.to(dtype=torch.bfloat16)

    # The mean/std buffers have the full input shape
    # ``(batch_size, 3, 224, 224)``, so the bundle expects batches of
    # exactly `batch_size` images.
    preprocessing = Preprocessing(
        mean=_expand([0.4850, 0.4560, 0.4060], batch_size, 224),
        std=_expand([0.2290, 0.2240, 0.2250], batch_size, 224),
    ).to(device)

    classification = Classification().to(device)

    if compile:
        with torch.no_grad():
            mode = "max-autotune"
            backbone = torch.compile(backbone, mode=mode)
            preprocessing = torch.compile(preprocessing, mode=mode)

    return ModelBundle(backbone, preprocessing, classification, use_bf16)
210
211
def get_decode_func(
    device_index: int,
    width: int = 224,
    height: int = 224,
) -> Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]]:
    """Get a function to decode images from a list of paths.

    Args:
        device_index: The index of the target GPU device. Both the decoded
            images and the labels are placed on this device.
        width: The width of the decoded image.
        height: The height of the decoded image.

    Returns:
        Async function to decode images in to batch tensor of NCHW format
        and labels of shape ``(batch_size, 1)``.
    """
    device = torch.device(f"cuda:{device_index}")

    # Images are first scaled to 256x256, then cropped to the requested size.
    filter_desc = spdl.io.get_video_filter_desc(
        scale_width=256,
        scale_height=256,
        crop_width=width,
        crop_height=height,
        pix_fmt="rgb24",
    )

    # Built once and shared by all calls. The images must land on the same
    # device as the labels, so the caller's `device_index` is used here
    # rather than a fixed device.
    device_config = spdl.io.cuda_config(
        device_index=device_index,
        allocator=(
            torch.cuda.caching_allocator_alloc,
            torch.cuda.caching_allocator_delete,
        ),
    )

    async def decode_images(items: list[tuple[str, int]]):
        paths = [path for path, _ in items]
        # One label per row, i.e. the shape is ``(batch_size, 1)``.
        labels = torch.tensor(
            [[label] for _, label in items], dtype=torch.int64
        ).to(device)
        buffer = await spdl.io.async_load_image_batch(
            paths,
            width=None,
            height=None,
            pix_fmt=None,
            strict=True,
            filter_desc=filter_desc,
            device_config=device_config,
        )
        # The decoded batch is NHWC; reorder the axes into NCHW.
        batch = spdl.io.to_torch(buffer).permute((0, 3, 1, 2))
        return batch, labels

    return decode_images
262
263
def _get_experimental_nvjpeg_decode_function(
    device_index: int,
    width: int = 224,
    height: int = 224,
):
    """Get a function to decode images with the experimental nvjpeg loader.

    Args:
        device_index: The index of the target GPU device.
        width: The width of the decoded image.
        height: The height of the decoded image.

    Returns:
        Async function that takes a list of ``(path, label)`` pairs and
        returns the decoded image batch and the labels of shape
        ``(batch_size, 1)``, both on the target device.
    """
    device = torch.device(f"cuda:{device_index}")
    device_config = spdl.io.cuda_config(
        device_index=device_index,
        allocator=(
            torch.cuda.caching_allocator_alloc,
            torch.cuda.caching_allocator_delete,
        ),
    )

    async def decode_images_nvjpeg(items: list[tuple[str, int]]):
        paths = [path for path, _ in items]
        labels = torch.tensor(
            [[label] for _, label in items], dtype=torch.int64
        ).to(device)
        buffer = await spdl.io.async_load_image_batch_nvjpeg(
            paths,
            device_config=device_config,
            width=width,
            height=height,
            pix_fmt="rgb",
            # strict=True,
        )
        # NOTE(review): no axis permutation is applied here, so the loader
        # presumably already returns channel-first data -- confirm.
        batch = spdl.io.to_torch(buffer)
        return batch, labels

    return decode_images_nvjpeg
294
295
def get_dataloader(
    src: Iterator[tuple[str, int]],
    batch_size: int,
    decode_func: Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]],
    buffer_size: int,
    num_threads: int,
) -> DataLoader:
    """Build the dataloader for the ImageNet classification task.

    The dataloader uses the ``decode_func`` for decoding images concurrently
    and sends the resulting data to the GPU.

    Args:
        src: The source of the data, yielding pairs of image path and label.
        batch_size: The number of images in a batch.
        decode_func: The function to decode images. It receives one batch of
            ``(path, label)`` pairs at a time.
        buffer_size: The size of the buffer for the dataloader sink.
        num_threads: The number of worker threads.

    Returns:
        The dataloader, which yields the values produced by ``decode_func``.
    """
    return DataLoader(
        src,
        batch_size=batch_size,
        # Incomplete trailing batches are dropped, so every batch has
        # exactly `batch_size` items.
        drop_last=True,
        aggregator=decode_func,
        buffer_size=buffer_size,
        num_threads=num_threads,
        timeout=20,
    )
325
326
def benchmark(
    dataloader: Iterator[tuple[Tensor, Tensor]],
    model: ModelBundle,
    max_batches: float = float("inf"),
) -> None:
    """The main loop that measures the performance of dataloading and model inference.

    The timer and the counters are reset when the 21st batch starts, so the
    first 20 batches are excluded from the report whenever the run is longer
    than that. The throughput and the accuracy are logged even if the loop is
    interrupted by an exception.

    Args:
        dataloader: The dataloader to benchmark.
        model: The model to benchmark.
        max_batches: The number of batches before stopping. The default,
            infinity, means the dataloader is consumed until it is exhausted.
    """

    _LG.info("Running inference.")
    num_warmup_batches = 20
    num_frames, num_correct_top1, num_correct_top5 = 0, 0, 0
    t0 = time.monotonic()
    try:
        for i, (batch, labels) in enumerate(dataloader):
            if i == num_warmup_batches:
                # Start over so that the earlier iterations (presumably
                # dominated by compilation/autotuning) do not skew the numbers.
                t0 = time.monotonic()
                num_frames, num_correct_top1, num_correct_top5 = 0, 0, 0

            with (
                torch.profiler.record_function(f"iter_{i}"),
                spdl.io.utils.trace_event(f"iter_{i}"),
            ):
                top1, top5 = model(batch, labels)

            num_frames += batch.shape[0]
            num_correct_top1 += top1
            num_correct_top5 += top5

            if i + 1 >= max_batches:
                break
    finally:
        elapsed = time.monotonic() - t0
        if num_frames != 0:
            # The counters are tensors only once at least one batch has been
            # accumulated; before that they are still the plain int 0.
            num_correct_top1 = num_correct_top1.item()
            num_correct_top5 = num_correct_top5.item()
        fps = num_frames / elapsed
        _LG.info(f"FPS={fps:.2f} ({num_frames}/{elapsed:.2f})")
        acc1 = 0 if num_frames == 0 else num_correct_top1 / num_frames
        _LG.info(f"Accuracy (top1)={acc1:.2%} ({num_correct_top1}/{num_frames})")
        acc5 = 0 if num_frames == 0 else num_correct_top5 / num_frames
        _LG.info(f"Accuracy (top5)={acc5:.2%} ({num_correct_top5}/{num_frames})")
372
373
def _get_dataloader(args, device_index) -> DataLoader:
    """Build the dataloader from the parsed command line arguments.

    Args:
        args: The namespace returned by ``_parse_args``.
        device_index: The index of the target GPU device.

    Returns:
        The dataloader over the selected ImageNet split.
    """
    src = ImageNet(args.root_dir, split=args.split)

    # Pick the decoder implementation; both share the same call signature.
    make_decode_func = (
        _get_experimental_nvjpeg_decode_function
        if args.use_nvjpeg
        else get_decode_func
    )
    decode_func = make_decode_func(device_index)

    return get_dataloader(
        src,
        args.batch_size,
        decode_func,
        args.buffer_size,
        args.num_threads,
    )
389
390
def entrypoint(args: list[str] | None = None):
    """CLI entrypoint. Run pipeline, transform and model and measure its performance.

    Args:
        args: The command line arguments. ``None`` means the arguments are
            taken from ``sys.argv``.
    """

    args = _parse_args(args)
    _init_logging(args.debug)
    _LG.info(args)

    device_index = 0
    model = get_model(args.batch_size, device_index, args.compile, args.use_bf16)
    dataloader = _get_dataloader(args, device_index)

    # When `--trace` is not given this becomes the string "None". That is
    # harmless because both tracers are disabled in that case.
    trace_path = f"{args.trace}"
    if args.use_nvjpeg:
        trace_path = f"{trace_path}.nvjpeg"

    # The PyTorch profiler is only active when a trace was requested.
    profiler = profile() if args.trace else contextlib.nullcontext()

    with (
        torch.no_grad(),
        profiler as prof,
        spdl.io.utils.tracing(f"{trace_path}.pftrace", enable=args.trace is not None),
    ):
        benchmark(dataloader, model, args.max_batches)

    if args.trace:
        prof.export_chrome_trace(f"{trace_path}.json")
415
416
def _init_logging(debug=False):
    """Configure the root logger.

    Args:
        debug: If true, the log level is ``DEBUG``; otherwise ``INFO``.
    """
    fmt = "%(asctime)s [%(filename)s:%(lineno)d] [%(levelname)s] %(message)s"
    level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(format=fmt, level=level)
421
422
423if __name__ == "__main__":
424 entrypoint()
Functions¶
Functions
- entrypoint(args: list[int] | None = None)[source]¶
CLI entrypoint. Run pipeline, transform and model and measure its performance.
- benchmark(dataloader: Iterator[tuple[Tensor, Tensor]], model: ModelBundle, max_batches: int = nan) None [source]¶
The main loop that measures the performance of dataloading and model inference.
- Parameters:
dataloader – The dataloader to benchmark.
model – The model to benchmark.
max_batches – The number of batch before stopping.
- get_decode_func(device_index: int, width: int = 224, height: int = 224) Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]] [source]¶
Get a function to decode images from a list of paths.
- Parameters:
device_index – The index of the target GPU device.
width – The width of the decoded image.
height – The height of the decoded image.
- Returns:
Async function to decode images in to batch tensor of NCHW format and labels of shape
(batch_size, 1)
.
- get_dataloader(src: Iterator[tuple[str, int]], batch_size: int, decode_func: Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]], buffer_size: int, num_threads: int) DataLoader [source]¶
Build the dataloader for the ImageNet classification task.
The dataloader uses the decode_func for decoding images concurrently and sends the resulting data to the GPU.
- Parameters:
src – The source of the data. See source().
batch_size – The number of images in a batch.
decode_func – The function to decode images.
buffer_size – The size of the buffer for the dataloader sink
num_threads – The number of worker threads.
- get_model(batch_size: int, device_index: int, compile: bool, use_bf16: bool, model_type: str = 'mobilenetv3_large_100') ModelBundle [source]¶
Build computation model, including transform, model, and classification head.
- Parameters:
batch_size – The batch size of the input.
device_index – The index of the target GPU device.
compile – Whether to compile the model.
use_bf16 – Whether to use bfloat16 for the model.
model_type – The type of the model. Passed to
timm.create_model()
.
- Returns:
The resulting computation model.
Classes¶
Classes
- class ModelBundle[source]¶
Bundle the transform, model backbone, and classification head into a single module for a simple handling.
- forward(images: Tensor, labels: Tensor) tuple[Tensor, Tensor] [source]¶
Given a batch of images and labels, compute the top1, top5 accuracy.
- Parameters:
images – A batch of images. The shape is
(batch_size, 3, 224, 224)
.labels – A batch of labels. The shape is
(batch_size,)
.
- Returns:
A tuple of top1 and top5 accuracy.
- class Classification[source]¶
- forward(x: Tensor, labels: Tensor) tuple[Tensor, Tensor] [source]¶
Given a batch of features and labels, compute the top1 and top5 accuracy.
- Parameters:
x – A batch of per-class scores. The shape is (batch_size, num_classes).
labels – A batch of labels. The shape is (batch_size, 1).
- Returns:
A tuple of top1 and top5 accuracy.