imagenet_classification
Benchmark the performance of loading images from local file systems and classifying them using a GPU.
This script builds the data loader and instantiates an image classification model on a GPU. The data loader transfers the batch image data to the GPU concurrently, and the foreground thread runs the model on the data one batch at a time.
To run the benchmark, invoke the script as follows.
python imagenet_classification.py
    --root-dir ~/imagenet/
    --split val
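For reference, the same pipeline can also be assembled in Python rather than via the CLI. The following is a minimal sketch of what entrypoint() does, assuming the script is importable as imagenet_classification, an ImageNet copy exists under ~/imagenet, and a CUDA device is available.

from pathlib import Path

import torch
from spdl.source.imagenet import ImageNet

from imagenet_classification import benchmark, get_dataloader, get_decode_func, get_model

device_index = 0
batch_size = 32

# Build the model bundle and the dataloader, then run the measurement loop.
model = get_model(batch_size, device_index, compile=False, use_bf16=False)
dataloader = get_dataloader(
    ImageNet(Path("~/imagenet").expanduser(), split="val"),
    batch_size,
    get_decode_func(device_index),
    buffer_size=16,
    num_threads=16,
)

with torch.no_grad():
    benchmark(dataloader, model, max_batches=100)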
Source
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Benchmark the performance of loading images from local file systems and
classifying them using a GPU.

This script builds the data loader and instantiates an image
classification model on a GPU.
The data loader transfers the batch image data to the GPU concurrently, and
the foreground thread runs the model on the data one batch at a time.

.. include:: ../plots/imagenet_classification_chart.txt

To run the benchmark, invoke the script as follows.

.. code-block::

    python imagenet_classification.py
        --root-dir ~/imagenet/
        --split val
"""

# pyre-ignore-all-errors

import contextlib
import logging
import time
from collections.abc import Awaitable, Callable, Iterator
from pathlib import Path

import spdl.io
import spdl.utils
import torch
from spdl.dataloader import DataLoader
from spdl.source.imagenet import ImageNet
from torch import Tensor
from torch.profiler import profile

_LG = logging.getLogger(__name__)


__all__ = [
    "entrypoint",
    "benchmark",
    "get_decode_func",
    "get_dataloader",
    "get_model",
    "ModelBundle",
    "Classification",
    "Preprocessing",
]


def _parse_args(args):
    import argparse

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--root-dir", type=Path, required=True)
    parser.add_argument("--max-batches", type=int, default=float("inf"))
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--split", default="val", choices=["train", "val"])
    parser.add_argument("--trace", type=Path)
    parser.add_argument("--buffer-size", type=int, default=16)
    parser.add_argument("--num-threads", type=int, default=16)
    parser.add_argument("--no-compile", action="store_false", dest="compile")
    parser.add_argument("--no-bf16", action="store_false", dest="use_bf16")
    parser.add_argument("--use-nvdec", action="store_true")
    parser.add_argument("--use-nvjpeg", action="store_true")
    args = parser.parse_args(args)
    if args.trace:
        args.max_batches = 60
    return args


# Handroll the transforms so as to support `torch.compile`
class Preprocessing(torch.nn.Module):
    """Perform pixel normalization and data type conversion.

    Args:
        mean: The mean value of the dataset.
        std: The standard deviation of the dataset.
    """

    def __init__(self, mean: Tensor, std: Tensor) -> None:
        super().__init__()
        self.register_buffer("mean", mean)
        self.register_buffer("std", std)

    def forward(self, x: Tensor) -> Tensor:
        """Normalize the given image batch.

        Args:
            x: The input image batch. Pixel values are expected to be
                in the range of ``[0, 255]``.
        Returns:
            The normalized image batch.
        """
        x = x.float() / 255.0
        return (x - self.mean) / self.std


class Classification(torch.nn.Module):
    """Classification()"""

    def forward(self, x: Tensor, labels: Tensor) -> tuple[Tensor, Tensor]:
        """Given a batch of features and labels, compute the top1 and top5 accuracy.

        Args:
            x: A batch of class logits. The shape is ``(batch_size, num_classes)``.
            labels: A batch of labels. The shape is ``(batch_size,)``.

        Returns:
            A tuple of top1 and top5 accuracy.
        """

        probs = torch.nn.functional.softmax(x, dim=-1)
        top_prob, top_catid = torch.topk(probs, 5)
        top1 = (top_catid[:, :1] == labels).sum()
        top5 = (top_catid == labels).sum()
        return top1, top5


class ModelBundle(torch.nn.Module):
    """ModelBundle()

    Bundle the transform, model backbone, and classification head into a single module
    for simple handling."""

    def __init__(self, model, preprocessing, classification, use_bf16):
        super().__init__()
        self.model = model
        self.preprocessing = preprocessing
        self.classification = classification
        self.use_bf16 = use_bf16

    def forward(self, images: Tensor, labels: Tensor) -> tuple[Tensor, Tensor]:
        """Given a batch of images and labels, compute the top1 and top5 accuracy.

        Args:
            images: A batch of images. The shape is ``(batch_size, 3, 224, 224)``.
            labels: A batch of labels. The shape is ``(batch_size,)``.

        Returns:
            A tuple of top1 and top5 accuracy.
        """

        x = self.preprocessing(images)

        if self.use_bf16:
            x = x.to(torch.bfloat16)

        output = self.model(x)

        return self.classification(output, labels)


def _expand(vals, batch_size, res):
    return torch.tensor(vals).view(1, 3, 1, 1).expand(batch_size, 3, res, res).clone()


def get_model(
    batch_size: int,
    device_index: int,
    compile: bool,
    use_bf16: bool,
    model_type: str = "mobilenetv3_large_100",
) -> ModelBundle:
    """Build the computation model, including the transform, model, and classification head.

    Args:
        batch_size: The batch size of the input.
        device_index: The index of the target GPU device.
        compile: Whether to compile the model.
        use_bf16: Whether to use bfloat16 for the model.
        model_type: The type of the model. Passed to ``timm.create_model()``.

    Returns:
        The resulting computation model.
    """
    import timm

    device = torch.device(f"cuda:{device_index}")

    model = timm.create_model(model_type, pretrained=True)
    model = model.eval().to(device=device)

    if use_bf16:
        model = model.to(dtype=torch.bfloat16)

    preprocessing = Preprocessing(
        mean=_expand([0.4850, 0.4560, 0.4060], batch_size, 224),
        std=_expand([0.2290, 0.2240, 0.2250], batch_size, 224),
    ).to(device)

    classification = Classification().to(device)

    if compile:
        with torch.no_grad():
            mode = "max-autotune"
            model = torch.compile(model, mode=mode)
            preprocessing = torch.compile(preprocessing, mode=mode)

    return ModelBundle(model, preprocessing, classification, use_bf16)


def get_decode_func(
    device_index: int,
    width: int = 224,
    height: int = 224,
) -> Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]]:
    """Get a function to decode images from a list of paths.

    Args:
        device_index: The index of the target GPU device.
        width: The width of the decoded image.
        height: The height of the decoded image.

    Returns:
        Async function to decode images into a batch tensor of NCHW format
        and labels of shape ``(batch_size, 1)``.
    """
    device = torch.device(f"cuda:{device_index}")

    filter_desc = spdl.io.get_video_filter_desc(
        scale_width=256,
        scale_height=256,
        crop_width=width,
        crop_height=height,
        pix_fmt="rgb24",
    )

    async def decode_images(items: list[tuple[str, int]]):
        paths = [item for item, _ in items]
        labels = [[item] for _, item in items]
        labels = torch.tensor(labels, dtype=torch.int64).to(device)
        buffer = await spdl.io.async_load_image_batch(
            paths,
            width=None,
            height=None,
            pix_fmt=None,
            strict=True,
            filter_desc=filter_desc,
            device_config=spdl.io.cuda_config(
                device_index=device_index,
                allocator=(
                    torch.cuda.caching_allocator_alloc,
                    torch.cuda.caching_allocator_delete,
                ),
            ),
        )
        batch = spdl.io.to_torch(buffer)
        batch = batch.permute((0, 3, 1, 2))
        return batch, labels

    return decode_images


def _get_experimental_nvjpeg_decode_function(
    device_index: int,
    width: int = 224,
    height: int = 224,
):
    device = torch.device(f"cuda:{device_index}")
    device_config = spdl.io.cuda_config(
        device_index=device_index,
        allocator=(
            torch.cuda.caching_allocator_alloc,
            torch.cuda.caching_allocator_delete,
        ),
    )

    async def decode_images_nvjpeg(items: list[tuple[str, int]]):
        paths = [item for item, _ in items]
        labels = [[item] for _, item in items]
        labels = torch.tensor(labels, dtype=torch.int64).to(device)
        buffer = await spdl.io.async_load_image_batch_nvjpeg(
            paths,
            device_config=device_config,
            width=width,
            height=height,
            pix_fmt="rgb",
            # strict=True,
        )
        batch = spdl.io.to_torch(buffer)
        return batch, labels

    return decode_images_nvjpeg


def _get_experimental_nvdec_decode_function(
    device_index: int,
    width: int = 224,
    height: int = 224,
):
    device = torch.device(f"cuda:{device_index}")
    device_config = spdl.io.cuda_config(
        device_index=device_index,
        allocator=(
            torch.cuda.caching_allocator_alloc,
            torch.cuda.caching_allocator_delete,
        ),
    )

    async def decode_images_nvdec(items: list[tuple[str, int]]):
        paths = [item for item, _ in items]
        labels = [[item] for _, item in items]
        labels = torch.tensor(labels, dtype=torch.int64).to(device)
        buffer = await spdl.io.async_load_image_batch_nvdec(
            paths,
            device_config=device_config,
            width=width,
            height=height,
            pix_fmt="rgba",
            strict=True,
        )
        batch = spdl.io.to_torch(buffer)[:, :-1, :, :]
        return batch, labels

    return decode_images_nvdec


def get_dataloader(
    src: Iterator[tuple[str, int]],
    batch_size: int,
    decode_func: Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]],
    buffer_size: int,
    num_threads: int,
) -> DataLoader:
    """Build the dataloader for the ImageNet classification task.

    The dataloader uses the ``decode_func`` for decoding images concurrently and
    sends the resulting data to the GPU.

    Args:
        src: The source of the data. See :py:func:`source`.
        batch_size: The number of images in a batch.
        decode_func: The function to decode images.
        buffer_size: The size of the buffer for the dataloader sink.
        num_threads: The number of worker threads.

    """
    return DataLoader(
        src,
        batch_size=batch_size,
        drop_last=True,
        aggregator=decode_func,
        buffer_size=buffer_size,
        num_threads=num_threads,
        timeout=20,
    )


def benchmark(
    dataloader: Iterator[tuple[Tensor, Tensor]],
    model: ModelBundle,
    max_batches: int = float("nan"),
) -> None:
    """The main loop that measures the performance of dataloading and model inference.

    Args:
        dataloader: The dataloader to benchmark.
        model: The model to benchmark.
        max_batches: The number of batches before stopping.
    """

    _LG.info("Running inference.")
    num_frames, num_correct_top1, num_correct_top5 = 0, 0, 0
    t0 = time.monotonic()
    try:
        for i, (batch, labels) in enumerate(dataloader):
            if i == 20:
                t0 = time.monotonic()
                num_frames, num_correct_top1, num_correct_top5 = 0, 0, 0

            with (
                torch.profiler.record_function(f"iter_{i}"),
                spdl.utils.trace_event(f"iter_{i}"),
            ):
                top1, top5 = model(batch, labels)

            num_frames += batch.shape[0]
            num_correct_top1 += top1
            num_correct_top5 += top5

            if i + 1 >= max_batches:
                break
    finally:
        elapsed = time.monotonic() - t0
        if num_frames != 0:
            num_correct_top1 = num_correct_top1.item()
            num_correct_top5 = num_correct_top5.item()
        fps = num_frames / elapsed
        _LG.info(f"FPS={fps:.2f} ({num_frames}/{elapsed:.2f})")
        acc1 = 0 if num_frames == 0 else num_correct_top1 / num_frames
        _LG.info(f"Accuracy (top1)={acc1:.2%} ({num_correct_top1}/{num_frames})")
        acc5 = 0 if num_frames == 0 else num_correct_top5 / num_frames
        _LG.info(f"Accuracy (top5)={acc5:.2%} ({num_correct_top5}/{num_frames})")


def _get_dataloader(args, device_index) -> DataLoader:
    src = ImageNet(args.root_dir, split=args.split)

    if args.use_nvjpeg:
        decode_func = _get_experimental_nvjpeg_decode_function(device_index)
    elif args.use_nvdec:
        decode_func = _get_experimental_nvdec_decode_function(device_index)
    else:
        decode_func = get_decode_func(device_index)

    return get_dataloader(
        src,
        args.batch_size,
        decode_func,
        args.buffer_size,
        args.num_threads,
    )


def entrypoint(args: list[str] | None = None):
    """CLI entrypoint. Run the pipeline, transform, and model, and measure the performance."""

    args = _parse_args(args)
    _init_logging(args.debug)
    _LG.info(args)

    device_index = 0
    model = get_model(args.batch_size, device_index, args.compile, args.use_bf16)
    dataloader = _get_dataloader(args, device_index)

    trace_path = f"{args.trace}"
    if args.use_nvjpeg:
        trace_path = f"{trace_path}.nvjpeg"
    if args.use_nvdec:
        trace_path = f"{trace_path}.nvdec"

    with (
        torch.no_grad(),
        profile() if args.trace else contextlib.nullcontext() as prof,
        spdl.utils.tracing(f"{trace_path}.pftrace", enable=args.trace is not None),
    ):
        benchmark(dataloader, model, args.max_batches)

    if args.trace:
        prof.export_chrome_trace(f"{trace_path}.json")


def _init_logging(debug=False):
    fmt = "%(asctime)s [%(filename)s:%(lineno)d] [%(levelname)s] %(message)s"
    level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(format=fmt, level=level)


if __name__ == "__main__":
    entrypoint()
Functions
- entrypoint(args: list[str] | None = None)
CLI entrypoint. Run the pipeline, transform, and model, and measure the performance.
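The entrypoint can also be called programmatically with CLI-style string arguments. A sketch; the module name and dataset path are assumptions.

from imagenet_classification import entrypoint  # assumed module name

# Equivalent to invoking the script from the shell; the path is a placeholder.
entrypoint(["--root-dir", "/data/imagenet", "--split", "val"])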
- benchmark(dataloader: Iterator[tuple[Tensor, Tensor]], model: ModelBundle, max_batches: int = nan) → None
The main loop that measures the performance of dataloading and model inference.
- Parameters:
dataloader – The dataloader to benchmark.
model – The model to benchmark.
max_batches – The number of batches before stopping.
- get_decode_func(device_index: int, width: int = 224, height: int = 224) → Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]]
Get a function to decode images from a list of paths.
- Parameters:
device_index – The index of the target GPU device.
width – The width of the decoded image.
height – The height of the decoded image.
- Returns:
Async function to decode images into a batch tensor of NCHW format and labels of shape (batch_size, 1).
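As a usage sketch (not part of the script), the returned coroutine function can be awaited directly, for example via asyncio.run. The module name and the file paths below are assumptions, and a CUDA-enabled SPDL/PyTorch setup is assumed; in the benchmark itself, the DataLoader drives this function as its aggregator.

import asyncio

from imagenet_classification import get_decode_func  # assumed module name

decode = get_decode_func(device_index=0)

# Hypothetical (path, class-index) pairs; substitute real JPEG files.
items = [
    ("/data/imagenet/val/n01440764/example_0.JPEG", 0),
    ("/data/imagenet/val/n01443537/example_1.JPEG", 1),
]

batch, labels = asyncio.run(decode(items))
print(batch.shape, labels.shape)  # e.g. (2, 3, 224, 224) and (2, 1)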
- get_dataloader(src: Iterator[tuple[str, int]], batch_size: int, decode_func: Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]], buffer_size: int, num_threads: int) → DataLoader
Build the dataloader for the ImageNet classification task.
The dataloader uses the decode_func for decoding images concurrently and sends the resulting data to the GPU.
- Parameters:
src – The source of the data. See source().
batch_size – The number of images in a batch.
decode_func – The function to decode images.
buffer_size – The size of the buffer for the dataloader sink.
num_threads – The number of worker threads.
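A sketch of building and iterating the dataloader outside of entrypoint(); the module name and dataset location are assumptions, and a CUDA device is assumed.

from pathlib import Path

from spdl.source.imagenet import ImageNet

from imagenet_classification import get_dataloader, get_decode_func  # assumed module name

dataloader = get_dataloader(
    src=ImageNet(Path("~/imagenet").expanduser(), split="val"),
    batch_size=32,
    decode_func=get_decode_func(device_index=0),
    buffer_size=16,
    num_threads=16,
)

for batch, labels in dataloader:
    # batch: image batch on the GPU in NCHW layout (pixel values 0-255)
    # labels: int64 tensor of shape (batch_size, 1) on the GPU
    print(batch.shape, labels.shape)
    break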
- get_model(batch_size: int, device_index: int, compile: bool, use_bf16: bool, model_type: str = 'mobilenetv3_large_100') → ModelBundle
Build the computation model, including the transform, model, and classification head.
- Parameters:
batch_size – The batch size of the input.
device_index – The index of the target GPU device.
compile – Whether to compile the model.
use_bf16 – Whether to use bfloat16 for the model.
model_type – The type of the model. Passed to timm.create_model().
- Returns:
The resulting computation model.
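The returned bundle can be exercised with a synthetic batch. A sketch, assuming the module is importable as imagenet_classification, a CUDA device is available, and the timm pretrained weights can be downloaded.

import torch

from imagenet_classification import get_model  # assumed module name

batch_size, device_index = 32, 0
bundle = get_model(batch_size, device_index, compile=False, use_bf16=False)

# Synthetic uint8 images in NCHW layout, mimicking what the decode function produces.
images = torch.randint(0, 256, (batch_size, 3, 224, 224), dtype=torch.uint8, device="cuda:0")
labels = torch.randint(0, 1000, (batch_size, 1), device="cuda:0")

with torch.no_grad():
    top1, top5 = bundle(images, labels)
print(int(top1), int(top5))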
Classes
- class ModelBundle
Bundle the transform, model backbone, and classification head into a single module for simple handling.
- forward(images: Tensor, labels: Tensor) → tuple[Tensor, Tensor]
Given a batch of images and labels, compute the top1 and top5 accuracy.
- Parameters:
images – A batch of images. The shape is (batch_size, 3, 224, 224).
labels – A batch of labels. The shape is (batch_size,).
- Returns:
A tuple of top1 and top5 accuracy.
- class Classification
- forward(x: Tensor, labels: Tensor) → tuple[Tensor, Tensor]
Given a batch of features and labels, compute the top1 and top5 accuracy.
- Parameters:
x – A batch of class logits. The shape is (batch_size, num_classes).
labels – A batch of labels. The shape is (batch_size,).
- Returns:
A tuple of top1 and top5 accuracy.
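Classification has no parameters and runs on any device, so its accuracy bookkeeping can be checked in isolation. A small sketch with random logits; the module name is an assumption.

import torch

from imagenet_classification import Classification  # assumed module name

clf = Classification()

logits = torch.randn(8, 1000)            # stand-in for the model backbone output
labels = torch.randint(0, 1000, (8, 1))  # stand-in ground-truth class indices

top1, top5 = clf(logits, labels)
print(f"top1: {int(top1)}/8, top5: {int(top5)}/8")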