ImageNet classification¶
Benchmark the performance of loading images from local file systems and classifying them using a GPU.
This script builds the data loader and instantiates an image classification model on a GPU. The data loader transfers the batch image data to the GPU concurrently, and the foreground thread runs the model on the batches one by one.
To run the benchmark, pass the dataset directory to the script as follows.
python imagenet_classification.py \
    --root-dir ~/imagenet/ \
    --split val
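For orientation, the following is a minimal sketch of how the pieces documented below compose when used programmatically instead of through the CLI. It assumes the script is importable as imagenet_classification (e.g. run from the examples directory), that a GPU is available at index 0, and uses a hypothetical dataset path; the canonical flow is the entrypoint() function in the source.

# A minimal sketch (not the script itself): compose the documented pieces by hand.
# Assumes imagenet_classification.py is importable and a GPU is available at index 0.
from pathlib import Path

import torch
from imagenet_classification import benchmark, get_dataloader, get_decode_func, get_model
from spdl.source.imagenet import ImageNet

src = ImageNet(Path("/data/imagenet"), split="val")  # hypothetical dataset location
model = get_model(batch_size=32, device_index=0, compile=False, use_bf16=True)
dataloader = get_dataloader(
    src,
    batch_size=32,
    decode_func=get_decode_func(device_index=0),
    buffer_size=16,
    num_threads=16,
)
with torch.no_grad():
    benchmark(dataloader, model, max_batches=100)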
Source¶
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Benchmark the performance of loading images from local file systems and
classifying them using a GPU.

This script builds the data loader and instantiates an image
classification model on a GPU.
The data loader transfers the batch image data to the GPU concurrently, and
the foreground thread runs the model on the batches one by one.

.. include:: ../plots/imagenet_classification_chart.txt

To run the benchmark, pass the dataset directory to the script as follows.

.. code-block::

   python imagenet_classification.py \
       --root-dir ~/imagenet/ \
       --split val
"""

# pyre-strict

import argparse
import contextlib
import logging
import time
from argparse import Namespace
from collections.abc import Awaitable, Callable, Iterator
from pathlib import Path

import spdl.io
import spdl.io.utils
import torch
from spdl.dataloader import DataLoader
from spdl.source.imagenet import ImageNet
from torch import Tensor
from torch.profiler import profile

_LG: logging.Logger = logging.getLogger(__name__)


__all__ = [
    "entrypoint",
    "benchmark",
    "get_decode_func",
    "get_dataloader",
    "get_model",
    "ModelBundle",
    "Classification",
    "Preprocessing",
]


def _parse_args(args: list[str] | None) -> Namespace:
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--root-dir", type=Path, required=True)
    parser.add_argument("--max-batches", type=int, default=float("inf"))
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--split", default="val", choices=["train", "val"])
    parser.add_argument("--trace", type=Path)
    parser.add_argument("--buffer-size", type=int, default=16)
    parser.add_argument("--num-threads", type=int, default=16)
    parser.add_argument("--no-compile", action="store_false", dest="compile")
    parser.add_argument("--no-bf16", action="store_false", dest="use_bf16")
    parser.add_argument("--use-nvjpeg", action="store_true")
    ns = parser.parse_args(args)
    if ns.trace:
        ns.max_batches = 60
    return ns


# Hand-roll the transforms so as to support `torch.compile`
class Preprocessing(torch.nn.Module):
    """Perform pixel normalization and data type conversion.

    Args:
        mean: The mean value of the dataset.
        std: The standard deviation of the dataset.
    """

    def __init__(self, mean: Tensor, std: Tensor) -> None:
        super().__init__()
        self.register_buffer("mean", mean)
        self.register_buffer("std", std)

    def forward(self, x: Tensor) -> Tensor:
        """Normalize the given image batch.

        Args:
            x: The input image batch. Pixel values are expected to be
                in the range of ``[0, 255]``.
        Returns:
            The normalized image batch.
        """
        x = x.float() / 255.0
        return (x - self.mean) / self.std


class Classification(torch.nn.Module):
    """Classification()"""

    def forward(self, x: Tensor, labels: Tensor) -> tuple[Tensor, Tensor]:
        """Given a batch of class logits and labels, compute the number of correct
        top-1 and top-5 predictions.

        Args:
            x: The output of the model. The shape is ``(batch_size, num_classes)``.
            labels: A batch of labels. The shape is ``(batch_size, 1)``.

        Returns:
            A tuple of the top-1 and top-5 correct-prediction counts.
        """

        probs = torch.nn.functional.softmax(x, dim=-1)
        top_prob, top_catid = torch.topk(probs, 5)
        top1 = (top_catid[:, :1] == labels).sum()
        top5 = (top_catid == labels).sum()
        return top1, top5

class ModelBundle(torch.nn.Module):
    """ModelBundle()

    Bundle the transform, model backbone, and classification head into a single module
    for simple handling."""

    def __init__(
        self,
        model: torch.nn.Module,
        preprocessing: Preprocessing,
        classification: Classification,
        use_bf16: bool,
    ) -> None:
        super().__init__()
        self.model = model
        self.preprocessing = preprocessing
        self.classification = classification
        self.use_bf16 = use_bf16

    def forward(self, images: Tensor, labels: Tensor) -> tuple[Tensor, Tensor]:
        """Given a batch of images and labels, compute the number of correct
        top-1 and top-5 predictions.

        Args:
            images: A batch of images. The shape is ``(batch_size, 3, 224, 224)``.
            labels: A batch of labels. The shape is ``(batch_size, 1)``.

        Returns:
            A tuple of the top-1 and top-5 correct-prediction counts.
        """

        x = self.preprocessing(images)

        if self.use_bf16:
            x = x.to(torch.bfloat16)

        output = self.model(x)

        return self.classification(output, labels)

def _expand(vals: list[float], batch_size: int, res: int) -> Tensor:
    return torch.tensor(vals).view(1, 3, 1, 1).expand(batch_size, 3, res, res).clone()


def get_model(
    batch_size: int,
    device_index: int,
    compile: bool,
    use_bf16: bool,
    model_type: str = "mobilenetv3_large_100",
) -> ModelBundle:
    """Build the computation model, including the preprocessing transform,
    the model backbone, and the classification head.

    Args:
        batch_size: The batch size of the input.
        device_index: The index of the target GPU device.
        compile: Whether to compile the model.
        use_bf16: Whether to use bfloat16 for the model.
        model_type: The type of the model. Passed to ``timm.create_model()``.

    Returns:
        The resulting computation model.
    """
    import timm

    device = torch.device(f"cuda:{device_index}")

    model = timm.create_model(model_type, pretrained=True)
    model = model.eval().to(device=device)

    if use_bf16:
        model = model.to(dtype=torch.bfloat16)

    preprocessing = Preprocessing(
        mean=_expand([0.4850, 0.4560, 0.4060], batch_size, 224),
        std=_expand([0.2290, 0.2240, 0.2250], batch_size, 224),
    ).to(device)

    classification = Classification().to(device)

    if compile:
        with torch.no_grad():
            mode = "max-autotune"
            model = torch.compile(model, mode=mode)
            preprocessing = torch.compile(preprocessing, mode=mode)

    return ModelBundle(model, preprocessing, classification, use_bf16)  # pyre-ignore[6]

def get_decode_func(
    device_index: int,
    width: int = 224,
    height: int = 224,
) -> Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]]:
    """Get a function to decode images from a list of paths.

    Args:
        device_index: The index of the target GPU device.
        width: The width of the decoded image.
        height: The height of the decoded image.

    Returns:
        Async function to decode images into a batch tensor of NCHW format
        and labels of shape ``(batch_size, 1)``.
    """
    device: torch.device = torch.device(f"cuda:{device_index}")

    filter_desc: str | None = spdl.io.get_video_filter_desc(
        scale_width=256,
        scale_height=256,
        crop_width=width,
        crop_height=height,
        pix_fmt="rgb24",
    )

    async def decode_images(items: list[tuple[str, int]]) -> tuple[Tensor, Tensor]:
        paths = [item for item, _ in items]
        labels = [[item] for _, item in items]
        labels = torch.tensor(labels, dtype=torch.int64).to(device)
        buffer = await spdl.io.async_load_image_batch(
            paths,
            width=None,
            height=None,
            pix_fmt=None,
            strict=True,
            filter_desc=filter_desc,
            device_config=spdl.io.cuda_config(
                device_index=device_index,
                allocator=(
                    torch.cuda.caching_allocator_alloc,
                    torch.cuda.caching_allocator_delete,
                ),
            ),
        )
        batch = spdl.io.to_torch(buffer)
        batch = batch.permute((0, 3, 1, 2))
        return batch, labels

    return decode_images


def _get_experimental_nvjpeg_decode_function(
    device_index: int,
    width: int = 224,
    height: int = 224,
) -> Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]]:
    device: torch.device = torch.device(f"cuda:{device_index}")
    device_config: spdl.io.CUDAConfig = spdl.io.cuda_config(
        device_index=device_index,
        allocator=(
            torch.cuda.caching_allocator_alloc,
            torch.cuda.caching_allocator_delete,
        ),
    )

    async def decode_images_nvjpeg(
        items: list[tuple[str, int]],
    ) -> tuple[Tensor, Tensor]:
        paths = [item for item, _ in items]
        labels = [[item] for _, item in items]
        labels = torch.tensor(labels, dtype=torch.int64).to(device)
        buffer = await spdl.io.async_load_image_batch_nvjpeg(
            paths,
            device_config=device_config,
            width=width,
            height=height,
            pix_fmt="rgb",
            # strict=True,
        )
        batch = spdl.io.to_torch(buffer)
        return batch, labels

    return decode_images_nvjpeg


def get_dataloader(
    src: Iterator[tuple[str, int]],
    batch_size: int,
    decode_func: Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]],
    buffer_size: int,
    num_threads: int,
) -> Iterator[tuple[Tensor, Tensor]]:
    """Build the dataloader for the ImageNet classification task.

    The dataloader uses the ``decode_func`` for decoding images concurrently and
    sends the resulting data to the GPU.

    Args:
        src: The source of the data. See :py:func:`source`.
        batch_size: The number of images in a batch.
        decode_func: The function to decode images.
        buffer_size: The size of the buffer for the dataloader sink.
        num_threads: The number of worker threads.

    """
    return DataLoader(  # pyre-ignore[7]
        src,
        batch_size=batch_size,
        drop_last=True,
        aggregator=decode_func,
        buffer_size=buffer_size,
        num_threads=num_threads,
        timeout=20,
    )


def benchmark(
    dataloader: Iterator[tuple[Tensor, Tensor]],
    model: ModelBundle,
    max_batches: float = float("nan"),
) -> None:
    """The main loop that measures the performance of dataloading and model inference.

    Args:
        dataloader: The dataloader to benchmark.
        model: The model to benchmark.
        max_batches: The number of batches to process before stopping.
    """

    _LG.info("Running inference.")
    num_frames, num_correct_top1, num_correct_top5 = 0, 0, 0
    t0 = time.monotonic()
    try:
        for i, (batch, labels) in enumerate(dataloader):
            if i == 20:
                t0 = time.monotonic()
                num_frames, num_correct_top1, num_correct_top5 = 0, 0, 0

            with (
                torch.profiler.record_function(f"iter_{i}"),
                spdl.io.utils.trace_event(f"iter_{i}"),
            ):
                top1, top5 = model(batch, labels)

                num_frames += batch.shape[0]
                num_correct_top1 += top1
                num_correct_top5 += top5

            if i + 1 >= max_batches:
                break
    finally:
        elapsed = time.monotonic() - t0
        if num_frames != 0:
            num_correct_top1 = num_correct_top1.item()  # pyre-ignore[16]
            num_correct_top5 = num_correct_top5.item()
        fps = num_frames / elapsed
        _LG.info(f"FPS={fps:.2f} ({num_frames}/{elapsed:.2f})")
        acc1 = 0 if num_frames == 0 else num_correct_top1 / num_frames
        _LG.info(f"Accuracy (top1)={acc1:.2%} ({num_correct_top1}/{num_frames})")
        acc5 = 0 if num_frames == 0 else num_correct_top5 / num_frames
        _LG.info(f"Accuracy (top5)={acc5:.2%} ({num_correct_top5}/{num_frames})")


def _get_dataloader(
    args: Namespace, device_index: int
) -> Iterator[tuple[Tensor, Tensor]]:
    src = ImageNet(args.root_dir, split=args.split)

    if args.use_nvjpeg:
        decode_func = _get_experimental_nvjpeg_decode_function(device_index)
    else:
        decode_func = get_decode_func(device_index)

    return get_dataloader(
        src,  # pyre-ignore[6]
        args.batch_size,
        decode_func,
        args.buffer_size,
        args.num_threads,
    )


def entrypoint(args_: list[str] | None = None) -> None:
    """CLI entrypoint. Run the pipeline, transform, and model, and measure their performance."""

    args = _parse_args(args_)
    _init_logging(args.debug)
    _LG.info(args)

    device_index = 0
    model = get_model(args.batch_size, device_index, args.compile, args.use_bf16)
    dataloader = _get_dataloader(args, device_index)

    trace_path = f"{args.trace}"
    if args.use_nvjpeg:
        trace_path = f"{trace_path}.nvjpeg"

    with (
        torch.no_grad(),
        profile() if args.trace else contextlib.nullcontext() as prof,
        spdl.io.utils.tracing(f"{trace_path}.pftrace", enable=args.trace is not None),
    ):
        benchmark(dataloader, model, args.max_batches)

    if args.trace:
        prof.export_chrome_trace(f"{trace_path}.json")


def _init_logging(debug: bool = False) -> None:
    fmt = "%(asctime)s [%(filename)s:%(lineno)d] [%(levelname)s] %(message)s"
    level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(format=fmt, level=level)


if __name__ == "__main__":
    entrypoint()
Functions¶
- entrypoint(args_: list[str] | None = None) → None [source]¶
CLI entrypoint. Run the pipeline, transform, and model, and measure their performance. An example invocation follows below.
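A sketch of invoking the entrypoint programmatically with CLI-style arguments. The dataset path is hypothetical, and the script is assumed to be importable as imagenet_classification.

from imagenet_classification import entrypoint

# Equivalent to running the script from the shell; adding "--trace", "trace/run"
# caps the run at 60 batches and exports .pftrace / .json traces (output prefix hypothetical).
entrypoint(["--root-dir", "/data/imagenet", "--split", "val", "--batch-size", "32"])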
- benchmark(dataloader: Iterator[tuple[Tensor, Tensor]], model: ModelBundle, max_batches: float = nan) → None [source]¶
The main loop that measures the performance of dataloading and model inference (usage sketch below).
- Parameters:
dataloader – The dataloader to benchmark.
model – The model to benchmark.
max_batches – The number of batches to process before stopping.
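Since benchmark() only needs an iterator of (images, labels) batches and a ModelBundle, the loop can be smoke-tested with synthetic data. A hedged sketch, assuming a GPU at index 0, network access for timm's pretrained weights, and that the script is importable as a module:

import logging

import torch
from imagenet_classification import benchmark, get_model

logging.basicConfig(level=logging.INFO)  # benchmark() reports FPS/accuracy via logging

device = torch.device("cuda:0")
model = get_model(batch_size=8, device_index=0, compile=False, use_bf16=False)

def synthetic_batches(n=25):
    # uint8 NCHW images and labels of shape (batch_size, 1), mirroring the real loader
    for _ in range(n):
        images = torch.randint(0, 256, (8, 3, 224, 224), dtype=torch.uint8, device=device)
        labels = torch.randint(0, 1000, (8, 1), device=device)
        yield images, labels

with torch.no_grad():
    benchmark(synthetic_batches(), model, max_batches=25)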
- get_decode_func(device_index: int, width: int = 224, height: int = 224) → Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]] [source]¶
Get a function to decode images from a list of paths (usage sketch below).
- Parameters:
device_index – The index of the target GPU device.
width – The width of the decoded image.
height – The height of the decoded image.
- Returns:
Async function to decode images into a batch tensor of NCHW format and labels of shape (batch_size, 1).
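The returned value is an async callable; in the benchmark it is driven by the DataLoader, but it should also be awaitable directly under asyncio. A hedged sketch, assuming a GPU at index 0 and hypothetical image paths:

import asyncio
from imagenet_classification import get_decode_func

decode = get_decode_func(device_index=0)
items = [
    ("/data/imagenet/val/n01440764/img_0001.JPEG", 0),  # hypothetical (path, class index) pairs
    ("/data/imagenet/val/n01440764/img_0002.JPEG", 0),
]
batch, labels = asyncio.run(decode(items))
print(batch.shape, labels.shape)  # expected: (2, 3, 224, 224) on cuda:0 and (2, 1)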
- get_dataloader(src: Iterator[tuple[str, int]], batch_size: int, decode_func: Callable[[list[tuple[str, int]]], Awaitable[tuple[Tensor, Tensor]]], buffer_size: int, num_threads: int) → Iterator[tuple[Tensor, Tensor]] [source]¶
Build the dataloader for the ImageNet classification task.
The dataloader uses the decode_func for decoding images concurrently and sends the resulting data to the GPU. A usage sketch follows the parameter list.
- Parameters:
src – The source of the data. See source().
batch_size – The number of images in a batch.
decode_func – The function to decode images.
buffer_size – The size of the buffer for the dataloader sink.
num_threads – The number of worker threads.
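A sketch of wiring the ImageNet source and the decode function into a dataloader and pulling one batch (dataset path hypothetical, GPU at index 0 assumed, script importable as a module):

from pathlib import Path

from imagenet_classification import get_dataloader, get_decode_func
from spdl.source.imagenet import ImageNet

dataloader = get_dataloader(
    ImageNet(Path("/data/imagenet"), split="val"),
    batch_size=32,
    decode_func=get_decode_func(device_index=0),
    buffer_size=16,   # number of batches buffered at the sink
    num_threads=16,   # worker threads decoding concurrently
)
for images, labels in dataloader:
    print(images.shape, labels.shape)  # (32, 3, 224, 224), (32, 1)
    break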
- get_model(batch_size: int, device_index: int, compile: bool, use_bf16: bool, model_type: str = 'mobilenetv3_large_100') → ModelBundle [source]¶
Build the computation model, including the preprocessing transform, the model backbone, and the classification head (usage sketch below).
- Parameters:
batch_size – The batch size of the input.
device_index – The index of the target GPU device.
compile – Whether to compile the model.
use_bf16 – Whether to use bfloat16 for the model.
model_type – The type of the model. Passed to timm.create_model().
- Returns:
The resulting computation model.
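A sketch of building the bundle with a different timm backbone and running it on random data. It assumes a GPU at index 0 and that the chosen model name is available in the installed timm version:

import torch
from imagenet_classification import get_model

model = get_model(
    batch_size=4,
    device_index=0,
    compile=False,          # skip torch.compile for a quick check
    use_bf16=False,
    model_type="resnet50",  # any timm model name; the default is mobilenetv3_large_100
)
images = torch.randint(0, 256, (4, 3, 224, 224), dtype=torch.uint8, device="cuda:0")
labels = torch.randint(0, 1000, (4, 1), device="cuda:0")
with torch.no_grad():
    top1, top5 = model(images, labels)  # counts of correct top-1 / top-5 predictions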
Classes¶
- class ModelBundle[source]¶
Bundle the transform, model backbone, and classification head into a single module for simple handling (usage sketch below).
- forward(images: Tensor, labels: Tensor) → tuple[Tensor, Tensor] [source]¶
Given a batch of images and labels, compute the number of correct top-1 and top-5 predictions.
- Parameters:
images – A batch of images. The shape is (batch_size, 3, 224, 224).
labels – A batch of labels. The shape is (batch_size, 1).
- Returns:
A tuple of the top-1 and top-5 correct-prediction counts.
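ModelBundle is normally produced by get_model(), but it can also be assembled by hand. A hedged sketch on CPU, using an untrained torchvision backbone instead of timm (torchvision availability is an assumption here):

import torch
from imagenet_classification import Classification, ModelBundle, Preprocessing
from torchvision.models import mobilenet_v3_large

batch_size = 2
# Per-channel ImageNet statistics expanded to the batch shape, as get_model() does internally.
mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).expand(batch_size, 3, 224, 224).clone()
std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).expand(batch_size, 3, 224, 224).clone()

bundle = ModelBundle(
    mobilenet_v3_large(weights=None).eval(),  # untrained backbone, CPU
    Preprocessing(mean=mean, std=std),
    Classification(),
    use_bf16=False,
)
images = torch.randint(0, 256, (batch_size, 3, 224, 224), dtype=torch.uint8)
labels = torch.randint(0, 1000, (batch_size, 1))
with torch.no_grad():
    top1, top5 = bundle(images, labels)  # counts of correct predictions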
- class Classification[source]¶
- forward(x: Tensor, labels: Tensor) → tuple[Tensor, Tensor] [source]¶
Given a batch of class logits and labels, compute the number of correct top-1 and top-5 predictions (usage sketch below).
- Parameters:
x – The output of the model. The shape is (batch_size, num_classes).
labels – A batch of labels. The shape is (batch_size, 1).
- Returns:
A tuple of the top-1 and top-5 correct-prediction counts.
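Classification has no parameters, so the accuracy computation can be checked on CPU with random logits. A small sketch (1000 ImageNet classes assumed):

import torch
from imagenet_classification import Classification

logits = torch.randn(4, 1000)             # model output for a batch of 4
labels = torch.randint(0, 1000, (4, 1))   # shape (batch_size, 1), as the loader produces
top1, top5 = Classification()(logits, labels)
print(int(top1), int(top5))  # number of samples whose label is the top-1 / within the top-5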