streaming_nvdec_decoding

This example shows how to decode a video on a GPU in a streaming fashion.

Source

Source

Click here to see the source.
  1# Copyright (c) Meta Platforms, Inc. and affiliates.
  2# All rights reserved.
  3#
  4# This source code is licensed under the BSD-style license found in the
  5# LICENSE file in the root directory of this source tree.
  6
  7"""This example shows how to decode a video with GPU in streaming fashion."""
  8
  9__all__ = [
 10    "main",
 11    "parse_args",
 12    "run",
 13    "decode",
 14    "torch_cuda_warmup",
 15]
 16
 17import argparse
 18import contextlib
 19import logging
 20import pathlib
 21import time
 22
 23import spdl.io
 24import torch
 25from PIL import Image
 26from spdl.io import CUDAConfig
 27from torch.profiler import profile
 28
 29# pyre-strict
 30
 31
 32def parse_args(args: list[str] | None = None) -> tuple[argparse.Namespace, list[str]]:
 33    """Parse command line arguments.
 34
 35    Args:
 36        args: The command line arguments. By default it reads ``sys.argv``.
 37
 38    Returns:
 39        Tuple of parsed arguments and unused arguments, as returned by
 40        :py:meth:`argparse.ArgumentParser.parse_known_args`.
 41    """
 42
 43    parser = argparse.ArgumentParser(
 44        description=__doc__,
 45    )
 46    parser.add_argument(
 47        "--input-file", required=True, help="The input video to process."
 48    )
 49    parser.add_argument(
 50        "--plot-dir",
 51        type=pathlib.Path,
 52        help="If provided, plot the result to the given dirctory.",
 53    )
 54    parser.add_argument(
 55        "--trace-path",
 56        help="If provided, trace the execution. e.g. 'trace.json.gz'",
 57    )
 58    parser.add_argument(
 59        "--device-index",
 60        type=int,
 61        help="The CUDA device index. By default it use the last one.",
 62    )
 63    parser.add_argument(
 64        "--width",
 65        type=int,
 66        default=320,
 67        help="Rescale the video to this width. Provide -1 to disable.",
 68    )
 69    parser.add_argument(
 70        "--height",
 71        type=int,
 72        default=240,
 73        help="Rescale the video to this height. Provide -1 to disable.",
 74    )
 75    return parser.parse_known_args(args)
 76
 77
 78def decode(
 79    src: str,
 80    device_config: CUDAConfig,
 81    post_processing_params: dict[str, int],
 82    profiler: torch.profiler.profile | None,
 83    plot_dir: pathlib.Path | None,
 84) -> None:
 85    """Decode video in streaming fashion with optional resizing, profiling and exporting.
 86
 87    Args:
 88        src: The path or URL to the source video.
 89        device_config: The GPU configuration.
 90        post_processing_params: Post processing argument.
 91            See :py:func:`spdl.io.streaming_load_video_nvdec`.
 92        profiler: PyTorch Profiler or ``None``.
 93        plot_dir: If provided, the decoded frames are exported as images to the directory.
 94    """
 95    streamer = spdl.io.streaming_load_video_nvdec(
 96        src,
 97        device_config,
 98        num_frames=32,
 99        post_processing_params=post_processing_params,
100    )
101
102    i, num_frames = 0, 0
103    t0 = time.monotonic()
104    for buffers in streamer:
105        buffer = spdl.io.nv12_to_rgb(buffers, device_config=device_config, sync=True)
106        tensor = spdl.io.to_torch(buffer)
107        num_frames += len(tensor)
108
109        if plot_dir is not None:
110            for f in tensor.permute(0, 2, 3, 1):
111                img = Image.fromarray(f.cpu().numpy())
112                img.save(plot_dir / f"{i:05d}.png")
113                i += 1
114
115        if profiler is not None:
116            profiler.step()
117            if num_frames >= 500:
118                break
119
120    elapsed = time.monotonic() - t0
121    qps = num_frames / elapsed
122    print(f"Processed {num_frames} frames in {elapsed:.1f} sec. QPS: {qps:.1f}")
123
124
125def torch_cuda_warmup(device_index: int | None) -> tuple[int, torch.cuda.Stream]:
126    """Initialize the CUDA context perform dry-run.
127
128    Args:
129        device_index: The CUDA device to use. If ``None``, the last available device is used.
130    """
131    assert torch.cuda.is_available()
132
133    cuda_index: int = device_index or (torch.cuda.device_count() - 1)
134    stream = torch.cuda.Stream(device=cuda_index)
135    with torch.cuda.stream(stream):
136        a = torch.empty([32, 3, 1080, 1920])
137        a.pin_memory().to(f"cuda:{cuda_index}", non_blocking=True)
138    stream.synchronize()
139    return cuda_index, stream
140
141
def run(
    src: str,
    device_index: int | None,
    post_processing_params: dict[str, int],
    profiler: torch.profiler.profile | None,
    plot_dir: pathlib.Path | None,
) -> None:
    """Run the benchmark.

    Warms up CUDA, builds the device configuration on the warmed-up stream
    with PyTorch's caching allocator, then decodes the source three times.

    Args:
        src: The path or URL to the source video.
        device_index: The CUDA device to use. If ``None``, the choice is
            delegated to :py:func:`torch_cuda_warmup`.
        post_processing_params: Post processing argument, passed to :py:func:`decode`.
        profiler: PyTorch Profiler or ``None``, passed to :py:func:`decode`.
        plot_dir: Directory to export decoded frames to, or ``None``.
            Passed to :py:func:`decode`.
    """
    cuda_index, stream = torch_cuda_warmup(device_index)

    device_config = spdl.io.cuda_config(
        device_index=cuda_index,
        allocator=(
            torch.cuda.caching_allocator_alloc,
            torch.cuda.caching_allocator_delete,
        ),
        stream=stream.cuda_stream,
    )

    for i in range(3):
        # Label each pass so that they can be told apart in a profiler trace.
        with torch.autograd.profiler.record_function(f"decode_{i}"):
            decode(src, device_config, post_processing_params, profiler, plot_dir)
164
165
def main(args: list[str] | None = None) -> None:
    """The main entrypoint for the CLI.

    Parses the command line arguments, configures logging, optionally
    enables the PyTorch profiler and runs the benchmark.

    Args:
        args: The command line arguments. By default it reads ``sys.argv``.
    """
    ns, _ = parse_args(args)

    logging.basicConfig(level=logging.INFO)

    # Only forward dimensions that are enabled (positive), so the mapping
    # holds integers only. NOTE(review): this relies on the decoder's default
    # for an omitted dimension being "no rescale" -- confirm against
    # ``spdl.io.streaming_load_video_nvdec``.
    post_process: dict[str, int] = {}
    if ns.width > 0:
        post_process["scale_width"] = ns.width
    if ns.height > 0:
        post_process["scale_height"] = ns.height

    prof = None
    with contextlib.ExitStack() as stack:
        if ns.trace_path:
            # The profiler is entered only when tracing is requested; the
            # trace is exported to the given path when it is ready.
            prof = stack.enter_context(
                profile(
                    with_stack=True,
                    on_trace_ready=lambda p: p.export_chrome_trace(ns.trace_path),
                )
            )

        run(ns.input_file, ns.device_index, post_process, prof, ns.plot_dir)
187
188
189if __name__ == "__main__":
190    main()

Functions

Functions

main(args: list[str] | None = None) None[source]

The main entrypoint for the CLI.

parse_args(args: list[str] | None = None) tuple[Namespace, list[str]][source]

Parse command line arguments.

Parameters:

args – The command line arguments. By default it reads sys.argv.

Returns:

Tuple of parsed arguments and unused arguments, as returned by argparse.ArgumentParser.parse_known_args().

run(src: str, device_index: int | None, post_processing_params: dict[str, int], profiler: profile, plot_dir: Path) None[source]

Run the benchmark.

decode(src: str, device_config: CUDAConfig, post_processing_params: dict[str, int], profiler: profile | None, plot_dir: Path | None) None[source]

Decode video in streaming fashion with optional resizing, profiling and exporting.

Parameters:
  • src – The path or URL to the source video.

  • device_config – The GPU configuration.

  • post_processing_params – Post processing argument. See spdl.io.streaming_load_video_nvdec().

  • profiler – PyTorch Profiler or None.

  • plot_dir – If provided, the decoded frames are exported as images to the directory.

torch_cuda_warmup(device_index: int | None) tuple[int, Stream][source]

Initialize the CUDA context and perform a dry run.

Parameters:

device_index – The CUDA device to use. If None, the last available device is used.