Module audiocraft.data.audio
Audio IO methods are defined in this module (info, read, write). We rely on the av library for faster reads when possible, otherwise on torchaudio.
Functions
def audio_info(filepath: str | pathlib.Path) ‑> AudioFileInfo
-
Expand source code
def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Return metadata for an audio file, dispatching on the file suffix.

    torchaudio no longer returns useful duration information for some
    formats like mp3, so we route to soundfile or av instead.

    Args:
        filepath (str or Path): Path to the audio file to inspect.
    Returns:
        AudioFileInfo: sample rate, duration and channel count of the file.
    """
    path = Path(filepath)
    # ffmpeg has some weird issue with flac, so use soundfile there.
    # TODO: Validate .ogg can be safely read with _av_info.
    if path.suffix in ('.flac', '.ogg'):
        return _soundfile_info(path)
    return _av_info(path)
def audio_read(filepath: str | pathlib.Path,
seek_time: float = 0.0,
duration: float = -1.0,
pad: bool = False) ‑> Tuple[torch.Tensor, int]-
Expand source code
def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
               duration: float = -1.0, pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
    """Read audio by picking the most appropriate backend tool based on the audio format.

    Args:
        filepath (str or Path): Path to audio file to read.
        seek_time (float): Time at which to start reading in the file.
        duration (float): Duration to read from the file. If set to -1, the whole file is read.
        pad (bool): Pad output audio if not reaching expected duration.
    Returns:
        tuple of torch.Tensor, int: Tuple containing audio data and sample rate.
    """
    path = Path(filepath)
    if path.suffix not in ('.flac', '.ogg'):
        wav, sr = _av_read(filepath, seek_time, duration)
    else:
        # There is some bug with ffmpeg and reading flac, so go through soundfile.
        # TODO: check if we can safely use av_read for .ogg.
        info = _soundfile_info(filepath)
        frames = int(duration * info.sample_rate) if duration > 0 else -1
        frame_offset = int(seek_time * info.sample_rate)
        wav, sr = soundfile.read(filepath, start=frame_offset, frames=frames, dtype=np.float32)
        assert info.sample_rate == sr, f"Mismatch of sample rates {info.sample_rate} {sr}"
        # soundfile returns (frames, channels); transpose to (channels, frames).
        wav = torch.from_numpy(wav).t().contiguous()
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)
    if pad and duration > 0:
        expected_frames = int(duration * sr)
        wav = F.pad(wav, (0, expected_frames - wav.shape[-1]))
    return wav, sr
Read audio by picking the most appropriate backend tool based on the audio format.
Args
filepath
:str
or Path
- Path to audio file to read.
seek_time
:float
- Time at which to start reading in the file.
duration
:float
- Duration to read from the file. If set to -1, the whole file is read.
pad
:bool
- Pad output audio if not reaching expected duration.
Returns
tuple
of torch.Tensor, int
- Tuple containing audio data and sample rate.
def audio_write(stem_name: str | pathlib.Path,
wav: torch.Tensor,
sample_rate: int,
format: str = 'wav',
mp3_rate: int = 320,
ogg_rate: int | None = None,
normalize: bool = True,
strategy: str = 'peak',
peak_clip_headroom_db: float = 1,
rms_headroom_db: float = 18,
loudness_headroom_db: float = 14,
loudness_compressor: bool = False,
log_clipping: bool = True,
make_parent_dir: bool = True,
add_suffix: bool = True) ‑> pathlib.Path-
Expand source code
def audio_write(stem_name: tp.Union[str, Path], wav: torch.Tensor, sample_rate: int,
                format: str = 'wav', mp3_rate: int = 320, ogg_rate: tp.Optional[int] = None,
                normalize: bool = True, strategy: str = 'peak',
                peak_clip_headroom_db: float = 1, rms_headroom_db: float = 18,
                loudness_headroom_db: float = 14, loudness_compressor: bool = False,
                log_clipping: bool = True, make_parent_dir: bool = True,
                add_suffix: bool = True) -> Path:
    """Convenience function for saving audio to disk. Returns the filename the audio was written to.

    Args:
        stem_name (str or Path): Filename without extension which will be added automatically.
        wav (torch.Tensor): Audio data to save.
        sample_rate (int): Sample rate of audio data.
        format (str): Either "wav", "mp3", "ogg", or "flac".
        mp3_rate (int): kbps when using mp3s.
        ogg_rate (int): kbps when using ogg/vorbis. If not provided, let ffmpeg decide for itself.
        normalize (bool): if `True` (default), normalizes according to the prescribed strategy
            (see after). If `False`, the strategy is only used in case clipping would happen.
        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak', i.e. audio is
            normalized by its largest value. RMS normalizes by root-mean-square with extra headroom
            to avoid clipping. 'clip' just clips.
        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
            than the `peak_clip` one to avoid further clipping.
        loudness_headroom_db (float): Target loudness for loudness normalization.
        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
        log_clipping (bool): If True, basic logging on stderr when clipping still
            occurs despite strategy (only for 'rms').
        make_parent_dir (bool): Make parent directory if it doesn't exist.
        add_suffix (bool): If True (default), append the format suffix to `stem_name`.
    Returns:
        Path: Path of the saved audio.
    """
    assert wav.dtype.is_floating_point, "wav is not floating point"
    if wav.dim() == 1:
        wav = wav[None]
    elif wav.dim() > 2:
        raise ValueError("Input wav should be at most 2 dimension.")
    assert wav.isfinite().all(), "wav contains non-finite values (NaN or Inf)"
    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db, rms_headroom_db,
                          loudness_headroom_db, loudness_compressor,
                          log_clipping=log_clipping, sample_rate=sample_rate,
                          stem_name=str(stem_name))
    if format == 'mp3':
        suffix = '.mp3'
        flags = ['-f', 'mp3', '-c:a', 'libmp3lame', '-b:a', f'{mp3_rate}k']
    elif format == 'wav':
        suffix = '.wav'
        flags = ['-f', 'wav', '-c:a', 'pcm_s16le']
    elif format == 'ogg':
        suffix = '.ogg'
        flags = ['-f', 'ogg', '-c:a', 'libvorbis']
        if ogg_rate is not None:
            flags += ['-b:a', f'{ogg_rate}k']
    elif format == 'flac':
        suffix = '.flac'
        flags = ['-f', 'flac']
    else:
        # Fix: the previous message wrongly claimed only wav or mp3 were supported.
        raise RuntimeError(f"Invalid format {format}. Only wav, mp3, ogg and flac are supported.")
    if not add_suffix:
        suffix = ''
    path = Path(str(stem_name) + suffix)
    if make_parent_dir:
        path.parent.mkdir(exist_ok=True, parents=True)
    try:
        _piping_to_ffmpeg(path, wav, sample_rate, flags)
    except Exception:
        if path.exists():
            # we do not want to leave half written files around.
            path.unlink()
        raise
    return path
Convenience function for saving audio to disk. Returns the filename the audio was written to.
Args
stem_name
:str
or Path
- Filename without extension which will be added automatically.
wav
:torch.Tensor
- Audio data to save.
sample_rate
:int
- Sample rate of audio data.
format
:str
- Either "wav", "mp3", "ogg", or "flac".
mp3_rate
:int
- kbps when using mp3s.
ogg_rate
:int
- kbps when using ogg/vorbis. If not provided, let ffmpeg decide for itself.
normalize
:bool
- if
True
(default), normalizes according to the prescribed strategy (see after). If False, the strategy is only used in case clipping would happen.
strategy
:str
- Can be either 'clip', 'peak', or 'rms'. Default is 'peak', i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square with extra headroom to avoid clipping. 'clip' just clips.
peak_clip_headroom_db
:float
- Headroom in dB when doing 'peak' or 'clip' strategy.
rms_headroom_db
:float
- Headroom in dB when doing 'rms' strategy. This must be much larger
than the
peak_clip
one to avoid further clipping.
loudness_headroom_db
:float
- Target loudness for loudness normalization.
loudness_compressor
:bool
- Uses tanh for soft clipping when strategy is 'loudness'.
log_clipping
:bool
- If True, basic logging on stderr when clipping still occurs despite strategy (only for 'rms').
make_parent_dir
:bool
- Make parent directory if it doesn't exist.
Returns
Path
- Path of the saved audio.
def get_spec(y, sr=16000, n_fft=4096, hop_length=128, dur=8) ‑> numpy.ndarray
-
Expand source code
def get_spec(y, sr=16000, n_fft=4096, hop_length=128, dur=8) -> np.ndarray:
    """Get the mel-spectrogram from the raw audio.

    Args:
        y (numpy array): raw input
        sr (int): Sampling rate. Default is 16000.
        n_fft (int): Number of samples per FFT. Default is 4096.
        hop_length (int): Number of samples between successive frames. Default is 128.
        dur (float): Maximum duration of the spectrogram. NOTE(review): currently
            unused by this function — callers (e.g. save_spectrograms) crop the
            audio before calling. Kept for backward compatibility.
    Returns:
        spectrogram as a numpy array (dB scale)
    """
    # librosa is imported lazily so the module loads without it installed.
    import librosa
    import librosa.display

    spectrogram = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
    )
    # Convert power to dB, referenced to the peak power for a stable scale.
    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
    return spectrogram_db
Get the mel-spectrogram from the raw audio.
Args
y
:numpy array
- raw input
sr
:int
- Sampling rate
n_fft
:int
- Number of samples per FFT. Default is 4096.
hop_length
:int
- Number of samples between successive frames. Default is 128.
dur
:float
- Maximum duration to get the spectrograms
Returns
spectro histogram as a numpy array
def save_spectrograms(ys: List[numpy.ndarray],
sr: int,
path: str,
names: List[str],
n_fft: int = 4096,
hop_length: int = 128,
dur: float = 8.0)-
Expand source code
def save_spectrograms(
    ys: tp.List[np.ndarray],
    sr: int,
    path: str,
    names: tp.List[str],
    n_fft: int = 4096,
    hop_length: int = 128,
    dur: float = 8.0,
):
    """Plot stacked spectrograms for a list of audio signals and save the figure.

    Args:
        ys: List of raw audio signals (one spectrogram is computed per entry).
        sr (int): Sampling rate of the audio signals.
        path (str): Path to the plot file.
        names: name of each spectrogram plot
        n_fft (int): Number of samples per FFT. Default is 4096.
        hop_length (int): Number of samples between successive frames. Default is 128.
        dur (float): Maximum duration to plot the spectrograms
    Returns:
        None (plots the spectrogram using matplotlib)
    """
    # Plotting deps are imported lazily so the module loads without them installed.
    import matplotlib as mpl  # type: ignore
    import matplotlib.pyplot as plt  # type: ignore
    import librosa.display

    # Default names match the watermarking use case this helper was written for.
    if not names:
        names = ["Ground Truth", "Audio Watermarked", "Watermark"]
    ys = [wav[: int(dur * sr)] for wav in ys]  # crop
    assert len(names) == len(
        ys
    ), f"There are {len(ys)} wavs but {len(names)} names ({names})"

    # Set matplotlib stuff
    BIGGER_SIZE = 10
    SMALLER_SIZE = 8
    linewidth = 234.8775  # linewidth in pt
    plt.rc("font", size=BIGGER_SIZE, family="serif")  # controls default text sizes
    plt.rcParams["font.family"] = "DeJavu Serif"
    plt.rcParams["font.serif"] = ["Times New Roman"]
    plt.rc("axes", titlesize=BIGGER_SIZE)  # fontsize of the axes title
    plt.rc("axes", labelsize=BIGGER_SIZE)  # fontsize of the x and y labels
    plt.rc("xtick", labelsize=BIGGER_SIZE)  # fontsize of the tick labels
    plt.rc("ytick", labelsize=SMALLER_SIZE)  # fontsize of the tick labels
    plt.rc("legend", fontsize=BIGGER_SIZE)  # legend fontsize
    plt.rc("figure", titlesize=BIGGER_SIZE)
    # Figure sized in inches from the pt linewidth (72 pt per inch).
    height = 1.6 * linewidth / 72.0
    fig, ax = plt.subplots(
        nrows=len(ys),
        ncols=1,
        sharex=True,
        figsize=(linewidth / 72.0, height),
    )
    fig.tight_layout()

    # Plot the spectrogram
    for i, ysi in enumerate(ys):
        spectrogram_db = get_spec(ysi, sr=sr, n_fft=n_fft, hop_length=hop_length)
        if i == 0:
            # One shared colorbar, placed to the right of the axes column and
            # spanning from the bottom axis to the top axis. Its normalization
            # uses the first spectrogram's min/max.
            cax = fig.add_axes(
                [
                    ax[0].get_position().x1 + 0.01,  # type: ignore
                    ax[-1].get_position().y0,
                    0.02,
                    ax[0].get_position().y1 - ax[-1].get_position().y0,
                ]
            )
            fig.colorbar(
                mpl.cm.ScalarMappable(
                    norm=mpl.colors.Normalize(
                        np.min(spectrogram_db), np.max(spectrogram_db)
                    ),
                    cmap="magma",
                ),
                ax=ax,
                orientation="vertical",
                format="%+2.0f dB",
                cax=cax,
            )
        librosa.display.specshow(
            spectrogram_db,
            sr=sr,
            hop_length=hop_length,
            x_axis="time",
            y_axis="mel",
            ax=ax[i],
        )
        ax[i].set(title=names[i])
        # Only outer axes keep tick labels; drop per-axis y label text.
        ax[i].yaxis.set_label_text(None)
        ax[i].label_outer()
    fig.savefig(path, bbox_inches="tight")
    plt.close()
Plot a spectrogram for an audio file.
Args
ys
- List of audio spectrograms
sr
:int
- Sampling rate of the audio file.
path
:str
- Path to the plot file.
names
- name of each spectrogram plot
n_fft
:int
- Number of samples per FFT. Default is 4096.
hop_length
:int
- Number of samples between successive frames. Default is 128.
dur
:float
- Maximum duration to plot the spectrograms
Returns
None (plots the spectrogram using matplotlib)
Classes
class AudioFileInfo (sample_rate: int, duration: float, channels: int)
-
Expand source code
@dataclass(frozen=True)
class AudioFileInfo:
    """Immutable metadata about an audio file, as returned by `audio_info`."""
    sample_rate: int  # samples per second
    duration: float  # duration in seconds
    channels: int  # number of audio channels
AudioFileInfo(sample_rate: int, duration: float, channels: int)
Class variables
var channels : int
var duration : float
var sample_rate : int