refactor to use source shifts (c88f0b6d) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

ivas_processing_scripts/generation/generate_stereo_items.py

+83 −36

Original line number	Diff line number	Diff line
		@@ -33,7 +33,6 @@
		import logging
		import os
		from itertools import groupby, repeat
		from math import floor
		from pathlib import Path

		import numpy as np
		@@ -202,22 +201,37 @@ def generate_stereo_scene(
		# extract the number of audio sources
		N_inputs = len(np.atleast_1d(scene["input"]))

		# initialize output dirs
		# get the output filename
		output_filename = Path(scene["output"]).parent / (
		cfg.use_output_prefix + Path(scene["output"]).name
		)

		# initialize output dirs
		dir_path = output_filename.parent
		if dir_path and not dir_path.exists():
		dir_path.mkdir(parents=True, exist_ok=True)

		# initialize output audio object
		# initialize output STEREO object
		y = audio.ChannelBasedAudio(cfg.format)
		y.fs = cfg.fs

		# set the frame length
		frame_len = int(cfg.fs / 50)

		# repeat for all source files
		offset = 0
		for i in range(N_inputs):
		# parse parameters from the scene description
		source_file = np.atleast_1d(scene["input"])[i]
		IR_file = np.atleast_1d(scene["IR"])[i]
		source_file = (
		scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
		)
		IR_file = (
		scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]
		)

		# get input filename and IR filename
		input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)
		IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

		# read the overlap length
		if "shift" in scene.keys():
		@@ -229,6 +243,13 @@ def generate_stereo_scene(
		else:
		source_shift = 0.0

		# convert overlap to samples and ensure it is a multiple of 20ms
		source_shift = source_shift * cfg.fs
		if source_shift >= 0:
		source_shift = int(np.floor(source_shift / frame_len) * frame_len)
		else:
		source_shift = int(np.ceil(source_shift / frame_len) * frame_len)

		# read the level
		if "level" in scene.keys():
		level = (
		@@ -241,62 +262,88 @@ def generate_stereo_scene(

		logger.info(f"Convolving {source_file} with {IR_file}")

		# get input filename and IR filename
		input_filename = Path(source_file).parent / (
		cfg.use_input_prefix + Path(source_file).name
		)
		IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

		# read source file
		x = audio.fromfile("MONO", input_filename, fs=cfg.fs)
		x = audio.fromfile("MONO", input_filename)

		# resample to the target fs if necessary
		if x.fs != cfg.fs:
		logger.warning(
		f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
		)
		resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
		x.audio = resampled_audio
		x.fs = cfg.fs

		# read the IR file (!must be in STEREO format!)
		IR = audio.fromfile("STEREO", IR_filename, fs=cfg.IR_fs)
		IR = audio.fromfile("STEREO", IR_filename)

		# convolve mono source signal with stereo IR
		# convolve MONO source audio with STEREO IR -> results in STEREO audio object
		x = reverb_stereo(x, IR)

		# adjust the level of the stereo signal
		x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")

		# shift the source signal (positive shift creates overlap, negative shift creates a gap)
		if int(floor(-source_shift)) != 0:
		x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0])

		# get the number of frames (multiple of 20ms)
		frame_len = int(x.fs / 50)

		# ensure the length of the audio source signal is a multiple of 20ms
		if len(x.audio) % frame_len != 0:
		# pad with zeros to ensure that the signal length is a multiple of 20ms
		if len(x.audio) % frame_len != 0:
		N_pad = int(frame_len - len(x.audio) % frame_len)
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)

		# add the convolved STEREO audio source signal to the output signal
		if y.audio is None:
		# add source signal to the array of all source signals
		y.audio = x.audio.copy()
		y.fs = x.fs

		# if source_shift < 0:
		# # insert zeros to the new audio source signal to shift it right
		# metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
		offset = source_shift
		else:
		# pad the signal with zeros to have the same length as the previous signal(s)
		N_pad = y.audio.shape[0] - x.audio.shape[0]
		if N_pad != 0:
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, -N_pad], samples=True
		)
		# shift the beginning of the audio source signal
		delta_offset = source_shift - offset
		if delta_offset > 0:
		# insert zeros to the existing output signal to shift it right
		y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True)
		offset = source_shift
		else:
		# insert zeros to the new audio source signal to shift it right
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True)

		# adjust the length of the audio source signal
		delta_length = len(x.audio) - len(y.audio)
		if delta_length > 0:
		# pad zeros to the existing output signal
		y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True)
		else:
		# pad zeros to the new audio source signal
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True)

		# superimpose
		y.audio += x.audio

		# append pre-amble and post-amble to all sources
		y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble])
		# append pre-amble and post-amble
		preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True)

		# add random noise
		if cfg.add_low_level_random_noise:
		# create uniformly distributed noise between -4 and 4
		np.random.seed(SEED_RANDOM_NOISE)
		noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")

		# superimpose
		y.audio += noise

		# write the output STEREO audio signal into output file
		# adjust the length of the output signal
		if "duration" in cfg.__dict__:
		# trim the output signal such that the total duration is X seconds
		duration = int(cfg.duration * cfg.fs) # convert to samples
		else:
		# do not change the length of the audio signal
		duration = len(y.audio)
		duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms
		if len(y.audio) != duration:
		y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)

		# write the STEREO audio signal into output file
		audiofile.write(output_filename, y.audio, y.fs)