Commit b323333d authored by Vladimir Malenovsky
Browse files

adjust pre-amble and post-amble to 20ms boundary

parent c88f0b6d
Loading
Loading
Loading
Loading
Loading
+83 −37
(diff table header: original line number | diff line number | diff line)
@@ -32,7 +32,6 @@

import logging
from itertools import groupby, repeat
from math import floor
from pathlib import Path

import numpy as np
@@ -196,22 +195,37 @@ def generate_ambi_scene(
    # extract the number of audio sources
    N_inputs = len(np.atleast_1d(scene["input"]))

    # initialize output dirs
    # get the output filename
    output_filename = Path(scene["output"]).parent / (
        cfg.use_output_prefix + Path(scene["output"]).name
    )

    # initialize output dirs
    dir_path = output_filename.parent
    if dir_path and not dir_path.exists():
        dir_path.mkdir(parents=True, exist_ok=True)

    # initialize output audio object
    # initialize output SBA object
    y = audio.SceneBasedAudio(cfg.format)
    y.fs = cfg.fs

    # set the frame length
    frame_len = int(cfg.fs / 50)

    # repeat for all source files
    offset = 0
    for i in range(N_inputs):
        # parse parameters from the scene description
        source_file = np.atleast_1d(scene["input"])[i]
        IR_file = np.atleast_1d(scene["IR"])[i]
        source_file = (
            scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
        )
        IR_file = (
            scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]
        )

        # get input filename and IR filename
        input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)
        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

        # read the overlap length
        if "shift" in scene.keys():
@@ -223,6 +237,13 @@ def generate_ambi_scene(
        else:
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
        else:
            source_shift = int(np.ceil(source_shift / frame_len) * frame_len)

        # read the level
        if "level" in scene.keys():
            level = (
@@ -235,19 +256,22 @@ def generate_ambi_scene(

        logger.info(f"Convolving {source_file} with {IR_file}")

        # get input filename and IR filename
        input_filename = Path(source_file).parent / (
            cfg.use_input_prefix + Path(source_file).name
        )
        IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

        # read source file
        x = audio.fromfile("MONO", input_filename, fs=cfg.fs)
        x = audio.fromfile("MONO", input_filename)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
            logger.warning(
                f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
            )
            resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
            x.audio = resampled_audio
            x.fs = cfg.fs

        # read the IR file (!must be in target format!)
        IR = audio.fromfile(cfg.format, IR_filename, fs=cfg.IR_fs)
        IR = audio.fromfile(cfg.format, IR_filename)

        # convolve with the FOA/HOA2/HOA3 IR
        # convolve MONO source audio with FOA/HOA2/HOA3 IR -> results in FOA/HOA2/HOA3 audio object
        if cfg.format == "FOA":
            x = reverb_foa(x, IR)
        elif cfg.format == "HOA2":
@@ -258,46 +282,69 @@ def generate_ambi_scene(
        # adjust the level of the target signal
        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")

        # shift the source signal (positive shift creates overlap, negative shift creates a gap)
        if int(floor(-source_shift)) != 0:
            x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0])

        # get the number of frames (multiple of 20ms)
        frame_len = int(x.fs / 50)

        # ensure the length of the audio source signal is a multiple of 20ms
        if len(x.audio) % frame_len != 0:
            # pad with zeros to ensure that the signal length is a multiple of 20ms
            if len(x.audio) % frame_len != 0:
                N_pad = int(frame_len - len(x.audio) % frame_len)
                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)

        # add the convolved FOA/HOA2/HOA3 audio source signal to the output signal
        if y.audio is None:
            # add source signal to the array of all source signals
            y.audio = x.audio.copy()
            y.fs = x.fs

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            offset = source_shift
        else:
            # adjust the signal length (trim from the end or pad with zeros) to align its length with the previous signal(s)
            N_pad = y.audio.shape[0] - x.audio.shape[0]
            if N_pad != 0:
                x.audio = audioarray.trim(
                    x.audio, x.fs, limits=[0, -N_pad], samples=True
                )
            # shift the beginning of the audio source signal
            delta_offset = source_shift - offset
            if delta_offset > 0:
                # insert zeros to the existing output signal to shift it right
                y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True)
                offset = source_shift
            else:
                # insert zeros to the new audio source signal to shift it right
                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True)

            # adjust the length of the audio source signal
            delta_length = len(x.audio) - len(y.audio)
            if delta_length > 0:
                # pad zeros to the existing output signal
                y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True)
            else:
                # pad zeros to the new audio source signal
                x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True)

            # superimpose
            y.audio += x.audio

    # append pre-amble and post-amble to all sources
    y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble])
    # append pre-amble and post-amble
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True)

    # add random noise
    if cfg.add_low_level_random_noise:
        # create uniformly distributed noise between -4 and 4
        np.random.seed(SEED_RANDOM_NOISE)
        noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")

        # superimpose
        y.audio += noise

    # write the FOA/HOA2/HOA3 audio into output file
    # adjust the length of the output signal
    if "duration" in cfg.__dict__:
        # trim the output signal such that the total duration is X seconds
        duration = int(cfg.duration * cfg.fs)  # convert to samples
    else:
        # do not change the length of the audio signal
        duration = len(y.audio)
    duration = int(np.floor(duration / frame_len) * frame_len)  # ensure multiple of 20ms
    if len(y.audio) != duration:
        y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)

    # write the FOA/HOA2/HOA3 audio signal into output file
    audiofile.write(output_filename, y.audio, y.fs)

    # convert to BINAURAL, if option was chosen
@@ -314,4 +361,3 @@ def generate_ambi_scene(
            binaudio.fs,
        )
        logger.info(f"Written BINAURAL output to: {binaural_output_filename}")
    return
+3 −1
(diff table header: original line number | diff line number | diff line)
@@ -388,7 +388,9 @@ def generate_ismN_scene(
        y.metadata_files.insert(i, str(output_filename.with_suffix(f".{i}.csv")))

    # append pre-amble and post-amble
    metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000])
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True)

    # add random noise
    if cfg.add_low_level_random_noise:
+5 −3
(diff table header: original line number | diff line number | diff line)
@@ -277,7 +277,7 @@ def generate_OMASA_scene(
            sys.exit(-1)

        # read source file
        x = audio.fromfile(fmt, input_filename, fs=cfg.fs)
        x = audio.fromfile(fmt, input_filename)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
@@ -290,7 +290,7 @@ def generate_OMASA_scene(

        # adjust the level of the source file
        if fmt in ["FOA", "HOA2", "HOA3"]:
            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True)
            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
        else:
            x.audio, _ = loudness_norm(x, level, loudness_format="MONO")

@@ -436,7 +436,9 @@ def generate_OMASA_scene(
            y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")))

    # append pre-amble and post-amble
    metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000])
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True)

    # add random noise
    if cfg.add_low_level_random_noise:
+8 −14
(diff table header: original line number | diff line number | diff line)
@@ -33,7 +33,6 @@
import logging
import sys
from itertools import groupby, repeat
from math import floor
from pathlib import Path

import numpy as np
@@ -182,7 +181,7 @@ def generate_OSBA_scene(
    N_inputs = len(np.atleast_1d(scene["input"]))
    N_ISMs = N_inputs - 1

    # get input and output filenames
    # get OSBA format and output filename
    osba_format = f"ISM{N_ISMs}SBA{cfg.sba_order}"
    output_filename = Path(scene["output"]).parent / (
        cfg.use_output_prefix + Path(scene["output"]).name
@@ -209,9 +208,7 @@ def generate_OSBA_scene(
        )

        # get input filename
        input_filename = Path(source_file).parent / (
            cfg.use_input_prefix + Path(source_file).name
        )
        input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)

        # read azimuth and elevation information
        source_azi = (
@@ -254,11 +251,6 @@ def generate_OSBA_scene(

        logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")

        # get input filename
        input_filename = Path(source_file).parent / (
            cfg.use_input_prefix + Path(source_file).name
        )

        # get the number of channels from the .wav file header
        wav_header = audiofile.parse_wave_header(input_filename)
        N_channels = wav_header["channels"]
@@ -280,7 +272,7 @@ def generate_OSBA_scene(
            sys.exit(-1)

        # read source file
        x = audio.fromfile(fmt, input_filename, fs=cfg.fs)
        x = audio.fromfile(fmt, input_filename)

        # resample to the target fs if necessary
        if x.fs != cfg.fs:
@@ -293,7 +285,7 @@ def generate_OSBA_scene(

        # adjust the level of the source file
        if fmt in ["FOA", "HOA2", "HOA3"]:
            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True)
            x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")
        else:
            x.audio, _ = loudness_norm(x, level, loudness_format="MONO")

@@ -307,7 +299,7 @@ def generate_OSBA_scene(
        # get the number of frames (multiple of 20ms)
        N_frames = int(len(x.audio) / frame_len)

        # convert the input audio source signal to ISM
        # convert the input MONO audio source signal to ISM1 object
        if fmt == "MONO":
            # convert MONO to ISM1
            x_ism = audio.ObjectBasedAudio("ISM1")  # ISM with 1 channel
@@ -426,7 +418,9 @@ def generate_OSBA_scene(
            y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")))

    # append pre-amble and post-amble
    metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000])
    preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len)  # convert to samples and ensure multiple of 20ms
    metadata.trim_meta(y, limits=[-preamble, -postamble], samples=True)

    # add random noise
    if cfg.add_low_level_random_noise: