support for X(i_ref) notation to allow specifying overlap between items (93d4dbfe) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

ivas_processing_scripts/generation/generate_ismN_items.py

+34 −12

Original line number	Diff line number	Diff line
		@@ -30,6 +30,7 @@
		# the United Nations Convention on Contracts on the International Sales of Goods.
		#
		import logging
		import re
		import sys
		from itertools import groupby, repeat
		from pathlib import Path
		@@ -196,6 +197,7 @@ def generate_ismN_scene(

		# repeat for all source files
		offset = 0
		end_position = []
		for i in range(N_inputs):
		# read input filename
		source_file = (
		@@ -232,16 +234,33 @@ def generate_ismN_scene(
		if isinstance(scene["shift"], list)
		else scene["shift"]
		)

		# check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
		# of the reference signal (0-based index)
		if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
		# extract X and i_ref
		match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i])

		if match:
		overlap = float(match.group(1))
		overlap_ref = int(match.group(2))
		else:
		scene_shift_str = scene["shift"][i]
		logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!")
		sys.exit(-1)

		# calculate absolute shift of the source signal in seconds
		source_shift = end_position[overlap_ref] + overlap
		else:
		source_shift = 0.0

		# convert shift from seconds to samples and ensure it is a multiple of 20ms
		source_shift_in_seconds = source_shift
		source_shift = source_shift * cfg.fs
		if source_shift >= 0:
		source_shift = int(np.floor(source_shift / frame_len) * frame_len)
		else:
		source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
		source_shift_in_seconds = source_shift / cfg.fs

		# read the level
		if "level" in scene.keys():
		@@ -278,6 +297,9 @@ def generate_ismN_scene(
		x = audio.fromtype("ISM1")
		x.audio, x.fs = audiofile.read(input_filename)

		# record the end position (in seconds) of the source signal, i.e. its duration plus the shift of its starting position
		end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

		# resample to the target fs if necessary
		if x.fs != cfg.fs:
		logger.warning(
		@@ -288,12 +310,12 @@ def generate_ismN_scene(
		x.fs = cfg.fs

		# adjust the level of the audio source file (need to convert to MONO first)
		if level is None:
		# do not change the level of the audio source signal
		logger.info("-- Level of the audio source signal is not changed")
		elif np.isinf(level):
		if np.isinf(level):
		# set all channels to zero
		x.audio = np.zeros_like(x.audio)
		elif level is None:
		# do not change the level of the audio source signal
		logger.info("-- Level of the audio source signal is not changed")
		else:
		x_temp = audio.ChannelBasedAudio(
		"MONO"
		@@ -391,21 +413,21 @@ def generate_ismN_scene(
		y.object_pos = x.object_pos.copy()
		y.fs = x.fs

		if source_shift < 0:
		if source_shift > 0:
		# insert zeros to the new audio source signal to shift it right
		metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
		metadata.trim_meta(y, limits=[-source_shift, 0], samples=True)
		else:
		offset = source_shift
		else:
		# shift the beginning of the audio source signal
		delta_offset = source_shift - offset
		if delta_offset > 0:
		if delta_offset < 0:
		# insert zeros to the previous ISM signal(s) to shift them right
		metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True)
		metadata.trim_meta(y, limits=[delta_offset, 0], samples=True)
		offset = source_shift
		else:
		# insert zeros to the new audio source signal to shift it right
		metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
		metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True)

		# adjust the length of the audio source signal
		delta_length = len(x.audio) - len(y.audio)
		@@ -443,14 +465,14 @@ def generate_ismN_scene(
		noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
		y.audio += noise

		# trim the output signal such if the total duration exceeds X seconds
		# trim the output signal if the total duration exceeds X seconds
		if "duration" in cfg.__dict__:
		# convert from seconds to samples (ensure multiple of 20ms)
		duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)

		# check if the current length of the output signal exceeds the duration
		if len(y.audio) > duration:
		metadata.trim_meta(y, limits=[0, duration], samples=True)
		metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

		# adjust the loudness of the output signal
		if "loudness" in cfg.__dict__:

ivas_processing_scripts/generation/generate_masa_items.py

+47 −10

Original line number	Diff line number	Diff line
		@@ -31,6 +31,7 @@
		#

		import logging
		import re
		import sys
		from itertools import groupby, repeat
		from pathlib import Path
		@@ -209,6 +210,7 @@ def generate_MASA_scene(
		# repeat for all source files
		offset = 0
		y_int = None
		end_position = []
		for i in range(N_inputs):
		# parse parameters from the scene description
		source_file = (
		@@ -232,13 +234,44 @@ def generate_MASA_scene(
		else:
		source_shift = 0.0

		# read the source shift length (in seconds)
		if "shift" in scene.keys():
		source_shift = (
		scene["shift"][i]
		if isinstance(scene["shift"], list)
		else scene["shift"]
		)

		# check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
		# of the reference signal (0-based index)
		if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
		# extract X and i_ref
		match = re.match(
		r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i]
		)

		if match:
		overlap = float(match.group(1))
		overlap_ref = int(match.group(2))
		else:
		scene_shift_str = scene["shift"][i]
		logger.error(
		f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!"
		)
		sys.exit(-1)

		# calculate absolute shift of the source signal in seconds
		source_shift = end_position[overlap_ref] - overlap
		else:
		source_shift = 0.0

		# convert shift from seconds to samples and ensure it is a multiple of 20ms
		source_shift_in_seconds = source_shift
		source_shift = source_shift * cfg.fs
		if source_shift >= 0:
		source_shift = int(np.floor(source_shift / frame_len) * frame_len)
		else:
		source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
		source_shift_in_seconds = source_shift / cfg.fs

		# read the level
		if "level" in scene.keys():
		@@ -295,6 +328,9 @@ def generate_MASA_scene(
		# read source file
		x = audio.fromfile("MONO", input_filename)

		# record the end position (in seconds) of the source signal, i.e. its duration plus the shift of its starting position
		end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

		# resample to the target fs if necessary
		if x.fs != cfg.fs:
		logger.warning(
		@@ -339,26 +375,26 @@ def generate_MASA_scene(
		# this is the first SBA source signal
		y_int.audio = x.audio.copy()

		if source_shift < 0:
		if source_shift > 0:
		# insert zeros to the first SBA source signal to shift it right
		y_int.audio = audioarray.trim(
		y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True
		y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True
		)
		else:
		offset = source_shift
		else:
		# shift the beginning of the audio source signal
		delta_offset = source_shift - offset
		if delta_offset > 0:
		if delta_offset < 0:
		# insert zeros to the output SBA signal to shift it right
		y_int.audio = audioarray.trim(
		y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True
		y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True
		)
		offset = source_shift
		else:
		# insert zeros to the new SBA source signal to shift it right
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[delta_offset, 0], samples=True
		x.audio, x.fs, limits=[-delta_offset, 0], samples=True
		)

		# adjust the length of the audio source signal
		@@ -396,14 +432,15 @@ def generate_MASA_scene(
		# trim the output signal if the total duration exceeds X seconds
		if "duration" in cfg.__dict__:
		# convert from seconds to samples (ensure multiple of 20ms)
		duration = int(
		np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len
		)
		duration = int(np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len)

		# check if the current length of the output signal exceeds the duration
		if len(y_int.audio) > duration:
		y_int.audio = audioarray.trim(
		y_int.audio, y_int.fs, limits=[0, len(y_int.audio) - duration], samples=True
		y_int.audio,
		y_int.fs,
		limits=[0, len(y_int.audio) - duration],
		samples=True,
		)

		# adjust the loudness of the output signal

ivas_processing_scripts/generation/generate_mc_items.py

+29 −7

Original line number	Diff line number	Diff line
		@@ -31,6 +31,7 @@
		#

		import logging
		import re
		import sys
		from itertools import groupby, repeat
		from pathlib import Path
		@@ -209,6 +210,7 @@ def generate_MC_scene(
		# repeat for all source files
		offset = 0
		y_int = None
		end_position = []
		for i in range(N_inputs):
		# parse parameters from the scene description
		source_file = (
		@@ -222,23 +224,40 @@ def generate_MC_scene(
		)
		IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

		# read the shift time in seconds
		# read the source shift length (in seconds)
		if "shift" in scene.keys():
		source_shift = (
		scene["shift"][i]
		if isinstance(scene["shift"], list)
		else scene["shift"]
		)

		# check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
		# of the reference signal (0-based index)
		if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
		# extract X and i_ref
		match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i])

		if match:
		overlap = float(match.group(1))
		overlap_ref = int(match.group(2))
		else:
		scene_shift_str = scene["shift"][i]
		logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!")
		sys.exit(-1)

		# calculate absolute shift of the source signal in seconds
		source_shift = end_position[overlap_ref] + overlap
		else:
		source_shift = 0.0

		# convert shift from seconds to samples and ensure it is a multiple of 20ms
		source_shift_in_seconds = source_shift
		source_shift = source_shift * cfg.fs
		if source_shift >= 0:
		source_shift = int(np.floor(source_shift / frame_len) * frame_len)
		else:
		source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
		source_shift_in_seconds = source_shift / cfg.fs

		# read the level
		if "level" in scene.keys():
		@@ -295,6 +314,9 @@ def generate_MC_scene(
		# read source file
		x = audio.fromfile("MONO", input_filename)

		# record the end position (in seconds) of the source signal, i.e. its duration plus the shift of its starting position
		end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

		# resample to the target fs if necessary
		if x.fs != cfg.fs:
		logger.warning(
		@@ -339,26 +361,26 @@ def generate_MC_scene(
		# this is the first SBA source signal
		y_int.audio = x.audio.copy()

		if source_shift < 0:
		if source_shift > 0:
		# insert zeros to the first SBA source signal to shift it right
		y_int.audio = audioarray.trim(
		y_int.audio, y_int.fs, limits=[source_shift, 0], samples=True
		y_int.audio, y_int.fs, limits=[-source_shift, 0], samples=True
		)
		else:
		offset = source_shift
		else:
		# shift the beginning of the audio source signal
		delta_offset = source_shift - offset
		if delta_offset > 0:
		if delta_offset < 0:
		# insert zeros to the output SBA signal to shift it right
		y_int.audio = audioarray.trim(
		y_int.audio, y_int.fs, limits=[-delta_offset, 0], samples=True
		y_int.audio, y_int.fs, limits=[delta_offset, 0], samples=True
		)
		offset = source_shift
		else:
		# insert zeros to the new SBA source signal to shift it right
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[delta_offset, 0], samples=True
		x.audio, x.fs, limits=[-delta_offset, 0], samples=True
		)

		# adjust the length of the audio source signal

ivas_processing_scripts/generation/generate_omasa_items.py

+31 −22

Original line number	Diff line number	Diff line
		@@ -31,6 +31,7 @@
		#

		import logging
		import re
		import sys
		from itertools import groupby, repeat
		from pathlib import Path
		@@ -183,6 +184,7 @@ def generate_OMASA_scene(

		# repeat for all source files
		offset = 0
		end_position = []
		for i in range(N_inputs):
		# parse parameters from the scene description
		source_file = (
		@@ -220,16 +222,33 @@ def generate_OMASA_scene(
		if isinstance(scene["shift"], list)
		else scene["shift"]
		)

		# check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
		# of the reference signal (0-based index)
		if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
		# extract X and i_ref
		match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i])

		if match:
		overlap = float(match.group(1))
		overlap_ref = int(match.group(2))
		else:
		scene_shift_str = scene["shift"][i]
		logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!")
		sys.exit(-1)

		# calculate absolute shift of the source signal in seconds
		source_shift = end_position[overlap_ref] + overlap
		else:
		source_shift = 0.0

		# convert shift from seconds to samples and ensure it is a multiple of 20ms
		source_shift_in_seconds = source_shift
		source_shift = source_shift * cfg.fs
		if source_shift >= 0:
		source_shift = int(np.floor(source_shift / frame_len) * frame_len)
		else:
		source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
		source_shift_in_seconds = source_shift / cfg.fs

		# read the level
		if "level" in scene.keys():
		@@ -300,6 +319,9 @@ def generate_OMASA_scene(
		# read source file
		x = audio.fromfile(fmt, input_filename)

		# record the end position (in seconds) of the source signal, i.e. its duration plus the shift of its starting position
		end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

		# resample to the target fs if necessary
		if x.fs != cfg.fs:
		logger.warning(
		@@ -417,21 +439,21 @@ def generate_OMASA_scene(
		# add the first audio source signal (should be FOA/HOA2/HOA3) to the array of all source signals
		y_int.audio = x.audio.copy()

		if source_shift < 0:
		if source_shift > 0:
		# insert zeros to the new audio source signal to shift it right
		metadata.trim_meta(y_int, limits=[source_shift, 0], samples=True)
		metadata.trim_meta(y_int, limits=[-source_shift, 0], samples=True)
		else:
		offset = source_shift
		else:
		# shift the beginning of the audio source signal
		delta_offset = source_shift - offset
		if delta_offset > 0:
		if delta_offset < 0:
		# insert zeros to the existing intermediate OSBA object to shift it right
		metadata.trim_meta(y_int, limits=[-delta_offset, 0], samples=True)
		metadata.trim_meta(y_int, limits=[delta_offset, 0], samples=True)
		offset = source_shift
		else:
		# insert zeros to the new audio source signal to shift it right
		metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
		metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True)

		# adjust the length of the audio source signal
		delta_length = len(x.audio) - len(y_int.audio)
		@@ -472,29 +494,16 @@ def generate_OMASA_scene(
		noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
		y_int.audio += noise

		# trim the output signal such if the total duration exceeds X seconds
		# trim the output signal if the total duration exceeds X seconds
		if "duration" in cfg.__dict__:
		<<<<<<< Updated upstream
		# trim the output signal such that the total duration is X seconds
		duration = int(cfg.duration * cfg.fs) # convert to samples
		else:
		# do not change the length of the audio signal
		duration = len(y_int.audio)
		duration = int(
		np.floor(duration / frame_len) * frame_len
		) # ensure multiple of 20ms
		if len(y_int.audio) != duration:
		metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True)
		=======
		# convert from seconds to samples (ensure multiple of 20ms)
		duration = int(
		np.floor(int(cfg.duration * cfg.fs) / frame_len) * frame_len
		)

		# check if the current length of the output signal exceeds the duration
		if len(y.audio) > duration:
		metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)
		>>>>>>> Stashed changes
		if len(y_int.audio) > duration:
		metadata.trim_meta(y_int, limits=[0, len(y_int.audio) - duration], samples=True)

		# adjust the loudness of the output signal
		if "loudness" in cfg.__dict__:

ivas_processing_scripts/generation/generate_osba_items.py

+29 −7

Original line number	Diff line number	Diff line
		@@ -31,6 +31,7 @@
		#

		import logging
		import re
		import sys
		from itertools import groupby, repeat
		from pathlib import Path
		@@ -187,6 +188,7 @@ def generate_OSBA_scene(

		# repeat for all source files
		offset = 0
		end_position = []
		for i in range(N_inputs):
		# parse parameters from the scene description
		source_file = (
		@@ -210,23 +212,40 @@ def generate_OSBA_scene(
		else scene["elevation"]
		)

		# read the shift time in seconds
		# read the source shift length (in seconds)
		if "shift" in scene.keys():
		source_shift = (
		scene["shift"][i]
		if isinstance(scene["shift"], list)
		else scene["shift"]
		)

		# check if shift is defined with X(i_ref) notation where X specifies the overlap value and i_ref is the index
		# of the reference signal (0-based index)
		if isinstance(scene["shift"][i], str) and "(" in scene["shift"][i]:
		# extract X and i_ref
		match = re.match(r"([+-]?\d*\.?\d+)[\(\[]([+-]?\d+)[\)\]]", scene["shift"][i])

		if match:
		overlap = float(match.group(1))
		overlap_ref = int(match.group(2))
		else:
		scene_shift_str = scene["shift"][i]
		logger.error(f"Unable to parse {scene_shift_str}. The specification of overlap or reference is incorrect!")
		sys.exit(-1)

		# calculate absolute shift of the source signal in seconds
		source_shift = end_position[overlap_ref] + overlap
		else:
		source_shift = 0.0

		# convert shift from seconds to samples and ensure it is a multiple of 20ms
		source_shift_in_seconds = source_shift
		source_shift = source_shift * cfg.fs
		if source_shift >= 0:
		source_shift = int(np.floor(source_shift / frame_len) * frame_len)
		else:
		source_shift = int(np.ceil(source_shift / frame_len) * frame_len)
		source_shift_in_seconds = source_shift / cfg.fs

		# read the level
		if "level" in scene.keys():
		@@ -282,6 +301,9 @@ def generate_OSBA_scene(
		# read source file
		x = audio.fromfile(fmt, input_filename)

		# record the end position (in seconds) of the source signal, i.e. its duration plus the shift of its starting position
		end_position.append(x.audio.shape[0] / x.fs + source_shift_in_seconds)

		# resample to the target fs if necessary
		if x.fs != cfg.fs:
		logger.warning(
		@@ -403,21 +425,21 @@ def generate_OSBA_scene(
		# if ISM, append object position to the OSBA object
		y.object_pos = x.object_pos.copy()

		if source_shift < 0:
		if source_shift > 0:
		# insert zeros to the new audio source signal to shift it right
		metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
		metadata.trim_meta(y, limits=[-source_shift, 0], samples=True)
		else:
		offset = source_shift
		else:
		# shift the beginning of the audio source signal
		delta_offset = source_shift - offset
		if delta_offset > 0:
		if delta_offset < 0:
		# insert zeros to the previous ISM signal(s) to shift them right
		metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True)
		metadata.trim_meta(y, limits=[delta_offset, 0], samples=True)
		offset = source_shift
		else:
		# insert zeros to the new audio source signal to shift it right
		metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)
		metadata.trim_meta(x, limits=[-delta_offset, 0], samples=True)

		# adjust the length of the audio source signal
		delta_length = len(x.audio) - len(y.audio)