support for +- overlap in STEREO items, expect trimmed sentences, support for... (f41efcb8) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

item_generation_scripts/init.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -110,6 +110,7 @@ def main(args):
		IR_fs=cfg.IR_fs,
		preamble=cfg.preamble,
		postamble=cfg.postamble,
		add_low_level_random_noise=cfg.add_low_level_random_noise,
		)

		# copy configuration to output directory

item_generation_scripts/config/STEREO_CONFIG.yml

+68 −61

Original line number	Diff line number	Diff line
		@@ -35,6 +35,13 @@ output_path: "./items_STEREO"
		### Target loudness in LKFS; default = null (no loudness normalization applied)
		loudness: -26

		### Pre-amble and Post-amble length in seconds (default = 0.0)
		preamble: 0.5
		postamble: 0.5

		### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
		add_low_level_random_noise: true


		################################################
		### Scene description
		@@ -43,7 +50,7 @@ loudness: -26
		### Each scene must start with the sceneN tag
		### Specify the mono source filename (the program will search for it in the input_path folder)
		### Specify the stereo IR source filename (the program will search for it in the input_path_IR folder)
		### Specify the delay in seconds for each input source
		### Specify the overlap length in seconds for each input source (negative value creates a gap)
		### Note 1: use [val1, val2, ...] for multiple sources in a scene
		### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

		@@ -51,252 +58,252 @@ scenes:
		a1:
		name: "G1S1.wav"
		description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LEABP04.wav", "LEABP11.wav"]
		delay: [0, 3]
		overlap: 0.5

		a2:
		name: "G6S2.wav"
		description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LEABP05.wav", "LEABP11.wav"]
		delay: [0, 3]
		overlap: 0.5

		a3:
		name: "G5S3.wav"
		description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LEABP06.wav", "LEABP11.wav"]
		delay: [0, 3]
		overlap: 0.5

		a4:
		name: "G4S4.wav"
		description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LEABP05.wav", "LEABP10.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		a5:
		name: "G3S5.wav"
		description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LEABP05.wav", "LEABP11.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		a6:
		name: "G2S6.wav"
		description: "Two speakers sitting at oval table in opposite corners in a large echoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LEABP05.wav", "LEABP12.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		b1:
		name: "G2S1.wav"
		description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LAABP05.wav", "LAABP06.wav"]
		delay: [0, 35]
		overlap: -0.5

		b2:
		name: "G1S2.wav"
		description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LAABP07.wav", "LAABP08.wav"]
		delay: [0, 3]
		overlap: 0.5

		b3:
		name: "G6S3.wav"
		description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LAABP09.wav", "LAABP10.wav"]
		delay: [0, 3]
		overlap: 0.5

		b4:
		name: "G5S4.wav"
		description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LAABP11.wav", "LAABP12.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		b5:
		name: "G4S5.wav"
		description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LAABP01.wav", "LAABP02.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		b6:
		name: "G3S6.wav"
		description: "Two speakers sitting at oval table side by side in a large anechoic conference room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["LAABP03.wav", "LAABP04.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		c1:
		name: "G3S1.wav"
		description: "One talker sitting at table in a small anechoic conference room."
		source: ["test_single.wav"]
		IR: ["SAMSP01.wav"]
		delay: [0]
		overlap: -0.5

		c2:
		name: "G2S2.wav"
		description: "One talker sitting at table in a small anechoic conference room."
		source: ["test_single.wav"]
		IR: ["SAMSP04.wav"]
		delay: [0]
		overlap: -0.5

		c3:
		name: "G1S3.wav"
		description: "One talker sitting at table in a small anechoic conference room."
		source: ["test_single.wav"]
		IR: ["SAMSP07.wav"]
		delay: [0]
		overlap: -0.5

		c4:
		name: "G6S4.wav"
		description: "One talker sitting at table in a small echoic conference room."
		source: ["test_single.wav"]
		IR: ["SEABP01.wav"]
		delay: [0]
		overlap: -0.5

		c5:
		name: "G5S5.wav"
		description: "One talker sitting at table in a small echoic conference room."
		source: ["test_single.wav"]
		IR: ["SEABP03.wav"]
		delay: [0]
		overlap: -0.5

		c6:
		name: "G4S6.wav"
		description: "One talker sitting at table in a small echoic conference room."
		source: ["test_single.wav"]
		IR: ["SEABP06.wav"]
		delay: [0]
		overlap: -0.5

		d1:
		name: "G4S1.wav"
		description: "One talker sitting at table in a small anechoic conference room."
		source: ["test_single.wav"]
		IR: ["SEBIP01.wav"]
		delay: [0]
		overlap: -0.5

		d2:
		name: "G3S2.wav"
		description: "One talker sitting at table in a small anechoic conference room."
		source: ["test_single.wav"]
		IR: ["SEBIP04.wav"]
		delay: [0]
		overlap: -0.5

		d3:
		name: "G2S3.wav"
		description: "One talker sitting at table in a small anechoic conference room."
		source: ["test_single.wav"]
		IR: ["SEBIP07.wav"]
		delay: [0]
		overlap: -0.5

		d4:
		name: "G1S4.wav"
		description: "One talker sitting at table in a small echoic conference room."
		source: ["test_single.wav"]
		IR: ["SEBIP07.wav"]
		delay: [0]
		overlap: -0.5

		d5:
		name: "G6S5.wav"
		description: "One talker sitting at table in a small echoic conference room."
		source: ["test_single.wav"]
		IR: ["SEBIP07.wav"]
		delay: [0]
		overlap: -0.5

		d6:
		name: "G5S6.wav"
		description: "One talker sitting at table in a small echoic conference room."
		source: ["test_single.wav"]
		IR: ["SEBIP07.wav"]
		delay: [0]
		overlap: -0.5

		e1:
		name: "G5S1.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEMSP01.wav", "SEMSP03.wav"]
		delay: [0, 3]
		overlap: 0.5

		e2:
		name: "G4S2.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEMSP01.wav", "SEMSP05.wav"]
		delay: [0, 3]
		overlap: 0.5

		e3:
		name: "G3S3.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEMSP01.wav", "SEMSP07.wav"]
		delay: [0, 3]
		overlap: 0.5

		e4:
		name: "G2S4.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEMSP03.wav", "SEMSP04.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		e5:
		name: "G1S5.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEMSP05.wav", "SEMSP07.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		e6:
		name: "G6S6.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEMSP06.wav", "SEMSP02.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		f1:
		name: "G6S1.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEBIP05.wav", "SEBIP01.wav"]
		delay: [0, 3]
		overlap: 0.5

		f2:
		name: "G5S2.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEBIP07.wav", "SEBIP01.wav"]
		delay: [0, 3]
		overlap: 0.5

		f3:
		name: "G4S3.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEBIP04.wav", "SEBIP01.wav"]
		delay: [0, 3]
		overlap: 0.5

		f4:
		name: "G3S4.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEBIP02.wav", "SEBIP06.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		f5:
		name: "G2S5.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEBIP02.wav", "SEBIP06.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		f6:
		name: "G1S6.wav"
		description: "Two talkers sitting in a room."
		source: ["test_single.wav", "test_single.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		IR: ["SEBIP03.wav", "SEBIP04.wav"]
		delay: [0, 1.5]
		overlap: -0.5

		No newline at end of file

item_generation_scripts/processing/process_ism_items.py

+1 −4

Original line number	Diff line number	Diff line
		@@ -100,9 +100,6 @@ def generate_ism_items(
		# read source file
		x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)

		############### DEBUG ############33
		# x.audio = x.audio[:-10]

		# get the number of frames (multiple of 20ms)
		N_frames = int(len(x.audio) / x.fs * 50)

item_generation_scripts/processing/process_stereo_items.py

+65 −20

Original line number	Diff line number	Diff line
		@@ -40,11 +40,12 @@ from copy import copy
		import numpy as np
		from math import floor


		from item_generation_scripts.audiotools import audio, audiofile
		from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness
		from item_generation_scripts.audiotools.wrappers.reverb import reverb_stereo

		SEED_RANDOM_NOISE = 0


		# function for converting nd numpy array to strings with 2 decimal digits
		def csv_formatdata(data):
		@@ -62,6 +63,9 @@ def generate_stereo_items(
		logger: logging.Logger,
		fs: Optional[int] = 48000,
		IR_fs: Optional[int] = 48000,
		preamble: Optional[float] = 0.0,
		postamble: Optional[float] = 0.0,
		add_low_level_random_noise: Optional[bool] = False,
		):
		"""Generate STEREO items from mono items based on scene description"""

		@@ -77,16 +81,18 @@ def generate_stereo_items(
		# read the IR (check if stereo or two mono files were provided)
		source_IR = np.atleast_1d(scene["IR"])

		# read the overlap length
		if 'overlap' in scene.keys():
		source_overlap = float(scene["overlap"])
		else:
		source_overlap = 0.0

		y = audio.ChannelBasedAudio("STEREO")
		for i in range(N_sources):

		# parse parameters from the scene description
		source_file = np.atleast_1d(scene["source"])[i]
		IR_file = np.atleast_1d(scene["IR"])[i]
		if 'delay' in scene.keys():
		source_delay = np.atleast_1d(scene["delay"])[i]
		else:
		source_delay = np.array([0])

		logger.info(
		f"Convolving {source_file} with {source_IR}"
		@@ -98,35 +104,46 @@ def generate_stereo_items(
		# get the number of frames (multiple of 20ms)
		N_frames = int(len(x.audio) / x.fs * 50)

		# trim the source signal to align to 20ms boundary
		N_trim = int(N_frames * x.fs / 50)
		x.audio = x.audio[:N_trim]

		# read the IR file
		IR = audio.fromfile("STEREO", os.path.join(IR_path, IR_file), fs=IR_fs)

		# delay the source file
		if source_delay > 0:
		# convolve with stereo IR
		x_rev = reverb_stereo(x, IR)

		# adjust the level of the stereo signal
		_, scale_factor = get_loudness(x_rev, target_level, "STEREO")
		x_rev.audio *= scale_factor

		# shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap)
		if i > 0 and source_overlap != 0.0:
		# get the length of the first source file
		N_delay = len(y.audio[:,0])

		# add the shift
		N_delay += int(source_overlap * x.fs)

		# ensure delay is a multiple of 20ms
		N_delay = int(floor(source_delay * 50) / 50 * x.fs)
		# N_delay = int(floor(source_shift * 50) / 50 * x.fs)

		# insert all-zero preamble
		pre = np.zeros((N_delay, x.audio.shape[1]))
		x.audio = np.concatenate([pre, x.audio])

		# convolve with stereo IR
		x_rev = reverb_stereo(x, IR)
		# pad with zeros to ensure that the signal length is a multiple of 20ms
		N_frame = x.fs / 50
		if len(x.audio) % N_frame != 0:
		N_pad = int(N_frame - len(x.audio) % N_frame)

		# adjust the level of the stereo signal
		_, scale_factor = get_loudness(x_rev, target_level, "STEREO")
		x_rev.audio *= scale_factor
		# prepend the all-zero padding
		pre = np.zeros((N_pad, x.audio.shape[1]))
		x.audio = np.concatenate([pre, x.audio])

		# add source signal to the array of source signals
		y.fs = x.fs
		if y.audio is None:
		y.audio = x_rev.audio
		else:
		# append zeros to have equal length of all source signals
		# pad with zeros to have equal length of all source signals
		if x_rev.audio.shape[0] > y.audio.shape[0]:
		y.audio = np.vstack((y.audio, np.zeros((x_rev.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]))))
		elif y.audio.shape[0] > x_rev.audio.shape[0]:
		@@ -135,6 +152,34 @@ def generate_stereo_items(
		# superimpose
		y.audio += x_rev.audio

		# append pre-amble and post-amble to all sources
		if preamble != 0.0:
		# ensure that the pre-amble is a multiple of 20ms
		N_pre = int(floor(preamble * 50) / 50 * y.fs)

		# insert all-zero preamble to all sources
		pre = np.zeros((N_pre, y.audio.shape[1]))
		y.audio = np.concatenate([pre, y.audio])

		if postamble != 0.0:
		# ensure that the post-amble is a multiple of 20ms
		N_post = int(floor(postamble * 50) / 50 * y.fs)

		# append all-zero postamble to all sources
		post = np.zeros((N_post, y.audio.shape[1]))
		y.audio = np.concatenate([y.audio, post])

		# add random noise
		if add_low_level_random_noise:
		# create uniformly distributed noise between -4 and 4
		np.random.seed(SEED_RANDOM_NOISE)
		noise = np.random.randint(
		low=-4, high=5, size=y.audio.shape
		).astype("float")

		# superimpose
		y.audio += noise

		# write the reverberated audio into output file
		output_filename = scene["name"]
		audiofile.write(