support for +- overlap in ISM items, expect trimmed sentences, support for... (8a6542d4) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

item_generation_scripts/init.py

+6 −1

Original line number	Diff line number	Diff line
		@@ -91,7 +91,10 @@ def main(args):
		cfg.output_path,
		cfg.scenes,
		logger,
		fs=cfg.fs
		fs=cfg.fs,
		preamble=cfg.preamble,
		postamble=cfg.postamble,
		add_low_level_random_noise=cfg.add_low_level_random_noise,
		)
		elif cfg.format == "STEREO":
		# generate STEREO items according to scene description
		@@ -105,6 +108,8 @@ def main(args):
		logger,
		fs=cfg.fs,
		IR_fs=cfg.IR_fs,
		preamble=cfg.preamble,
		postamble=cfg.postamble,
		)

		# copy configuration to output directory

item_generation_scripts/config/ISM1_CONFIG.yml

+4 −0

Original line number	Diff line number	Diff line
		@@ -29,6 +29,10 @@ output_path: "./items_ISM1"
		### Target loudness in LKFS; default = null (no loudness normalization applied)
		loudness: -26

		### Pre-amble and Post-amble length in seconds (default = 0.0)
		preamble: 0.5
		postamble: 0.5


		################################################
		### Scene description

item_generation_scripts/config/ISM2_CONFIG.yml

+79 −73

Original line number	Diff line number	Diff line
		@@ -29,6 +29,12 @@ output_path: "./items_ISM2"
		### Target loudness in LKFS; default = null (no loudness normalization applied)
		loudness: -26

		### Pre-amble and Post-amble length in seconds (default = 0.0)
		preamble: 0.5
		postamble: 0.5

		### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
		add_low_level_random_noise: true

		################################################
		### Scene description
		@@ -37,7 +43,7 @@ loudness: -26
		### Each scene must start with the sceneN tag
		### Specify the mono source filename (the program will search for it in the input_path folder)
		### Specify azimuth and elevation for each input source
		### Specify the delay in seconds for each input source
		### Specify the overlap length in seconds between consecutive sources (one value per scene; a negative value creates a gap)
		### Note 1: use [val1, val2, ...] for multiple sources in a scene
		### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

		@@ -52,288 +58,288 @@ scenes:
		a1:
		name: "G1S1.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [0, 50]
		elevation: [0, 0]
		delay: [0, 0]
		overlap: -0.5

		a2:
		name: "G6S2.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [50, 350]
		elevation: [0, 0]
		delay: [0, 0]
		overlap: -0.5

		a3:
		name: "G5S3.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [40, 290]
		elevation: [0, 0]
		delay: [0, 0]
		overlap: -0.5

		a4:
		name: "G4S4.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [30, 230]
		elevation: [15, 15]
		delay: [0, 0]
		overlap: -0.5

		a5:
		name: "G3S5.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [20, 170]
		elevation: [15, 15]
		delay: [0, 0]
		overlap: -0.5

		a6:
		name: "G2S6.wav"
		description: "two talkers sitting at a table, at different azimuth angles with respect to the microphone, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [10, 110]
		elevation: [15, 15]
		delay: [0, 0]
		overlap: -0.5

		b1:
		name: "G2S1.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [20, 170]
		elevation: [30, 30]
		delay: [0, 1.5]
		overlap: 0.5

		b2:
		name: "G1S2.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [10, 110]
		elevation: [30, 30]
		delay: [0, 1.5]
		overlap: 0.5

		b3:
		name: "G6S3.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [0, 50]
		elevation: [30, 30]
		delay: [0, 1.5]
		overlap: 0.5

		b4:
		name: "G5S4.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [50, 350]
		elevation: [60, 60]
		delay: [0, 1.5]
		overlap: 0.5

		b5:
		name: "G4S5.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [40, 290]
		elevation: [60, 60]
		delay: [0, 1.5]
		overlap: 0.5

		b6:
		name: "G3S6.wav"
		description: "two standing talkers, at different azimuth angles with respect to the microphone, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [30, 230]
		elevation: [60, 60]
		delay: [0, 1.5]
		overlap: 0.5

		c1:
		name: "G3S1.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [40, 290]
		elevation: [0, 60]
		delay: [0, 0]
		overlap: -0.5

		c2:
		name: "G2S2.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [30, 230]
		elevation: [0, 60]
		delay: [0, 0]
		overlap: -0.5

		c3:
		name: "G1S3.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [20, 170]
		elevation: [0, 60]
		delay: [0, 0]
		overlap: -0.5

		c4:
		name: "G6S4.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [10, 110]
		elevation: [0, 60]
		delay: [0, 1]
		shift: [0, 1]

		c5:
		name: "G5S5.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [0, 50]
		elevation: [0, 60]
		delay: [0, 0]
		overlap: -0.5

		c6:
		name: "G4S6.wav"
		description: "one talker sitting at a table, second talker standing beside the table, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [50, 350]
		elevation: [0, 60]
		delay: [0, 0]
		overlap: -0.5

		d1:
		name: "G4S1.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [50, "180:1:120 + 360"]
		elevation: [0, 60]
		delay: [0, 1.5]
		overlap: 0.5

		d2:
		name: "G3S2.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [300, "-70:-1:-10 - 360"]
		elevation: [0, 60]
		delay: [0, 1.5]
		overlap: 0.5

		d3:
		name: "G2S3.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [250, "-20:-1:-320"]
		elevation: [0, 60]
		delay: [0, 1.5]
		overlap: 0.5

		d4:
		name: "G1S4.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [200, "30:-1:-270"]
		elevation: [0, 60]
		delay: [0, 1.5]
		overlap: 0.5

		d5:
		name: "G6S5.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [150, "80:1:20 + 360"]
		elevation: [0, 60]
		delay: [0, 1.5]
		overlap: 0.5

		d6:
		name: "G5S6.wav"
		description: "one talker sitting at a table, second talker walking around the table, ~30% overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: [100, "130:1:70 + 360"]
		elevation: [0, 60]
		delay: [0, 1.5]
		overlap: 0.5

		e1:
		name: "G5S1.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["80:1:20 + 360", "80:1:20 + 360"]
		elevation: [10, 60]
		delay: [0, 1.5]
		overlap: 0.5

		e2:
		name: "G4S2.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["130:1:70 + 360", "130:1:70 + 360"]
		elevation: [10, 60]
		delay: [0, 1.5]
		overlap: 0.5

		e3:
		name: "G3S3.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["180:1:120 + 360", "180:1:120 + 360"]
		elevation: [10, 60]
		delay: [0, 1.5]
		overlap: 0.5

		e4:
		name: "G2S4.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["-70:-1:-10 - 360", "-70:-1:-10 - 360"]
		elevation: [10, 60]
		delay: [0, 1.5]
		overlap: 0.5

		e5:
		name: "G1S5.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["-20:-1:-320", "-20:-1:-320"]
		elevation: [10, 60]
		delay: [0, 1.5]
		overlap: 0.5

		e6:
		name: "G6S6.wav"
		description: "two talkers walking side-by-side around the table, ~30% overlapping utterances"
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["30:-1:-270", "30:-1:-270"]
		elevation: [10, 60]
		delay: [0, 1.5]
		overlap: 0.5

		f1:
		name: "G6S1.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["60:1:0 + 360", "60:-1:120 - 360"]
		elevation: [20, 50]
		delay: [0, 0]
		overlap: -0.5

		f2:
		name: "G5S2.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["0:1:300", "0:-1:60 - 360"]
		elevation: [20, 50]
		delay: [0, 0]
		overlap: -0.5

		f3:
		name: "G4S3.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["300:1:240 + 360", "300:-1:0"]
		elevation: [20, 50]
		delay: [0, 0]
		overlap: -0.5

		f4:
		name: "G3S4.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["240:1:180 + 360", "240:-1:-60"]
		elevation: [20, 50]
		delay: [0, 0]
		overlap: -0.5

		f5:
		name: "G2S5.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["180:1:120 + 360", "180:-1:-120"]
		elevation: [20, 50]
		delay: [0, 0]
		overlap: -0.5

		f6:
		name: "G1S6.wav"
		description: "two talkers walking around the table in opposite directions, non-overlapping utterances."
		source: ["test_double.wav", "test_double.wav"]
		source: ["test_talker1_trimmed.wav", "test_talker2_trimmed.wav"]
		azimuth: ["120:1:60 + 360", "120:-1:180 - 360"]
		elevation: [20, 50]
		delay: [0, 0]
		overlap: -0.5

		No newline at end of file

item_generation_scripts/processing/process_ism_items.py

+81 −11

Original line number	Diff line number	Diff line
		@@ -41,6 +41,7 @@ from math import floor
		from item_generation_scripts.audiotools import audio, audiofile
		from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness

		SEED_RANDOM_NOISE = 0

		# function for converting nd numpy array to strings with 2 decimal digits
		def csv_formatdata(data):
		@@ -56,6 +57,9 @@ def generate_ism_items(
		scenes: dict,
		logger: logging.Logger,
		fs: Optional[int] = 48000,
		preamble: Optional[float] = 0.0,
		postamble: Optional[float] = 0.0,
		add_low_level_random_noise: Optional[bool] = False,
		):
		"""Generate ISM items with metadata from mono items based on scene description"""

		@@ -75,6 +79,12 @@ def generate_ism_items(
		y = audio.ChannelBasedAudio("MONO")
		y_meta = None

		# read the overlap length
		if 'overlap' in scene.keys():
		source_overlap = float(scene["overlap"])
		else:
		source_overlap = 0.0

		# repeat for all source files
		for i in range(N_sources):

		@@ -82,10 +92,6 @@ def generate_ism_items(
		source_file = np.atleast_1d(scene["source"])[i]
		source_azi = np.atleast_1d(scene["azimuth"])[i]
		source_ele = np.atleast_1d(scene["elevation"])[i]
		if 'delay' in scene.keys():
		source_delay = np.atleast_1d(scene["delay"])[i]
		else:
		source_delay = np.array([0])

		logger.info(
		f"Encoding {source_file} at position(s) {source_azi},{source_ele}"
		@@ -94,12 +100,15 @@ def generate_ism_items(
		# read source file
		x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs)

		############### DEBUG ############33
		# x.audio = x.audio[:-10]

		# get the number of frames (multiple of 20ms)
		N_frames = int(len(x.audio) / x.fs * 50)

		# trim the source signal to align to 20ms boundary
		N_trim = int(N_frames * x.fs / 50)
		x.audio = x.audio[:N_trim]
		# N_trim = int(N_frames * x.fs / 50)
		# x.audio = x.audio[:N_trim]

		# adjust the level of the source file
		_, scale_factor = get_loudness(x, target_level, "MONO")
		@@ -172,10 +181,16 @@ def generate_ism_items(
		# arrange all metadata fields column-wise into a matrix
		x_meta = np.column_stack((azi, ele, dist, spread, gain))

		# delay the source file
		if source_delay > 0:
		# shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap)
		if i > 0 and source_overlap != 0.0:
		# get the length of the first source file
		N_delay = len(y.audio[:,0])

		# add the shift
		N_delay += int(source_overlap * x.fs)

		# ensure delay is a multiple of 20ms
		N_delay = int(floor(source_delay * 50) / 50 * x.fs)
		# N_delay = int(floor(source_shift * 50) / 50 * x.fs)

		# insert all-zero preamble
		pre = np.zeros((N_delay, x.audio.shape[1]))
		@@ -187,12 +202,27 @@ def generate_ism_items(
		) # !!!! TBD - check if we should insert neutral position or the first position of the metadata
		x_meta = np.concatenate([pre, x_meta])

		# pad with zeros to ensure that the signal length is a multiple of 20ms
		N_frame = x.fs / 50
		if len(x.audio) % N_frame != 0:
		N_pad = int(N_frame - len(x.audio) % N_frame)

		# insert all-zero preamble
		pre = np.zeros((N_pad, x.audio.shape[1]))
		x.audio = np.concatenate([pre, x.audio])

		# insert neutral position as a pre-amble
		pre = np.tile(
		[0.00, 0.00, 1.00, 0.00, 1.00], (N_pad, 1)
		) # !!!! TBD - check if we should insert neutral position or the first position of the metadata
		x_meta = np.concatenate([pre, x_meta])

		# add source signal to the array of all source signals
		y.fs = x.fs
		if y.audio is None:
		y.audio = x.audio
		else:
		# append zeros to have equal length of all source signals
		# pad with zeros to have the same length of all source signals
		if x.audio.shape[0] > y.audio.shape[0]:
		y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1]))))
		elif y.audio.shape[0] > x.audio.shape[0]:
		@@ -234,6 +264,46 @@ def generate_ism_items(

		y_meta = np.concatenate([y_meta, x_meta])

		# append pre-amble and post-amble to all sources
		if preamble != 0.0:
		# ensure that the preamble is a multiple of 20ms
		N_pre = int(floor(preamble * 50) / 50 * y.fs)

		# insert all-zero preamble to all sources
		pre = np.zeros((N_pre, y.audio.shape[1]))
		y.audio = np.concatenate([pre, y.audio])

		# insert neutral position as a pre-amble to all sources
		pre = np.tile(
		[0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_pre, 1)
		) # !!!! TBD - check if we should insert neutral position or the first position of the metadata
		y_meta = np.concatenate([pre, y_meta], axis=1)

		if postamble != 0.0:
		# ensure that the postamble is a multiple of 20ms
		N_post = int(floor(postamble * 50) / 50 * y.fs)

		# append all-zero postamble to all sources
		post = np.zeros((N_post, y.audio.shape[1]))
		y.audio = np.concatenate([y.audio, post])

		# append neutral position as a post-amble to all sources
		post = np.tile(
		[0.00, 0.00, 1.00, 0.00, 1.00], (y_meta.shape[0], N_post, 1)
		) # !!!! TBD - check if we should insert neutral position or the last position of the metadata
		y_meta = np.concatenate([y_meta, post], axis=1)

		# add random noise
		if add_low_level_random_noise:
		# create uniformly distributed noise between -4 and 4
		np.random.seed(SEED_RANDOM_NOISE)
		noise = np.random.randint(
		low=-4, high=5, size=y.audio.shape
		).astype("float")

		# superimpose
		y.audio += noise

		# write individual ISM audio streams to the output file in an interleaved format
		output_filename = scene["name"]
		audiofile.write(