refactoring to use the trim_meta() function (c1ec9329) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

ivas_processing_scripts/generation/generate_omasa_items.py

+191 −129

Original line number	Diff line number	Diff line
		@@ -38,7 +38,7 @@ from pathlib import Path

		import numpy as np

		from ivas_processing_scripts.audiotools import audio, audioarray, audiofile
		from ivas_processing_scripts.audiotools import audio, audioarray, audiofile, metadata
		from ivas_processing_scripts.audiotools.convert.omasa import convert_omasa
		from ivas_processing_scripts.audiotools.convert.scenebased import render_sba_to_masa
		from ivas_processing_scripts.audiotools.wrappers.bs1770 import loudness_norm
		@@ -197,34 +197,62 @@ def generate_OMASA_scene(

		# initialize output OMASA object
		y = audio.OMASAAudio(omasa_format)
		y.fs = cfg.fs

		# set the frame length
		frame_len = int(cfg.fs / 50)

		# repeat for all source files
		offset = 0
		for i in range(N_inputs):
		# parse parameters from the scene description
		source_file = (
		scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
		)

		# get input filename
		input_filename = Path(source_file).parent / (
		cfg.use_input_prefix + Path(source_file).name
		)

		# read azimuth and elevation information
		if "azimuth" in scene.keys():
		source_azi = (
		scene["azimuth"][i]
		if isinstance(scene["azimuth"], list)
		else scene["azimuth"]
		scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"]
		)
		else:
		source_azi = 0.0

		if "elevation" in scene.keys():
		source_ele = (
		scene["elevation"][i]
		if isinstance(scene["elevation"], list)
		else scene["elevation"]
		scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"]
		)
		else:
		source_ele = 0.0

		# read the shift length
		# read the source shift length (in seconds)
		if "shift" in scene.keys():
		source_shift = (
		scene["shift"][i]
		if isinstance(scene["shift"], list)
		else scene["shift"]
		scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"]
		)
		else:
		source_shift = 0.0

		# convert the source shift to samples and ensure it is a multiple of 20ms
		source_shift = source_shift * cfg.fs
		if source_shift >= 0:
		source_shift = int(np.floor(source_shift / frame_len) * frame_len)
		else:
		source_shift = int(np.ceil(source_shift / frame_len) * frame_len)

		# read the level
		if "level" in scene.keys():
		level = (
		scene["level"][i] if isinstance(scene["level"], list) else scene["level"]
		)
		else:
		level = -26

		# read the level
		if "level" in scene.keys():
		level = (
		@@ -237,11 +265,6 @@ def generate_OMASA_scene(

		logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")

		# get input filename
		input_filename = Path(source_file).parent / (
		cfg.use_input_prefix + Path(source_file).name
		)

		# get the number of channels from the .wav file header
		wav_header = audiofile.parse_wave_header(input_filename)
		N_channels = wav_header["channels"]
		@@ -257,99 +280,71 @@ def generate_OMASA_scene(
		elif N_channels == 16:
		fmt = "HOA3"
		else:
		logger.info(
		logger.error(
		f"Error: Input format of the source file with {N_channels} channels is not supported!"
		)
		sys.exit(-1)

		if fmt in ["FOA", "HOA2", "HOA3"]:
		# generate MASA metadata .met filename (should end with .met)
		y.metadata_files.append(output_filename.with_suffix(".met"))
		elif fmt == "MONO":
		# generate ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...)
		y.metadata_files.insert(i - 1, output_filename.with_suffix(f".{i-1}.csv"))

		# read source file
		x = audio.fromfile(fmt, input_filename, fs=cfg.fs)

		# resample to the target fs if necessary
		if x.fs != cfg.fs:
		logger.warning(
		f"Warning: Sample rate of the audio source is {x.fs} Hz and needs to be resampled to {cfg.fs}!"
		)
		resampled_audio = audioarray.resample(x.audio, x.fs, cfg.fs)
		x.audio = resampled_audio
		x.fs = cfg.fs

		# adjust the level of the source file
		if fmt in ["FOA", "HOA2", "HOA3"]:
		x.audio, _ = loudness_norm(x, level, loudness_format="STEREO", rms=True)
		else:
		x.audio, _ = loudness_norm(x, level, loudness_format="MONO")

		# shift the source signal (positive shift creates overlap, negative shift creates a gap)
		if int(floor(-source_shift)) != 0:
		x.audio = audioarray.trim(x.audio, x.fs, limits=[-source_shift, 0])

		# get the number of frames (multiple of 20ms)
		frame_len = int(x.fs / 50)
		N_frames = int(len(x.audio) / frame_len)

		# ensure the length of the audio source signal is a multiple of 20ms
		if len(x.audio) % frame_len != 0:
		# pad with zeros to ensure that the signal length is a multiple of 20ms
		if len(x.audio) % frame_len != 0:
		# pad the source signal
		N_pad = int(frame_len - len(x.audio) % frame_len)
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)

		# convert FOA/HOA2/HOA3 to MASA
		# get the number of frames (multiple of 20ms)
		N_frames = int(len(x.audio) / frame_len)

		# convert input audio source signal to MASA or ISM
		if fmt in ["FOA", "HOA2", "HOA3"]:
		# convert FOA/HOA2/HOA3 to MASA
		x_masa = audio.MetadataAssistedSpatialAudio(
		f"MASA{cfg.masa_tc}DIR{cfg.masa_dirs}"
		)
		x_masa.metadata_file = y.metadata_files[i]
		x_masa.fs = cfg.fs
		# generate MASA metadata filename (should end with .met)
		x_masa.metadata_file = output_filename.with_suffix(".met")
		render_sba_to_masa(x, x_masa)
		y.audio = x_masa.audio
		y.fs = x.fs
		else:
		# pad ISM signal with zeros to have the same length as the MASA signal
		N_pad = y.audio.shape[0] - x.audio.shape[0]
		if N_pad != 0:
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, -N_pad], samples=True
		)

		# append ISM signal to the OMASA object (ISM comes first !!!)
		y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1)

		# append pre-amble and post-amble to all sources
		y.audio = audioarray.trim(y.audio, y.fs, limits=[-cfg.preamble, -cfg.postamble])

		# add random noise
		if cfg.add_low_level_random_noise:
		# create uniformly distributed noise between -4 and 4
		np.random.seed(SEED_RANDOM_NOISE)
		noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")

		# superimpose
		y.audio += noise

		# generate ISM metadata files
		for i in range(1, N_ISMs + 1):
		# parse metadata parameters from the scene description
		source_azi = (
		scene["azimuth"][i]
		if isinstance(scene["azimuth"], list)
		else scene["azimuth"]
		)
		source_ele = (
		scene["elevation"][i]
		if isinstance(scene["elevation"], list)
		else scene["elevation"]
		)

		N_frames = int(np.rint((len(y.audio) / y.fs * 50)))
		x = x_masa # replace x with the MASA object
		elif fmt == "MONO":
		# convert MONO to ISM1
		x_ism = audio.ObjectBasedAudio("ISM1") # ISM with 1 channel
		x_ism.fs = cfg.fs
		x_ism.audio = x.audio.copy()

		# read azimuth information and convert to an array
		# convert azimuth information in case of moving object
		if isinstance(source_azi, str):
		if ":" in source_azi:
		# start with the initial azimuth value and apply step N_frames times
		source_azi = source_azi.split(":")
		azi = np.arange(
		float(eval(source_azi[0])),
		float(eval(source_azi[0])) + N_frames * float(eval(source_azi[1])),
		float(eval(source_azi[1])),
		)
		# convert into array (initial_value:step:stop_value)
		start_str, step_str, stop_str = source_azi.split(":")
		start = float(eval(start_str))
		step = float(eval(step_str))
		stop = float(eval(stop_str))
		azi = np.arange(start, stop, step)

		# adjust length to N_frames
		if len(azi) > N_frames:
		azi = azi[:N_frames]
		elif len(azi) < N_frames:
		azi = np.append(azi, np.full(N_frames - len(azi), azi[-1]))
		else:
		# replicate static azimuth value N_frames times
		azi = np.repeat(float(eval(source_azi)), N_frames)
		@@ -366,21 +361,22 @@ def generate_OMASA_scene(
		f"Incorrect value(s) of azimuth: {azi[(azi > 180) \| (azi < -180)]}"
		)

		# read elevation information and convert to an array
		# convert elevation information in case of moving object
		if isinstance(source_ele, str):
		if ":" in source_ele:
		# convert into array (initial_value:step:stop_value)
		# note: the stop_value value is +-90 degrees depending on the sign of the step
		source_ele = source_ele.split(":")
		ele = np.arange(
		float(eval(source_ele[0])),
		np.sign(float(eval(source_ele[1]))) * 90,
		float(eval(source_ele[1])),
		)[:N_frames]

		# repeat the last elevation value, if array is shorter than N_frames
		if len(ele) < N_frames:
		start_str, step_str, stop_str = source_ele.split(":")
		start = float(eval(start_str))
		step = float(eval(step_str))
		stop = float(eval(stop_str))
		ele = np.arange(start, stop, step)

		# adjust length to N_frames
		if len(ele) > N_frames:
		ele = ele[:N_frames]
		elif len(ele) < N_frames:
		ele = np.append(ele, np.full(N_frames - len(ele), ele[-1]))

		else:
		# replicate static elevation value N_frames times
		ele = np.repeat(float(eval(source_ele)), N_frames)
		@@ -388,22 +384,90 @@ def generate_OMASA_scene(
		# replicate static elevation value N_frames times
		ele = np.repeat(float(source_ele), N_frames)

		# wrap elevation angle to -90 .. +90
		ele = ((ele + 90) % 180) - 90

		# check if elevation is from -90 .. +90
		if any(ele > 90) or any(ele < -90):
		logger.error(
		f"Incorrect value(s) of elevation: {ele[(ele > 90) \| (ele < -90)]}"
		)

		# generate radius vector with all values equal to 1.0
		rad = np.ones(N_frames)

		# arrange all metadata fields column-wise into a matrix
		x_meta = np.column_stack((azi, ele))
		x_ism.object_pos.append(np.column_stack((azi, ele, rad)))

		# write to .csv output metadata file
		np.savetxt(y.metadata_files[i - 1], x_meta, fmt="%0.2f", delimiter=",", encoding="utf-8")
		x = x_ism # replace x with the ISM object

		y.init_metadata() # this is needed to populate 'y.object_pos[]'
		# copy new audio source signal to the OMASA object
		if y.audio is None:
		# add the first audio source signal (should be MASA) to the array of all source signals
		y.audio = x.audio.copy()

		# write the OMASA output to .wav file in an interleaved format
		if "MASA" in x.name:
		# if MASA, append metadata file to the OMASA object
		y.metadata_files.append(x.metadata_file)
		else:
		# if ISM, append metadata file to the OMASA object
		y.object_pos = x.object_pos.copy()

		# if source_shift < 0:
		# # insert zeros to the new audio source signal to shift it right
		# metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
		offset = source_shift
		else:
		# shift the beginning of the audio source signal
		delta_offset = source_shift - offset
		if delta_offset > 0:
		# insert zeros to the previous ISM signal(s) to shift them right
		metadata.trim_meta(y, limits=[-delta_offset, 0], samples=True)
		offset = source_shift
		else:
		# insert zeros to the new audio source signal to shift it right
		metadata.trim_meta(x, limits=[delta_offset, 0], samples=True)

		# adjust the length of the audio source signal
		delta_length = len(x.audio) - len(y.audio)
		if delta_length > 0:
		# pad zeros to the previous ISM signal(s)
		metadata.trim_meta(y, limits=[0, -delta_length], samples=True)
		else:
		# pad zeros to the new audio source signal
		metadata.trim_meta(x, limits=[0, delta_length], samples=True)

		# append ISM signal to the OMASA object (ISM comes first !!!)
		y.audio = np.insert(y.audio, [i - 1], x.audio, axis=1)
		y.object_pos.extend(x.object_pos)

		# add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...)
		y.metadata_files.insert(i-1, str(output_filename.with_suffix(f".{i-1}.csv")))

		# append pre-amble and post-amble
		metadata.trim_meta(y, limits=[-cfg.preamble * 1000, -cfg.postamble * 1000])

		# add random noise
		if cfg.add_low_level_random_noise:
		# create uniformly distributed noise between -4 and 4
		np.random.seed(SEED_RANDOM_NOISE)
		noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype("float")
		y.audio += noise

		# adjust the length of the output signal
		if "duration" in cfg.__dict__:
		# trim the output signal such that the total duration is X seconds
		duration = int(cfg.duration * cfg.fs) # convert to samples
		else:
		# do not change the length of the audio signal
		duration = len(y.audio)
		duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms
		if len(y.audio) != duration:
		metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

		# write the OMASA output to .wav file in an interleaved format and ISM metadata in .csv files
		audiofile.write(output_filename, y.audio, y.fs)
		metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1])

		# convert the OMASA output to BINAURAL, if option was chosen
		if cfg.binaural_output:
		@@ -418,5 +482,3 @@ def generate_OMASA_scene(
		binaudio.audio,
		binaudio.fs,
		)

		return

Admin message