Commit 8cc50861 authored by Vladimir Malenovsky's avatar Vladimir Malenovsky
Browse files

improve printout messages, apply fade-in and fade-out, correct input source shifting

parent bc5275f3
Loading
Loading
Loading
Loading
Loading
+23 −11
Original line number Diff line number Diff line
@@ -188,8 +188,9 @@ def generate_ambi_scene(
        - Writes the processed FOA/HOA2/HOA3 audio to the output file.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -238,6 +239,7 @@ def generate_ambi_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -254,7 +256,7 @@ def generate_ambi_scene(
        else:
            level = -26

        logger.info(f"Convolving {source_file} with {IR_file}")
        logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # read source file
        x = audio.fromfile("MONO", input_filename)
@@ -279,7 +281,7 @@ def generate_ambi_scene(
        elif cfg.format == "HOA3":
            x = reverb_hoa3(x, IR)

        # adjust the level of the target signal
        # adjust the level of the FOA/HOA2/HOA3 signal
        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")

        # ensure the length of the audio source signal is a multiple of 20ms
@@ -294,9 +296,10 @@ def generate_ambi_scene(
            # add source signal to the array of all source signals
            y.audio = x.audio.copy()

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                y.audio = audioarray.trim_meta(y.audio, y.fs, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -344,20 +347,29 @@ def generate_ambi_scene(
    if len(y.audio) != duration:
        y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the FOA/HOA2/HOA3 audio signal into output file
    audiofile.write(output_filename, y.audio, y.fs)

    # convert to BINAURAL, if option was chosen
    if cfg.binaural_output:
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_scenebased(y, binaudio)
        binaural_output_filename = output_filename.with_name(
            output_filename.stem + "_BINAURAL" + output_filename.suffix
        )
        logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_scenebased(y, binaudio)
        audiofile.write(
            binaural_output_filename,
            binaudio.audio,
            binaudio.fs,
        )
        logger.info(f"Written BINAURAL output to: {binaural_output_filename}")
+23 −9
Original line number Diff line number Diff line
@@ -176,8 +176,9 @@ def generate_ismN_scene(
        - Writes the processed audio and metadata to output files.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -236,6 +237,7 @@ def generate_ismN_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -250,7 +252,7 @@ def generate_ismN_scene(
        else:
            level = -26

        logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")
        logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # read source file
        x = audio.fromtype("ISM1")
@@ -357,9 +359,11 @@ def generate_ismN_scene(
            y.audio = x.audio.copy()
            y.object_pos = x.object_pos.copy()
            y.fs = x.fs
            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)

            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -410,18 +414,28 @@ def generate_ismN_scene(
    if len(y.audio) != duration:
        metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the ISMn output to .wav file in an interleaved format and ISM metadata in .csv files
    audiofile.write(output_filename, y.audio, y.fs)
    metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files)

    # convert to BINAURAL, if option was chosen
    if cfg.binaural_output:
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_objectbased(y, binaudio)
        binaural_output_filename = output_filename.with_name(
            output_filename.stem + "_BINAURAL" + output_filename.suffix
        )
        logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_objectbased(y, binaudio)
        audiofile.write(
            binaural_output_filename,
            binaudio.audio,
+22 −9
Original line number Diff line number Diff line
@@ -175,8 +175,9 @@ def generate_OMASA_scene(
        - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -238,6 +239,7 @@ def generate_OMASA_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -254,7 +256,7 @@ def generate_OMASA_scene(
        else:
            level = -26

        logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")
        logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # get the number of channels from the .wav file header
        wav_header = audiofile.parse_wave_header(input_filename)
@@ -404,9 +406,10 @@ def generate_OMASA_scene(
                # if ISM, append object position to the OMASA object
                y.object_pos = x.object_pos.copy()

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -458,18 +461,28 @@ def generate_OMASA_scene(
    if len(y.audio) != duration:
        metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the OMASA audio output to .wav file in an interleaved format and ISM metadata in .csv files
    audiofile.write(output_filename, y.audio, y.fs)
    metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1])

    # convert to OMASA output to BINAURAL, if option was chosen
    if cfg.binaural_output:
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_omasa(y, binaudio)
        binaural_output_filename = output_filename.with_name(
            output_filename.stem + "_BINAURAL" + output_filename.suffix
        )
        logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_omasa(y, binaudio)
        audiofile.write(
            binaural_output_filename,
            binaudio.audio,
+23 −10
Original line number Diff line number Diff line
@@ -173,8 +173,9 @@ def generate_OSBA_scene(
        - Handles various audio formats (e.g., FOA, HOA2, HOA3) and applies transformations like loudness normalization, trimming, and padding.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -233,6 +234,7 @@ def generate_OSBA_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -249,7 +251,7 @@ def generate_OSBA_scene(
        else:
            level = -26

        logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}")
        logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # get the number of channels from the .wav file header
        wav_header = audiofile.parse_wave_header(input_filename)
@@ -266,7 +268,7 @@ def generate_OSBA_scene(
        elif N_channels == 16:
            fmt = "HOA3"
        else:
            logger.info(
            logger.error(
                f"Error: Input format of the source file with {N_channels} channels is not supported!"
            )
            sys.exit(-1)
@@ -386,9 +388,10 @@ def generate_OSBA_scene(
                # if ISM, append object position to the OSBA object
                y.object_pos = x.object_pos.copy()

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -440,18 +443,28 @@ def generate_OSBA_scene(
    if len(y.audio) != duration:
        metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the OSBA audio output to .wav file in an interleaved format and ISM metadata in .csv files
    audiofile.write(output_filename, y.audio, y.fs)
    metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files)

    # convert the OSBA output to BINAURAL, if option was chosen
    if cfg.binaural_output:
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_osba(y, binaudio)
        binaural_output_filename = output_filename.with_name(
            output_filename.stem + "_BINAURAL" + output_filename.suffix
        )
        logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
        binaudio = audio.fromtype("BINAURAL")
        binaudio.fs = y.fs
        convert_osba(y, binaudio)
        audiofile.write(
            binaural_output_filename,
            binaudio.audio,
+19 −7
Original line number Diff line number Diff line
@@ -194,8 +194,9 @@ def generate_stereo_scene(
        - Writes the processed STEREO audio to output file.
    """

    scenes = list(cfg.scenes.keys())
    logger.info(
        f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}"
        f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}"
    )

    # extract the number of audio sources
@@ -244,6 +245,7 @@ def generate_stereo_scene(
            source_shift = 0.0

        # convert overlap to samples and ensure it is a multiple of 20ms
        source_shift_in_seconds = source_shift
        source_shift = source_shift * cfg.fs
        if source_shift >= 0:
            source_shift = int(np.floor(source_shift / frame_len) * frame_len)
@@ -260,7 +262,7 @@ def generate_stereo_scene(
        else:
            level = -26

        logger.info(f"Convolving {source_file} with {IR_file}")
        logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds")

        # read source file
        x = audio.fromfile("MONO", input_filename)
@@ -280,7 +282,7 @@ def generate_stereo_scene(
        # convolve MONO source audio with STEREO IR -> results in STEREO audio object
        x = reverb_stereo(x, IR)

        # adjust the level of the stereo signal
        # adjust the level of the STEREO signal
        x.audio, _ = loudness_norm(x, level, loudness_format="STEREO")

        # ensure the length of the audio source signal is a multiple of 20ms
@@ -295,9 +297,10 @@ def generate_stereo_scene(
            # add source signal to the array of all source signals
            y.audio = x.audio.copy()

            # if source_shift < 0:
            #     # insert zeros to the new audio source signal to shift it right
            #     metadata.trim_meta(y, limits=[source_shift, 0], samples=True)
            if source_shift < 0:
                # insert zeros to the new audio source signal to shift it right
                y.audio = audioarray.trim(y.audio, x.fs, limits=[source_shift, 0], samples=True)
            else:
                offset = source_shift
        else:
            # shift the beginning of the audio source signal
@@ -345,5 +348,14 @@ def generate_stereo_scene(
    if len(y.audio) != duration:
        y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)

    # adjust the loudness of the output signal
    logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS")
    y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="STEREO")

    # apply fade-in and fade-out
    if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0:
        logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds")
        y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000)

    # write the STEREO audio signal into output file
    audiofile.write(output_filename, y.audio, y.fs)