reformatting (de1f23bf) · Commits · IVAS Codec Public Collaboration / IVAS Processing Scripts

ivas_processing_scripts/generation/generate_ambi_items.py

+43 −17

Original line number	Diff line number	Diff line
		@@ -205,12 +205,12 @@ def generate_ambi_scene(
		source_file = (
		scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
		)
		IR_file = (
		scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]
		)
		IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]

		# get input filename and IR filename
		input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)
		input_filename = Path(source_file).parent / (
		cfg.use_input_prefix + Path(source_file).name
		)
		IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

		# read the overlap length
		@@ -241,7 +241,9 @@ def generate_ambi_scene(
		else:
		level = -26

		logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds")
		logger.info(
		f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds"
		)

		# read source file
		x = audio.fromfile("MONO", input_filename)
		@@ -274,7 +276,9 @@ def generate_ambi_scene(
		# pad with zeros to ensure that the signal length is a multiple of 20ms
		if len(x.audio) % frame_len != 0:
		N_pad = int(frame_len - len(x.audio) % frame_len)
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, -N_pad], samples=True
		)

		# add the convolved FOA/HOA2/HOA3 audio source signal to the output signal
		if y.audio is None:
		@@ -283,7 +287,9 @@ def generate_ambi_scene(

		if source_shift < 0:
		# insert zeros to the new audio source signal to shift it right
		y.audio = audioarray.trim_meta(y.audio, y.fs, limits=[source_shift, 0], samples=True)
		y.audio = audioarray.trim_meta(
		y.audio, y.fs, limits=[source_shift, 0], samples=True
		)
		else:
		offset = source_shift
		else:
		@@ -291,33 +297,47 @@ def generate_ambi_scene(
		delta_offset = source_shift - offset
		if delta_offset > 0:
		# insert zeros to the existing output signal to shift it right
		y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True)
		y.audio = audioarray.trim(
		y.audio, y.fs, limits=[0, -delta_offset], samples=True
		)
		offset = source_shift
		else:
		# insert zeros to the new audio source signal to shift it right
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True)
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, delta_offset], samples=True
		)

		# adjust the length of the audio source signal
		delta_length = len(x.audio) - len(y.audio)
		if delta_length > 0:
		# pad zeros to the existing output signal
		y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True)
		y.audio = audioarray.trim(
		y.audio, y.fs, limits=[0, -delta_length], samples=True
		)
		else:
		# pad zeros to the new audio source signal
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True)
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, delta_length], samples=True
		)

		# superimpose
		y.audio += x.audio

		# append pre-amble and post-amble
		if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__:
		preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		preamble = int(
		np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		postamble = int(
		np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		if preamble != 0 or postamble != 0:
		logger.info(
		f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds"
		)
		y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True)
		y.audio = audioarray.trim(
		y.audio, y.fs, limits=[-preamble, -postamble], samples=True
		)

		# add random noise
		if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise:
		@@ -333,9 +353,13 @@ def generate_ambi_scene(
		else:
		# do not change the length of the audio signal
		duration = len(y.audio)
		duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms
		duration = int(
		np.floor(duration / frame_len) * frame_len
		) # ensure multiple of 20ms
		if len(y.audio) != duration:
		y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)
		y.audio = audioarray.trim(
		y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True
		)

		# adjust the loudness of the output signal
		if "loudness" in cfg.__dict__:
		@@ -355,7 +379,9 @@ def generate_ambi_scene(
		binaural_output_filename = output_filename.with_name(
		output_filename.stem + "_BINAURAL" + output_filename.suffix
		)
		logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
		logger.info(
		f"-- Converting to BINAURAL output file: {binaural_output_filename}"
		)
		binaudio = audio.fromtype("BINAURAL")
		binaudio.fs = y.fs
		convert_scenebased(y, binaudio)

ivas_processing_scripts/generation/generate_ismN_items.py

+30 −10

Original line number	Diff line number	Diff line
		@@ -208,14 +208,18 @@ def generate_ismN_scene(
		# read azimuth and elevation information
		if "azimuth" in scene.keys():
		source_azi = (
		scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"]
		scene["azimuth"][i]
		if isinstance(scene["azimuth"], list)
		else scene["azimuth"]
		)
		else:
		source_azi = 0.0

		if "elevation" in scene.keys():
		source_ele = (
		scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"]
		scene["elevation"][i]
		if isinstance(scene["elevation"], list)
		else scene["elevation"]
		)
		else:
		source_ele = 0.0
		@@ -223,7 +227,9 @@ def generate_ismN_scene(
		# read the source shift length (in seconds)
		if "shift" in scene.keys():
		source_shift = (
		scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"]
		scene["shift"][i]
		if isinstance(scene["shift"], list)
		else scene["shift"]
		)
		else:
		source_shift = 0.0
		@@ -239,12 +245,16 @@ def generate_ismN_scene(
		# read the level
		if "level" in scene.keys():
		level = (
		scene["level"][i] if isinstance(scene["level"], list) else scene["level"]
		scene["level"][i]
		if isinstance(scene["level"], list)
		else scene["level"]
		)
		else:
		level = -26

		logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds")
		logger.info(
		f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds"
		)

		# read source file
		x = audio.fromtype("ISM1")
		@@ -271,7 +281,9 @@ def generate_ismN_scene(
		# pad with zeros to ensure that the signal length is a multiple of 20ms
		if len(x.audio) % frame_len != 0:
		N_pad = int(frame_len - len(x.audio) % frame_len)
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, -N_pad], samples=True
		)

		# get the number of frames (multiple of 20ms)
		N_frames = int(len(x.audio) / frame_len)
		@@ -385,8 +397,12 @@ def generate_ismN_scene(

		# append pre-amble and post-amble
		if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__:
		preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		preamble = int(
		np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		postamble = int(
		np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		if preamble != 0 or postamble != 0:
		logger.info(
		f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds"
		@@ -407,7 +423,9 @@ def generate_ismN_scene(
		else:
		# do not change the length of the audio signal
		duration = len(y.audio)
		duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms
		duration = int(
		np.floor(duration / frame_len) * frame_len
		) # ensure multiple of 20ms
		if len(y.audio) != duration:
		metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

		@@ -430,7 +448,9 @@ def generate_ismN_scene(
		binaural_output_filename = output_filename.with_name(
		output_filename.stem + "_BINAURAL" + output_filename.suffix
		)
		logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
		logger.info(
		f"-- Converting to BINAURAL output file: {binaural_output_filename}"
		)
		binaudio = audio.fromtype("BINAURAL")
		binaudio.fs = y.fs
		convert_objectbased(y, binaudio)

ivas_processing_scripts/generation/generate_omasa_items.py

+30 −10

Original line number	Diff line number	Diff line
		@@ -203,14 +203,18 @@ def generate_OMASA_scene(
		# read azimuth and elevation information
		if "azimuth" in scene.keys():
		source_azi = (
		scene["azimuth"][i] if isinstance(scene["azimuth"], list) else scene["azimuth"]
		scene["azimuth"][i]
		if isinstance(scene["azimuth"], list)
		else scene["azimuth"]
		)
		else:
		source_azi = 0.0

		if "elevation" in scene.keys():
		source_ele = (
		scene["elevation"][i] if isinstance(scene["elevation"], list) else scene["elevation"]
		scene["elevation"][i]
		if isinstance(scene["elevation"], list)
		else scene["elevation"]
		)
		else:
		source_ele = 0.0
		@@ -218,7 +222,9 @@ def generate_OMASA_scene(
		# read the source shift length (in seconds)
		if "shift" in scene.keys():
		source_shift = (
		scene["shift"][i] if isinstance(scene["shift"], list) else scene["shift"]
		scene["shift"][i]
		if isinstance(scene["shift"], list)
		else scene["shift"]
		)
		else:
		source_shift = 0.0
		@@ -241,7 +247,9 @@ def generate_OMASA_scene(
		else:
		level = -26

		logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds")
		logger.info(
		f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds"
		)

		# get the number of channels from the .wav file header
		wav_header = audiofile.parse_wave_header(input_filename)
		@@ -286,7 +294,9 @@ def generate_OMASA_scene(
		# pad with zeros to ensure that the signal length is a multiple of 20ms
		if len(x.audio) % frame_len != 0:
		N_pad = int(frame_len - len(x.audio) % frame_len)
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, -N_pad], samples=True
		)

		# get the number of frames (multiple of 20ms)
		N_frames = int(len(x.audio) / frame_len)
		@@ -421,12 +431,18 @@ def generate_OMASA_scene(
		y.object_pos.extend(x.object_pos)

		# add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...)
		y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")))
		y.metadata_files.insert(
		i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))
		)

		# append pre-amble and post-amble
		if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__:
		preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		preamble = int(
		np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		postamble = int(
		np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		if preamble != 0 or postamble != 0:
		logger.info(
		f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds"
		@@ -447,7 +463,9 @@ def generate_OMASA_scene(
		else:
		# do not change the length of the audio signal
		duration = len(y.audio)
		duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms
		duration = int(
		np.floor(duration / frame_len) * frame_len
		) # ensure multiple of 20ms
		if len(y.audio) != duration:
		metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

		@@ -470,7 +488,9 @@ def generate_OMASA_scene(
		binaural_output_filename = output_filename.with_name(
		output_filename.stem + "_BINAURAL" + output_filename.suffix
		)
		logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
		logger.info(
		f"-- Converting to BINAURAL output file: {binaural_output_filename}"
		)
		binaudio = audio.fromtype("BINAURAL")
		binaudio.fs = y.fs
		convert_omasa(y, binaudio)

ivas_processing_scripts/generation/generate_osba_items.py

+24 −8

Original line number	Diff line number	Diff line
		@@ -194,7 +194,9 @@ def generate_OSBA_scene(
		)

		# get input filename
		input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)
		input_filename = Path(source_file).parent / (
		cfg.use_input_prefix + Path(source_file).name
		)

		# read azimuth and elevation information
		source_azi = (
		@@ -236,7 +238,9 @@ def generate_OSBA_scene(
		else:
		level = -26

		logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds")
		logger.info(
		f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds"
		)

		# get the number of channels from the .wav file header
		wav_header = audiofile.parse_wave_header(input_filename)
		@@ -281,7 +285,9 @@ def generate_OSBA_scene(
		# pad with zeros to ensure that the signal length is a multiple of 20ms
		if len(x.audio) % frame_len != 0:
		N_pad = int(frame_len - len(x.audio) % frame_len)
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, -N_pad], samples=True
		)

		# get the number of frames (multiple of 20ms)
		N_frames = int(len(x.audio) / frame_len)
		@@ -403,12 +409,18 @@ def generate_OSBA_scene(
		y.object_pos.extend(x.object_pos)

		# add ISM metadata .csv filename (should end with .wav.0.csv, .wav.1.csv, ...)
		y.metadata_files.insert(i - 1, str(output_filename.with_suffix(f".{i - 1}.csv")))
		y.metadata_files.insert(
		i - 1, str(output_filename.with_suffix(f".{i - 1}.csv"))
		)

		# append pre-amble and post-amble
		if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__:
		preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		preamble = int(
		np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		postamble = int(
		np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		if preamble != 0 or postamble != 0:
		logger.info(
		f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds"
		@@ -429,7 +441,9 @@ def generate_OSBA_scene(
		else:
		# do not change the length of the audio signal
		duration = len(y.audio)
		duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms
		duration = int(
		np.floor(duration / frame_len) * frame_len
		) # ensure multiple of 20ms
		if len(y.audio) != duration:
		metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True)

		@@ -452,7 +466,9 @@ def generate_OSBA_scene(
		binaural_output_filename = output_filename.with_name(
		output_filename.stem + "_BINAURAL" + output_filename.suffix
		)
		logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}")
		logger.info(
		f"-- Converting to BINAURAL output file: {binaural_output_filename}"
		)
		binaudio = audio.fromtype("BINAURAL")
		binaudio.fs = y.fs
		convert_osba(y, binaudio)

ivas_processing_scripts/generation/generate_stereo_items.py

+40 −16

Original line number	Diff line number	Diff line
		@@ -211,12 +211,12 @@ def generate_stereo_scene(
		source_file = (
		scene["input"][i] if isinstance(scene["input"], list) else scene["input"]
		)
		IR_file = (
		scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]
		)
		IR_file = scene["IR"][i] if isinstance(scene["IR"], list) else scene["IR"]

		# get input filename and IR filename
		input_filename = Path(source_file).parent / (cfg.use_input_prefix + Path(source_file).name)
		input_filename = Path(source_file).parent / (
		cfg.use_input_prefix + Path(source_file).name
		)
		IR_filename = Path(IR_file).parent / (cfg.use_IR_prefix + Path(IR_file).name)

		# read the overlap length
		@@ -247,7 +247,9 @@ def generate_stereo_scene(
		else:
		level = -26

		logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds")
		logger.info(
		f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds"
		)

		# read source file
		x = audio.fromfile("MONO", input_filename)
		@@ -275,7 +277,9 @@ def generate_stereo_scene(
		# pad with zeros to ensure that the signal length is a multiple of 20ms
		if len(x.audio) % frame_len != 0:
		N_pad = int(frame_len - len(x.audio) % frame_len)
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, -N_pad], samples=True)
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, -N_pad], samples=True
		)

		# add the convolved STEREO audio source signal to the output signal
		if y.audio is None:
		@@ -284,7 +288,9 @@ def generate_stereo_scene(

		if source_shift < 0:
		# insert zeros to the new audio source signal to shift it right
		y.audio = audioarray.trim(y.audio, x.fs, limits=[source_shift, 0], samples=True)
		y.audio = audioarray.trim(
		y.audio, x.fs, limits=[source_shift, 0], samples=True
		)
		else:
		offset = source_shift
		else:
		@@ -292,33 +298,47 @@ def generate_stereo_scene(
		delta_offset = source_shift - offset
		if delta_offset > 0:
		# insert zeros to the existing output signal to shift it right
		y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_offset], samples=True)
		y.audio = audioarray.trim(
		y.audio, y.fs, limits=[0, -delta_offset], samples=True
		)
		offset = source_shift
		else:
		# insert zeros to the new audio source signal to shift it right
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_offset], samples=True)
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, delta_offset], samples=True
		)

		# adjust the length of the audio source signal
		delta_length = len(x.audio) - len(y.audio)
		if delta_length > 0:
		# pad zeros to the existing output signal
		y.audio = audioarray.trim(y.audio, y.fs, limits=[0, -delta_length], samples=True)
		y.audio = audioarray.trim(
		y.audio, y.fs, limits=[0, -delta_length], samples=True
		)
		else:
		# pad zeros to the new audio source signal
		x.audio = audioarray.trim(x.audio, x.fs, limits=[0, delta_length], samples=True)
		x.audio = audioarray.trim(
		x.audio, x.fs, limits=[0, delta_length], samples=True
		)

		# superimpose
		y.audio += x.audio

		# append pre-amble and post-amble
		if "preamble" in cfg.__dict__ or "postamble" in cfg.__dict__:
		preamble = int(np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		postamble = int(np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len) # convert to samples and ensure multiple of 20ms
		preamble = int(
		np.floor(cfg.preamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		postamble = int(
		np.floor(cfg.postamble * cfg.fs / frame_len) * frame_len
		) # convert to samples and ensure multiple of 20ms
		if preamble != 0 or postamble != 0:
		logger.info(
		f"-- Adding pre-amble of {cfg.preamble} seconds and post-amble of {cfg.postamble} seconds"
		)
		y.audio = audioarray.trim(y.audio, y.fs, limits=[-preamble, -postamble], samples=True)
		y.audio = audioarray.trim(
		y.audio, y.fs, limits=[-preamble, -postamble], samples=True
		)

		# add random noise
		if "add_low_level_random_noise" in cfg.__dict__ and cfg.add_low_level_random_noise:
		@@ -334,9 +354,13 @@ def generate_stereo_scene(
		else:
		# do not change the length of the audio signal
		duration = len(y.audio)
		duration = int(np.floor(duration / frame_len) * frame_len) # ensure multiple of 20ms
		duration = int(
		np.floor(duration / frame_len) * frame_len
		) # ensure multiple of 20ms
		if len(y.audio) != duration:
		y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True)
		y.audio = audioarray.trim(
		y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True
		)

		# adjust the loudness of the output signal
		if "loudness" in cfg.__dict__: