Loading ivas_processing_scripts/generation/generate_ambi_items.py +23 −11 Original line number Diff line number Diff line Loading @@ -188,8 +188,9 @@ def generate_ambi_scene( - Writes the processed FOA/HOA2/HOA3 audio to the output file. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -238,6 +239,7 @@ def generate_ambi_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -254,7 +256,7 @@ def generate_ambi_scene( else: level = -26 logger.info(f"Convolving {source_file} with {IR_file}") logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # read source file x = audio.fromfile("MONO", input_filename) Loading @@ -279,7 +281,7 @@ def generate_ambi_scene( elif cfg.format == "HOA3": x = reverb_hoa3(x, IR) # adjust the level of the target signal # adjust the level of the FOA/HOA2/HOA3 signal x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") # ensure the length of the audio source signal is a multiple of 20ms Loading @@ -294,9 +296,10 @@ def generate_ambi_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim_meta(y.audio, y.fs, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -344,20 +347,29 @@ def generate_ambi_scene( if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the FOA/HOA2/HOA3 audio signal into output file audiofile.write(output_filename, y.audio, y.fs) # convert to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_scenebased(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_scenebased(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, binaudio.fs, ) logger.info(f"Written BINAURAL output to: {binaural_output_filename}") ivas_processing_scripts/generation/generate_ismN_items.py +23 −9 Original line number Diff line number Diff line Loading @@ -176,8 +176,9 @@ def generate_ismN_scene( - Writes the processed audio and metadata to output files. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -236,6 +237,7 @@ def generate_ismN_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -250,7 +252,7 @@ def generate_ismN_scene( else: level = -26 logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # read source file x = audio.fromtype("ISM1") Loading Loading @@ -357,9 +359,11 @@ def generate_ismN_scene( y.audio = x.audio.copy() y.object_pos = x.object_pos.copy() y.fs = x.fs # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -410,18 +414,28 @@ def generate_ismN_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the ISMn output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files) # convert to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_objectbased(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_objectbased(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, Loading ivas_processing_scripts/generation/generate_omasa_items.py +22 −9 Original line number Diff line number Diff line Loading @@ -175,8 +175,9 @@ def generate_OMASA_scene( - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -238,6 +239,7 @@ def generate_OMASA_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -254,7 +256,7 @@ def generate_OMASA_scene( else: level = -26 logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) Loading Loading @@ -404,9 +406,10 @@ def generate_OMASA_scene( # if ISM, append object position to the OMASA object y.object_pos = x.object_pos.copy() # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -458,18 +461,28 @@ def generate_OMASA_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the OMASA audio output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1]) # convert to OMASA output to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_omasa(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_omasa(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, Loading ivas_processing_scripts/generation/generate_osba_items.py +23 −10 Original line number Diff line number Diff line Loading @@ -173,8 +173,9 @@ def generate_OSBA_scene( - Handles various audio formats (e.g., FOA, HOA2, HOA3) and applies transformations like loudness normalization, trimming, and padding. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -233,6 +234,7 @@ def generate_OSBA_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -249,7 +251,7 @@ def generate_OSBA_scene( else: level = -26 logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) Loading @@ -266,7 +268,7 @@ def generate_OSBA_scene( elif N_channels == 16: fmt = "HOA3" else: logger.info( logger.error( f"Error: Input format of the source file with {N_channels} channels is not supported!" ) sys.exit(-1) Loading Loading @@ -386,9 +388,10 @@ def generate_OSBA_scene( # if ISM, append object position to the OSBA object y.object_pos = x.object_pos.copy() # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -440,18 +443,28 @@ def generate_OSBA_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the OSBA audio output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files) # convert the OSBA output to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_osba(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_osba(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, Loading ivas_processing_scripts/generation/generate_stereo_items.py +19 −7 Original line number Diff line number Diff line Loading @@ -194,8 +194,9 @@ def generate_stereo_scene( - Writes the processed STEREO audio to output file. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -244,6 +245,7 @@ def generate_stereo_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -260,7 +262,7 @@ def generate_stereo_scene( else: level = -26 logger.info(f"Convolving {source_file} with {IR_file}") logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # read source file x = audio.fromfile("MONO", input_filename) Loading @@ -280,7 +282,7 @@ def generate_stereo_scene( # convolve MONO source audio with STEREO IR -> results in STEREO audio object x = reverb_stereo(x, IR) # adjust the level of the stereo signal # adjust the level of the STEREO signal x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") # ensure the length of the audio source signal is a multiple of 20ms Loading @@ -295,9 +297,10 @@ def generate_stereo_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim(y.audio, x.fs, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -345,5 +348,14 @@ def generate_stereo_scene( if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="STEREO") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the STEREO audio signal into output file audiofile.write(output_filename, y.audio, y.fs) Loading
ivas_processing_scripts/generation/generate_ambi_items.py +23 −11 Original line number Diff line number Diff line Loading @@ -188,8 +188,9 @@ def generate_ambi_scene( - Writes the processed FOA/HOA2/HOA3 audio to the output file. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -238,6 +239,7 @@ def generate_ambi_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -254,7 +256,7 @@ def generate_ambi_scene( else: level = -26 logger.info(f"Convolving {source_file} with {IR_file}") logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # read source file x = audio.fromfile("MONO", input_filename) Loading @@ -279,7 +281,7 @@ def generate_ambi_scene( elif cfg.format == "HOA3": x = reverb_hoa3(x, IR) # adjust the level of the target signal # adjust the level of the FOA/HOA2/HOA3 signal x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") # ensure the length of the audio source signal is a multiple of 20ms Loading @@ -294,9 +296,10 @@ def generate_ambi_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim_meta(y.audio, y.fs, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -344,20 +347,29 @@ def generate_ambi_scene( if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the FOA/HOA2/HOA3 audio signal into output file audiofile.write(output_filename, y.audio, y.fs) # convert to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_scenebased(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_scenebased(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, binaudio.fs, ) logger.info(f"Written BINAURAL output to: {binaural_output_filename}")
ivas_processing_scripts/generation/generate_ismN_items.py +23 −9 Original line number Diff line number Diff line Loading @@ -176,8 +176,9 @@ def generate_ismN_scene( - Writes the processed audio and metadata to output files. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -236,6 +237,7 @@ def generate_ismN_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -250,7 +252,7 @@ def generate_ismN_scene( else: level = -26 logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # read source file x = audio.fromtype("ISM1") Loading Loading @@ -357,9 +359,11 @@ def generate_ismN_scene( y.audio = x.audio.copy() y.object_pos = x.object_pos.copy() y.fs = x.fs # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -410,18 +414,28 @@ def generate_ismN_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the ISMn output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files) # convert to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_objectbased(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_objectbased(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, Loading
ivas_processing_scripts/generation/generate_omasa_items.py +22 −9 Original line number Diff line number Diff line Loading @@ -175,8 +175,9 @@ def generate_OMASA_scene( - Handles various audio formats (e.g., MONO, FOA, HOA2) and applies transformations like loudness normalization, trimming, and padding. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -238,6 +239,7 @@ def generate_OMASA_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -254,7 +256,7 @@ def generate_OMASA_scene( else: level = -26 logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) Loading Loading @@ -404,9 +406,10 @@ def generate_OMASA_scene( # if ISM, append object position to the OMASA object y.object_pos = x.object_pos.copy() # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -458,18 +461,28 @@ def generate_OMASA_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the OMASA audio output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files[:-1]) # convert to OMASA output to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_omasa(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_omasa(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, Loading
ivas_processing_scripts/generation/generate_osba_items.py +23 −10 Original line number Diff line number Diff line Loading @@ -173,8 +173,9 @@ def generate_OSBA_scene( - Handles various audio formats (e.g., FOA, HOA2, HOA3) and applies transformations like loudness normalization, trimming, and padding. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -233,6 +234,7 @@ def generate_OSBA_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -249,7 +251,7 @@ def generate_OSBA_scene( else: level = -26 logger.info(f"Encoding {source_file} at position(s) {source_azi},{source_ele}") logger.info(f"-- Encoding {source_file} at position(s) {source_azi},{source_ele} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # get the number of channels from the .wav file header wav_header = audiofile.parse_wave_header(input_filename) Loading @@ -266,7 +268,7 @@ def generate_OSBA_scene( elif N_channels == 16: fmt = "HOA3" else: logger.info( logger.error( f"Error: Input format of the source file with {N_channels} channels is not supported!" ) sys.exit(-1) Loading Loading @@ -386,9 +388,10 @@ def generate_OSBA_scene( # if ISM, append object position to the OSBA object y.object_pos = x.object_pos.copy() # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right metadata.trim_meta(y, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -440,18 +443,28 @@ def generate_OSBA_scene( if len(y.audio) != duration: metadata.trim_meta(y, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="BINAURAL") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the OSBA audio output to .wav file in an interleaved format and ISM metadata in .csv files audiofile.write(output_filename, y.audio, y.fs) metadata.write_ISM_metadata_in_file(y.object_pos, y.metadata_files) # convert the OSBA output to BINAURAL, if option was chosen if cfg.binaural_output: binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_osba(y, binaudio) binaural_output_filename = output_filename.with_name( output_filename.stem + "_BINAURAL" + output_filename.suffix ) logger.info(f"-- Converting to BINAURAL output file: {binaural_output_filename}") binaudio = audio.fromtype("BINAURAL") binaudio.fs = y.fs convert_osba(y, binaudio) audiofile.write( binaural_output_filename, binaudio.audio, Loading
ivas_processing_scripts/generation/generate_stereo_items.py +19 −7 Original line number Diff line number Diff line Loading @@ -194,8 +194,9 @@ def generate_stereo_scene( - Writes the processed STEREO audio to output file. """ scenes = list(cfg.scenes.keys()) logger.info( f"Processing scene: {scene_name} out of {len(cfg.scenes)} scenes, output: {scene['output']}" f"Processing scene \"{scene_name}\" ({scenes.index(scene_name) + 1} out of {len(scenes)}), output file: {scene['output']}" ) # extract the number of audio sources Loading Loading @@ -244,6 +245,7 @@ def generate_stereo_scene( source_shift = 0.0 # convert overlap to samples and ensure it is a multiple of 20ms source_shift_in_seconds = source_shift source_shift = source_shift * cfg.fs if source_shift >= 0: source_shift = int(np.floor(source_shift / frame_len) * frame_len) Loading @@ -260,7 +262,7 @@ def generate_stereo_scene( else: level = -26 logger.info(f"Convolving {source_file} with {IR_file}") logger.info(f"-- Convolving {source_file} with {IR_file} at {level} LUFS with shift of {source_shift_in_seconds} seconds") # read source file x = audio.fromfile("MONO", input_filename) Loading @@ -280,7 +282,7 @@ def generate_stereo_scene( # convolve MONO source audio with STEREO IR -> results in STEREO audio object x = reverb_stereo(x, IR) # adjust the level of the stereo signal # adjust the level of the STEREO signal x.audio, _ = loudness_norm(x, level, loudness_format="STEREO") # ensure the length of the audio source signal is a multiple of 20ms Loading @@ -295,9 +297,10 @@ def generate_stereo_scene( # add source signal to the array of all source signals y.audio = x.audio.copy() # if source_shift < 0: # # insert zeros to the new audio source signal to shift it right # metadata.trim_meta(y, limits=[source_shift, 0], samples=True) if source_shift < 0: # insert zeros to the new audio source signal to shift it right y.audio = audioarray.trim(y.audio, x.fs, limits=[source_shift, 0], samples=True) else: offset = source_shift else: # shift the beginning of the audio source signal Loading Loading @@ -345,5 +348,14 @@ def generate_stereo_scene( if len(y.audio) != duration: y.audio = audioarray.trim(y.audio, y.fs, limits=[0, len(y.audio) - duration], samples=True) # adjust the loudness of the output signal logger.info(f"-- Rescaling to target loudness: {cfg.loudness} LUFS") y.audio, _ = loudness_norm(y, cfg.loudness, loudness_format="STEREO") # apply fade-in and fade-out if "fade_in_out" in cfg.__dict__ and cfg.fade_in_out > 0: logger.info(f"-- Applying fade-in and fade-out with {cfg.fade_in_out} seconds") y.audio = audioarray.window(y.audio, y.fs, cfg.fade_in_out * 1000) # write the STEREO audio signal into output file audiofile.write(output_filename, y.audio, y.fs)