Loading item_generation_scripts/processing/process_ism_items.py +51 −32 Original line number Diff line number Diff line Loading @@ -36,8 +36,8 @@ import logging import os from pathlib import Path from typing import Optional import numpy as np from math import floor from item_generation_scripts.audiotools import audio, audiofile from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness Loading Loading @@ -69,8 +69,14 @@ def generate_ism_items( # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) y = None # initialize output variables if format == "ISM2": y = audio.ChannelBasedAudio("STEREO") else: y = audio.ChannelBasedAudio("MONO") y_meta = None # repeat for all source files for i in range(N_sources): # parse parameters from the scene description Loading @@ -87,16 +93,18 @@ def generate_ism_items( ) # read source file audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) x = audio_object.audio fs = audio_object.fs x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) # find the number of frames N_frames = int(len(x) / fs * 50 + 1) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) # trim the source signal to align to 20ms boundary len = int(N_frames * x.fs / 50) x.audio = x.audio[:len] # adjust the level of the source file _, scale_factor = get_loudness(audio_object, target_level, "MONO") x *= scale_factor _, scale_factor = get_loudness(x, target_level, "MONO") x.audio *= scale_factor # read azimuth information and create array if isinstance(source_azi, str): Loading Loading @@ -167,59 +175,70 @@ def generate_ism_items( # delay the source file if source_delay > 0: pre = np.zeros((int(source_delay * fs), x.shape[1])) x = np.concatenate([pre, x]) # ensure delay is a multiple of 20ms N_delay = int(floor(source_delay * 50) / 50 * x.fs) # insert all-zero preamble pre = np.zeros((N_delay, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) # apply delay to metadata as well # insert neutral position as a pre-amble pre = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (int(source_delay * 50), 1) ) # pre = np.zeros((int(source_delay * 50), x_meta.shape[1])) [0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1) ) # !!!! TBD - check if we should insert netrual position or the first position of the metadata x_meta = np.concatenate([pre, x_meta]) # add source signal to the array of source signals if y is None: y = x # add source signal to the array of all source signals y.fs = x.fs if y.audio is None: y.audio = x.audio else: # append zeros to have equal length of all source signals if x.shape[0] > y.shape[0]: y = np.vstack((y, np.zeros((x.shape[0] - y.shape[0], y.shape[1])))) elif y.shape[0] > x.shape[0]: x = np.vstack((x, np.zeros((y.shape[0] - x.shape[0], x.shape[1])))) y = np.hstack((y, x)) if x.audio.shape[0] > y.audio.shape[0]: y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1])))) elif y.audio.shape[0] > x.audio.shape[0]: x.audio = np.vstack((x.audio, np.zeros((y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1])))) y.audio = np.hstack((y.audio, x.audio)) # add metadata to the array of all metadata x_meta = x_meta[np.newaxis, :] # make sure x_meta is a 3d array # make sure x_meta is a 3d array x_meta = x_meta[np.newaxis, :] if y_meta is None: y_meta = x_meta else: N_srcs = y_meta.shape[0] N_meta_features = y_meta.shape[2] # append postamble (create by repeating the last row of metadata) to have equal length of all metadata # append the last position of the metadata to have equal length of all metadata if x_meta.shape[1] > y_meta.shape[1]: N_delta = x_meta.shape[1] - y_meta.shape[1] y_meta = y_meta.reshape(y_meta.shape[1], -1) # reshape to 2d array # reshape to 2d array y_meta = y_meta.reshape(y_meta.shape[1], -1) # repeat last row N_delta times and append to the array y_meta = np.vstack( (y_meta, np.tile(y_meta[-1, :], (N_delta, 1))) ) # repeat last row N_delta times and append to the array ) # reshape back to 3d array y_meta = y_meta.reshape( N_srcs, -1, N_meta_features ) # reshape back to 3d array ) elif y_meta.shape[1] > x_meta.shape[1]: N_delta = y_meta.shape[1] - x_meta.shape[1] x_meta = x_meta.reshape(x_meta.shape[1], -1) # reshape to 2d array # reshape to 2d array x_meta = x_meta.reshape(x_meta.shape[1], -1) # repeat last row N_delta times and append to the array x_meta = np.vstack( (x_meta, np.tile(x_meta[-1, :], (N_delta, 1))) ) # repeat last row N_delta times and append to the array x_meta = np.expand_dims(x_meta, axis=0) # reshape back to 3d array ) # reshape back to 3d array x_meta = np.expand_dims(x_meta, axis=0) y_meta = np.concatenate([y_meta, x_meta]) # write individual ISM audio streams to the output file in an interleaved format output_filename = scene["name"] audiofile.write( os.path.join(output_path, output_filename), y, fs os.path.join(output_path, output_filename), y.audio, y.fs ) # !!!! TBD: replace all os.path.xxx operations with the Path object # write individual ISM metadata to output files in .csv format Loading Loading
item_generation_scripts/processing/process_ism_items.py +51 −32 Original line number Diff line number Diff line Loading @@ -36,8 +36,8 @@ import logging import os from pathlib import Path from typing import Optional import numpy as np from math import floor from item_generation_scripts.audiotools import audio, audiofile from item_generation_scripts.audiotools.wrappers.bs1770 import get_loudness Loading Loading @@ -69,8 +69,14 @@ def generate_ism_items( # extract the number of audio sources N_sources = len(np.atleast_1d(scene["source"])) y = None # initialize output variables if format == "ISM2": y = audio.ChannelBasedAudio("STEREO") else: y = audio.ChannelBasedAudio("MONO") y_meta = None # repeat for all source files for i in range(N_sources): # parse parameters from the scene description Loading @@ -87,16 +93,18 @@ def generate_ism_items( ) # read source file audio_object = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) x = audio_object.audio fs = audio_object.fs x = audio.fromfile("MONO", os.path.join(input_path, source_file), fs=fs) # find the number of frames N_frames = int(len(x) / fs * 50 + 1) # get the number of frames (multiple of 20ms) N_frames = int(len(x.audio) / x.fs * 50) # trim the source signal to align to 20ms boundary len = int(N_frames * x.fs / 50) x.audio = x.audio[:len] # adjust the level of the source file _, scale_factor = get_loudness(audio_object, target_level, "MONO") x *= scale_factor _, scale_factor = get_loudness(x, target_level, "MONO") x.audio *= scale_factor # read azimuth information and create array if isinstance(source_azi, str): Loading Loading @@ -167,59 +175,70 @@ def generate_ism_items( # delay the source file if source_delay > 0: pre = np.zeros((int(source_delay * fs), x.shape[1])) x = np.concatenate([pre, x]) # ensure delay is a multiple of 20ms N_delay = int(floor(source_delay * 50) / 50 * x.fs) # insert all-zero preamble pre = np.zeros((N_delay, x.audio.shape[1])) x.audio = np.concatenate([pre, x.audio]) # apply delay to metadata as well # insert neutral position as a pre-amble pre = np.tile( [0.00, 0.00, 1.00, 0.00, 1.00], (int(source_delay * 50), 1) ) # pre = np.zeros((int(source_delay * 50), x_meta.shape[1])) [0.00, 0.00, 1.00, 0.00, 1.00], (N_delay, 1) ) # !!!! TBD - check if we should insert netrual position or the first position of the metadata x_meta = np.concatenate([pre, x_meta]) # add source signal to the array of source signals if y is None: y = x # add source signal to the array of all source signals y.fs = x.fs if y.audio is None: y.audio = x.audio else: # append zeros to have equal length of all source signals if x.shape[0] > y.shape[0]: y = np.vstack((y, np.zeros((x.shape[0] - y.shape[0], y.shape[1])))) elif y.shape[0] > x.shape[0]: x = np.vstack((x, np.zeros((y.shape[0] - x.shape[0], x.shape[1])))) y = np.hstack((y, x)) if x.audio.shape[0] > y.audio.shape[0]: y.audio = np.vstack((y.audio, np.zeros((x.audio.shape[0] - y.audio.shape[0], y.audio.shape[1])))) elif y.audio.shape[0] > x.audio.shape[0]: x.audio = np.vstack((x.audio, np.zeros((y.audio.shape[0] - x.audio.shape[0], x.audio.shape[1])))) y.audio = np.hstack((y.audio, x.audio)) # add metadata to the array of all metadata x_meta = x_meta[np.newaxis, :] # make sure x_meta is a 3d array # make sure x_meta is a 3d array x_meta = x_meta[np.newaxis, :] if y_meta is None: y_meta = x_meta else: N_srcs = y_meta.shape[0] N_meta_features = y_meta.shape[2] # append postamble (create by repeating the last row of metadata) to have equal length of all metadata # append the last position of the metadata to have equal length of all metadata if x_meta.shape[1] > y_meta.shape[1]: N_delta = x_meta.shape[1] - y_meta.shape[1] y_meta = y_meta.reshape(y_meta.shape[1], -1) # reshape to 2d array # reshape to 2d array y_meta = y_meta.reshape(y_meta.shape[1], -1) # repeat last row N_delta times and append to the array y_meta = np.vstack( (y_meta, np.tile(y_meta[-1, :], (N_delta, 1))) ) # repeat last row N_delta times and append to the array ) # reshape back to 3d array y_meta = y_meta.reshape( N_srcs, -1, N_meta_features ) # reshape back to 3d array ) elif y_meta.shape[1] > x_meta.shape[1]: N_delta = y_meta.shape[1] - x_meta.shape[1] x_meta = x_meta.reshape(x_meta.shape[1], -1) # reshape to 2d array # reshape to 2d array x_meta = x_meta.reshape(x_meta.shape[1], -1) # repeat last row N_delta times and append to the array x_meta = np.vstack( (x_meta, np.tile(x_meta[-1, :], (N_delta, 1))) ) # repeat last row N_delta times and append to the array x_meta = np.expand_dims(x_meta, axis=0) # reshape back to 3d array ) # reshape back to 3d array x_meta = np.expand_dims(x_meta, axis=0) y_meta = np.concatenate([y_meta, x_meta]) # write individual ISM audio streams to the output file in an interleaved format output_filename = scene["name"] audiofile.write( os.path.join(output_path, output_filename), y, fs os.path.join(output_path, output_filename), y.audio, y.fs ) # !!!! TBD: replace all os.path.xxx operations with the Path object # write individual ISM metadata to output files in .csv format Loading