Commit 5a00b481 authored by PLAINSI's avatar PLAINSI
Browse files

HOA2 processing

parent 9f73b183
Loading
Loading
Loading
Loading
Loading
+58 −0
Original line number Diff line number Diff line
---
################################################
# General configuration
################################################

### Output format
format: "HOA2"

### Output sampling rate in Hz needed for headerless audio files; default = 48000
fs: 48000

### IR sampling rate in Hz needed for headerless audio files; default = 48000
IR_fs: 48000

### Any relative paths will be interpreted relative to the working directory the script is called from!
### Usage of absolute paths is recommended.
### Do not use file names with dots "." in them! This is not supported, use "_" instead
### For Windows users: please use double back slash '\\' in paths and add '.exe' to executable definitions

### Input path to mono files
input_path: "./items_mono"

### Input path to stereo impulse response files, default = './ivas_processing_scripts/generation/IR'
IR_path: "./IRs"

### Output path for generated test items and metadata files
output_path: "./items_HOA2"

### Target loudness in LKFS; if not specified, -26 is used
loudness: -26

### Pre-amble and Post-amble length in seconds (default = 0.0)
preamble: 0.5
postamble: 1.0

### Flag for adding low-level random background noise (amplitude +-4) instead of silence; default = false (silence)
add_low_level_random_noise: true


################################################
### Scene description
################################################

### Each scene must start with the sceneN tag
### Specify the mono source filename (the program will search for it in the input_path folder)
### Specify the IR source filename (the program will search for it in the IR_path folder)
### Specify the overlap length in seconds for each input source (negative value creates a gap)
### Note 1: use [val1, val2, ...] for multiple sources in a scene
### Note 2: use the "start:step:stop" notation for moving sources, where step will be applied in 20ms frames

scenes:
    a1: 
        name: "out.wav"
        description: ""
        source: ["fa1.wav", "ma1.wav"]
        IR: ["IR_HOA2_env1/FreefieldFloor_TalkPos1_EigenHoA2_SinSweep_9chn.wav", "IR_HOA2_env1/FreefieldFloor_TalkPos2_EigenHoA2_SinSweep_9chn.wav"]
        overlap: -0.2
        
+65 −1
Original line number Diff line number Diff line
@@ -238,7 +238,7 @@ def reverb_foa(
        H = fft(foa_IR.audio, axis=0)
        align = 1.0 / np.max(np.abs(H))

    # convolve mono input with left and right IR
    # convolve mono input with FOA IR
    y_w = reverb(input, IR_w, align=align)
    y_x = reverb(input, IR_x, align=align)
    y_y = reverb(input, IR_y, align=align)
@@ -251,3 +251,67 @@ def reverb_foa(
    y.audio = np.column_stack([y_w.audio, y_x.audio, y_y.audio, y_z.audio])

    return y


def reverb_hoa2(
    input: Audio,
    hoa2_IR: Audio,
    align: Optional[float] = None,
) -> Audio:
    """
    Wrapper for the ITU-T reverb binary to convolve a mono audio signal with an HOA2 impulse response

    Parameters
    ----------
    input: Audio
        Input mono audio signal
    hoa2_IR: Audio
        HOA2 (9-channel) impulse response
    align: Optional[float]
        Multiplicative factor to apply to the reverberated sound in order to align
        its energy level; if None, it is derived so that the maximum gain of the
        IR filter across all frequencies is 0 dB

    Returns
    -------
    y: Audio
        Input signal convolved with the HOA2 IR (9 output channels)
    """

    # convert to float32 (modifies hoa2_IR in place, mirroring the other reverb wrappers)
    hoa2_IR.audio = np.float32(hoa2_IR.audio)

    numchannels = 9  # HOA2 by definition: (2+1)^2 ambisonic channels

    # calculate the scaling (multiplicative) factor such that the maximum gain
    # of the IR filter across all frequencies is 0 dB
    if align is None:
        H = fft(hoa2_IR.audio, axis=0)
        align = 1.0 / np.max(np.abs(H))

    # convolve the mono input with each IR channel separately, reusing one
    # single-channel Audio container as the per-channel IR
    IR = copy(hoa2_IR)
    IR.name = "MONO"
    IR.num_channels = 1
    ych = []
    for i in range(numchannels):
        # separate IR into each channel (column vector shape expected by reverb)
        IR.audio = np.reshape(hoa2_IR.audio[:, i], (-1, 1))
        # convolve mono input with channel IR
        ych.append(reverb(input, IR, align=align))

    # combine the per-channel results into one HOA2 output signal
    y = copy(input)
    y.name = "HOA2"
    y.num_channels = numchannels
    y.audio = np.column_stack([c.audio for c in ych])

    return y
+218 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3

#
#  (C) 2022-2023 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
#  Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
#  Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
#  Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
#  contributors to this repository. All Rights Reserved.
#
#  This software is protected by copyright law and by international treaties.
#  The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
#  Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
#  Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
#  Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
#  contributors to this repository retain full ownership rights in their respective contributions in
#  the software. This notice grants no license of any kind, including but not limited to patent
#  license, nor is any license granted by implication, estoppel or otherwise.
#
#  Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
#  contributions.
#
#  This software is provided "AS IS", without any express or implied warranties. The software is in the
#  development stage. It is intended exclusively for experts who have experience with such software and
#  solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
#  and fitness for a particular purpose are hereby disclaimed and excluded.
#
#  Any dispute, controversy or claim arising under or in relation to providing this software shall be
#  submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
#  accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
#  the United Nations Convention on Contracts on the International Sales of Goods.
#

import logging
import os
from math import floor

import numpy as np

from ivas_processing_scripts.audiotools import audio, audiofile
from ivas_processing_scripts.audiotools.wrappers.bs1770 import get_loudness
from ivas_processing_scripts.audiotools.wrappers.reverb import reverb_hoa2
from ivas_processing_scripts.generation import config

SEED_RANDOM_NOISE = 0


# helper: format rows of numbers as rows of strings with 2 decimal digits
def csv_formatdata(data):
    """Yield each row of *data* with every value rendered to two decimal places."""
    for record in data:
        yield [f"{value:.2f}" for value in record]


def generate_hoa2_items(
    cfg: config.TestConfig,
    logger: logging.Logger,
):
    """Generate HOA2 items from mono items based on scene description

    For every scene in cfg.scenes: convolve each mono source with its HOA2
    impulse response, normalize loudness, offset/overlap the sources, pad to
    20ms frame boundaries, mix the sources, optionally add pre-/post-amble and
    low-level noise, then write the 9-channel result to cfg.output_path.

    Parameters
    ----------
    cfg: config.TestConfig
        Test configuration parsed from the YAML scene description
    logger: logging.Logger
        Logger for progress messages
    """

    # get the number of scenes
    N_scenes = len(cfg.scenes)

    # set the target level in LKFS; default = -26
    if "loudness" not in cfg.__dict__:
        cfg.loudness = -26

    # set the output sampling rate in Hz; default = 48000
    if "fs" not in cfg.__dict__:
        cfg.fs = 48000

    # set the IR sampling rate in Hz; default = 48000
    if "IR_fs" not in cfg.__dict__:
        cfg.IR_fs = 48000

    # set the pre-amble and post-amble lengths in seconds; default = 0.0 (none)
    if "preamble" not in cfg.__dict__:
        cfg.preamble = 0.0

    if "postamble" not in cfg.__dict__:
        cfg.postamble = 0.0

    # set the IR path; default = the "IR" directory next to this script
    if "IR_path" not in cfg.__dict__:
        cfg.IR_path = os.path.join(os.path.dirname(__file__), "IR")

    # set the background-noise flag; default = false (pad with silence)
    if "add_low_level_random_noise" not in cfg.__dict__:
        cfg.add_low_level_random_noise = False

    # repeat for all source files
    for scene_name, scene in cfg.scenes.items():
        logger.info(
            f"Processing scene: {scene_name} out of {N_scenes} scenes, name: {scene['name']}"
        )

        # extract the number of audio sources (scene["source"] may be a scalar or a list)
        N_sources = len(np.atleast_1d(scene["source"]))

        # read the overlap length in seconds (negative value creates a gap)
        if "overlap" in scene.keys():
            source_overlap = float(scene["overlap"])
        else:
            source_overlap = 0.0

        # accumulator for the mixed HOA2 scene signal (y.audio starts as None)
        y = audio.SceneBasedAudio("HOA2")
        for i in range(N_sources):
            # parse parameters from the scene description
            source_file = np.atleast_1d(scene["source"])[i]
            IR_file = np.atleast_1d(scene["IR"])[i]

            logger.info(f"Convolving {source_file} with {IR_file}")

            # read source file
            x = audio.fromfile(
                "MONO", os.path.join(cfg.input_path, source_file), fs=cfg.fs
            )

            # read the IR file (9-channel HOA2 impulse response)
            IR = audio.fromfile(
                "HOA2", os.path.join(cfg.IR_path, IR_file), fs=cfg.IR_fs
            )

            # convolve with HOA2 IR
            x = reverb_hoa2(x, IR)

            # adjust the level of the HOA2 signal
            # NOTE(review): loudness is measured in "BINAURAL" mode although the
            # signal is HOA2 — confirm this is the intended measurement format
            _, scale_factor, _ = get_loudness(x, cfg.loudness, "BINAURAL")
            x.audio *= scale_factor

            # shift the second (and all other) source files (positive shift creates overlap, negative shift creates a gap)
            if i > 0 and source_overlap != 0.0:
                # get the length of the previously accumulated signal in samples
                N_delay = len(y.audio[:, 0])

                # add the shift (overlap reduces the delay, a gap increases it)
                N_delay += int(-source_overlap * x.fs)

                # insert all-zero preamble so this source starts after the delay
                pre = np.zeros((N_delay, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])

            # pad with zeros to ensure that the signal length is a multiple of 20ms
            N_frame = x.fs / 50
            if len(x.audio) % N_frame != 0:
                N_pad = int(N_frame - len(x.audio) % N_frame)

                # insert all-zero preamble (padding is prepended, not appended)
                pre = np.zeros((N_pad, x.audio.shape[1]))
                x.audio = np.concatenate([pre, x.audio])

            # add source signal to the array of source signals
            y.fs = x.fs
            if y.audio is None:
                # first source: initialize the mix
                y.audio = x.audio
            else:
                # pad with zeros to have equal length of all source signals
                if x.audio.shape[0] > y.audio.shape[0]:
                    y.audio = np.vstack(
                        (
                            y.audio,
                            np.zeros(
                                (
                                    x.audio.shape[0] - y.audio.shape[0],
                                    y.audio.shape[1],
                                )
                            ),
                        )
                    )
                elif y.audio.shape[0] > x.audio.shape[0]:
                    x.audio = np.vstack(
                        (
                            x.audio,
                            np.zeros(
                                (
                                    y.audio.shape[0] - x.audio.shape[0],
                                    x.audio.shape[1],
                                )
                            ),
                        )
                    )

                # superimpose
                y.audio += x.audio

        # append pre-amble and post-amble to all sources
        if cfg.preamble != 0.0:
            # ensure that pre-amble is a multiple of 20ms (floor to whole frames)
            N_pre = int(floor(cfg.preamble * 50) / 50 * y.fs)

            # insert all-zero preamble to all sources
            pre = np.zeros((N_pre, y.audio.shape[1]))
            y.audio = np.concatenate([pre, y.audio])

        if cfg.postamble != 0.0:
            # ensure that post-amble is a multiple of 20ms (floor to whole frames)
            N_post = int(floor(cfg.postamble * 50) / 50 * y.fs)

            # append all-zero postamble to all sources
            post = np.zeros((N_post, y.audio.shape[1]))
            y.audio = np.concatenate([y.audio, post])

        # add random noise
        if cfg.add_low_level_random_noise:
            # create uniformly distributed integer noise between -4 and 4,
            # seeded for reproducibility
            np.random.seed(SEED_RANDOM_NOISE)
            noise = np.random.randint(low=-4, high=5, size=y.audio.shape).astype(
                "float"
            )

            # superimpose
            y.audio += noise

        # write the reverberated audio into output file
        output_filename = scene["name"]
        audiofile.write(
            os.path.join(cfg.output_path, output_filename), y.audio, y.fs
        )  # !!!! TBD: replace all os.path.xxx operations with the Path object

    return