From 2ad1e20ae1ce44b56b8e19fe536a20055929499f Mon Sep 17 00:00:00 2001
From: Archit Tamarapu
Date: Wed, 13 Jul 2022 19:28:34 +0200
Subject: [PATCH 1/2] updates to python scripts from internal development repo

* LFE in binaural rendering updated
* Removed the dependency on the prerenderer for certain formats
* MC to MC conversions now use the same tables as IVAS
* Allow specifying ISM metadata files on the commandline (`-m` / `--metadata`)
* EFAP python implementation in `pyaudio3dtools/EFAP.py`
* ALLRAD python implementation added to `hoadecoder.py`, .mat files removed
* Restructuring of `spatialaudioconvert()` to be cleaner and more modular
* Improved support of metadata (mixed-scene) text file input
---
 scripts/README.md                             |  64 +-
 ...t => IIS_BRIR_officialMPEG_222UC_SBA3.mat} |   0
 scripts/pyaudio3dtools/EFAP.py                | 851 ++++++++++++++++++
 scripts/pyaudio3dtools/__init__.py            |   2 +-
 scripts/pyaudio3dtools/audio3dtools.py        |  57 +-
 scripts/pyaudio3dtools/audioarray.py          |  31 +-
 scripts/pyaudio3dtools/audiofile.py           |  83 +-
 scripts/pyaudio3dtools/binauralRef.py         | 342 -------
 scripts/pyaudio3dtools/binauralrenderer.py    | 721 +++++++++++----
 scripts/pyaudio3dtools/constants.py           | 386 ++++++++
 scripts/pyaudio3dtools/hoadecoder.py          |  73 +-
 scripts/pyaudio3dtools/prerenderer.py         | 255 ------
 .../pyaudio3dtools/quaternions/functions.py   |  30 +-
 scripts/pyaudio3dtools/rotateHOA.py           |  44 +-
 scripts/pyaudio3dtools/rotateISM.py           |  28 +-
 scripts/pyaudio3dtools/rotateMC.py            |  96 ++
 scripts/pyaudio3dtools/spatialaudioconvert.py | 472 ++++++----
 scripts/pyaudio3dtools/spatialaudioformat.py  |  44 +-
 scripts/pyaudio3dtools/spatialmetadata.py     |  67 +-
 scripts/pyprocessing/evs.py                   |  13 +-
 scripts/pyprocessing/ivas.py                  |  14 +-
 scripts/pyprocessing/prepost_processing.py    |  28 +-
 scripts/pyprocessing/processing_configs.py    |   4 +-
 23 files changed, 2550 insertions(+), 1155 deletions(-)
 rename scripts/pyaudio3dtools/BRIRs_mat/{IIS_BRIR_officialMPEG_222UC_VIRTUALLS_SBA3.mat => IIS_BRIR_officialMPEG_222UC_SBA3.mat} (100%)
 create mode 100644 scripts/pyaudio3dtools/EFAP.py
 delete mode 100644 scripts/pyaudio3dtools/binauralRef.py
 create mode 100644 scripts/pyaudio3dtools/constants.py
 delete mode 100644 scripts/pyaudio3dtools/prerenderer.py
 create mode 100644 scripts/pyaudio3dtools/rotateMC.py

diff --git a/scripts/README.md b/scripts/README.md
index 7264ccf7a5..57084e1bdf 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -35,26 +35,29 @@ title: Python scripts for Testing the IVAS code and Generating test items
 ---
 # Python scripts for Testing the IVAS code and Generating test items
 ## Contents
-0. [Requirements](#0-requirements)
-1. [Scripts and classes for testing IVAS code](#1-scripts-and-classes-for-testing-ivas-code)
-    - [Classes](#11-classes)
-    - [Output directory structure](#12-output-directory-structure)
-    - [Scripts](#13-scripts)
+- [Python scripts for Testing the IVAS code and Generating test items](#python-scripts-for-testing-the-ivas-code-and-generating-test-items)
+  - [Contents](#contents)
+  - [0. Requirements](#0-requirements)
+  - [1. 
Scripts and classes for testing IVAS code](#1--scripts-and-classes-for-testing-ivas-code) + - [1.1 Classes](#11-classes) + - [1.2 Output directory structure](#12-output-directory-structure) + - [1.3 Scripts](#13-scripts) - [Common commandline options for the scripts](#common-commandline-options-for-the-scripts) - - [runIvasCodec.py](#runivascodecpy) - - [IvasBuildAndRun.py](#ivasbuildandrunpy) - - [IvasBuildAndRunChecks.py](#ivasbuildandruncheckspy) - - [testBitexact.py](#testbitexactpy) - - [self_test.py](#self_testpy) -2. [Script for generating listening test items](#2-script-for-generating-listening-test-items) - - [generate_test_items.py](#21-generate_test_itemspy) - - [Test configuration file](#22-test-configuration-file) - - [Supported test conditions](#23-supported-test-conditions) - - [Supported input/output/rendered audio formats](#24-supported-inputoutputrendered-audio-formats) - - [Pre-renderer Metadata definiton](#25-pre-renderer-metadata-definition) -3. [Script for converting formats and binauralizing](#3-script-for-converting-formats-and-binauralizing) - - [Binauralizing with head rotation](#31-binauralizing-with-head-rotation) - - [Generating binaural reference signals](#32-generating-binaural-reference-signals) + - [`runIvasCodec.py`](#runivascodecpy) + - [`IvasBuildAndRun.py`](#ivasbuildandrunpy) + - [`IvasBuildAndRunChecks.py`](#ivasbuildandruncheckspy) + - [`testBitexact.py`](#testbitexactpy) + - [`self_test.py`](#self_testpy) + - [2. Script for generating listening test items](#2-script-for-generating-listening-test-items) + - [2.1. `generate_test_items.py`](#21-generate_test_itemspy) + - [2.2. Test configuration file](#22-test-configuration-file) + - [2.3. Supported test conditions](#23-supported-test-conditions) + - [2.4. Supported input/output/rendered audio formats](#24-supported-inputoutputrendered-audio-formats) + - [2.5. Processing](#25-processing) + - [2.6. Renderer Metadata definition](#26-renderer-metadata-definition) + - [3. Script for converting formats and binauralizing](#3-script-for-converting-formats-and-binauralizing) + - [3.1. Binauralizing with head rotation](#31-binauralizing-with-head-rotation) + - [3.2. Generating binaural reference signals](#32-generating-binaural-reference-signals) --- @@ -556,12 +560,12 @@ Multiple conditions for evs_mono and ivas can be specified by using underscore s |--------------------------------------------------|----------------------|------------------------------------------------| | MONO | yes/yes/yes | mono signals | | STEREO | yes/yes/yes | stereo signals | -| ISM or ISMx | yes/no/no | Objects with metadata, description using pre-renderer metadata | +| ISM or ISMx | yes/no/no | Objects with metadata, description using renderer metadata | | MASA or MASAx | yes/no/no | mono or stereo signals with spatial metadata !!!metadata must share same basename as waveform file but with .met extension!!! | | FOA/HOA2/HOA3 or PLANAR(FOA/HOAx) | yes/yes/yes | Ambisonic signals or planar ambisonic signals | | BINAURAL/BINAURAL_ROOM | no/yes/yes | Binaural signals | | 5_1/5_1_2/5_1_4/7_1/7_1_4 or CICP[6/12/14/16/19] | yes/yes/yes | Multi-channel signals for predefined loudspeaker layout | -| META | yes/yes/no | Audio scene described by a pre-renderer config | +| META | yes/yes/no | Audio scene described by a renderer config | --- @@ -585,17 +589,17 @@ The processing chain is as follows: --- -### 2.6. Pre-renderer Metadata definition +### 2.6. 
Renderer Metadata definition

-To run, the pre-renderer requires a config file describing the input scene.The expected format of the config file is as follows:
+To run, the renderer requires a config file describing the input scene. The expected format of the config file is as follows:

---

-- Line 1: Path to a "multitrack" audio file. This should be a single multichannel wav/pcm file that contains all input audio. For example channels 1-4 can be an FOA scene,channel 5 - an object and channels 6-11 - a 5.1 channel bed. If the path is not absolute, it is considered relative to the pre-renderer executable, not the config file. This path has lower priority than the one given on the command line: *The path in the config file is ignored if the --inputAudio argument to the pre-renderer executable is specified.* When running together with the encoder using EVS_cod_prerenderer.sh, the path in the config file is ignored and the one provided as script argument is used instead.
+- Line 1: Path to a "multitrack" audio file. This should be a single multichannel wav/pcm file that contains all input audio. For example, channels 1-4 can be an FOA scene, channel 5 an object, and channels 6-11 a 5.1 channel bed. If the path is not absolute, it is considered relative to the renderer executable, not the config file. This path has lower priority than the one given on the command line: *The path in the config file is ignored if the --inputAudio argument to the renderer executable is specified.*

---

-- Line 2: Contains number of inputs. An input can either be an Ambisonics scene, anobject or a channel bed.This is NOT the total number of channels in the input audio file.The pre-renderer currently supports simultaneously: *Up to 2 SBA inputs, Up to 2 MC inputs* Up to 16 ISM inputsThese limits can be freely changed with pre-processor macros, if needed.
+- Line 2: Contains the number of inputs. An input can either be an Ambisonics scene, an object, or a channel bed. This is NOT the total number of channels in the input audio file. The renderer currently supports simultaneously: *up to 2 SBA inputs, up to 2 MC inputs, and up to 16 ISM inputs*. These limits can be freely changed with pre-processor macros, if needed.

---

- Following lines:
@@ -625,7 +629,9 @@ ISM

## 3. Script for converting formats and binauralizing

-The script audio3dtool.py can convert between different input and output formats and binauralize the signals.
+The script audio3dtools.py can convert between different input and output formats and binauralize signals.
+
+Execute `python -m pyaudio3dtools.audio3dtools --help` for usage.

### 3.1. Binauralizing with head rotation

@@ -642,3 +648,9 @@
```
python -m pyaudio3dtools.audio3dtools -i cicp6_input.wav -o . -F BINAURAL_REF -T .\trajectories\full_circle_in_15s
```
+
+### 3.3. Rendering ISM to Custom loudspeakers with auxiliary binaural output
+ISM metadata can either be specified via an input text file in the Renderer Metadata definition format, or via the commandline using the same style as IVAS:
+```
+python -m pyaudio3dtools.audio3dtools -i ism2.wav -f ISM2 -m ism1.csv NULL -F 7_1_4 -o . 
-b -T .\trajectories\full_circle_in_15s +``` diff --git a/scripts/pyaudio3dtools/BRIRs_mat/IIS_BRIR_officialMPEG_222UC_VIRTUALLS_SBA3.mat b/scripts/pyaudio3dtools/BRIRs_mat/IIS_BRIR_officialMPEG_222UC_SBA3.mat similarity index 100% rename from scripts/pyaudio3dtools/BRIRs_mat/IIS_BRIR_officialMPEG_222UC_VIRTUALLS_SBA3.mat rename to scripts/pyaudio3dtools/BRIRs_mat/IIS_BRIR_officialMPEG_222UC_SBA3.mat diff --git a/scripts/pyaudio3dtools/EFAP.py b/scripts/pyaudio3dtools/EFAP.py new file mode 100644 index 0000000000..73b8df6ee0 --- /dev/null +++ b/scripts/pyaudio3dtools/EFAP.py @@ -0,0 +1,851 @@ +#!/usr/bin/env python3 + +""" + (C) 2022 Baseline Development Group with portions copyright Dolby International AB, Ericsson AB, + Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., + Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies OY, Orange, + Panasonic Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation. All Rights Reserved. + + This software is protected by copyright law and by international treaties. + The Baseline Development Group consisting of Dolby International AB, Ericsson AB, + Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., + Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies OY, Orange, + Panasonic Corporation, Qualcomm Technologies, Inc., and VoiceAge Corporation retain full ownership + rights in their respective contributions in the software. No license of any kind, including but not + limited to patent license, of any foregoing parties is hereby granted by implication, estoppel or + otherwise. + + This software is provided "AS IS", without any express or implied warranties. The software is in the + development stage. It is intended exclusively for experts who have experience with such software and + solely for the purpose of inspection. All implied warranties of non-infringement, merchantability + and/or fitness for a particular purpose are hereby disclaimed and excluded. + + Any dispute, controversy or claim arising under or in relation to providing this software shall be + submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in + accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and + the United Nations Convention on Contracts on the International Sales of Goods. 
+""" + +import argparse +import os +from itertools import combinations +from typing import Optional, Tuple, Union + +import numpy as np + + +def wrap_angles( + azi: float, ele: float, clip_ele: Optional[bool] = False +) -> Tuple[float, float]: + """ + Wrap angles to (-180, 180] azimuth and [-90, 90] elevation + Takes into account hemisphere flips from large elevation changes unless clip_ele is specified + """ + if clip_ele: + ele = min(max(ele, -90), 90) + + if ele != 0 and ele % 90 == 0: + # if elevation is a multiple of 90, azimuth is irrelevant since we are at a pole + azi = 0 + while np.abs(ele) > 90: + ele -= 360 + else: + # wrap elevation value + while np.abs(ele) > 90: + # flip azimuth to other hemisphere + azi += 180 + + # compensate elevation accordingly + if ele > 90: + ele -= 180 + elif ele < -90: + ele += 180 + + # wrap azimuth value + while np.abs(azi) > 180: + azi = (azi + 180) % 360 + if azi < 0: + azi += 360 + azi -= 180 + + # set -180 azimuth to 180 + if azi == -180: + azi = 180 + + return azi, ele + + +class EfapVertex: + """ + Vertex data structure for EFAP + Initialises a vertex from the given spherical coordinate pair, with a flag specifying if it is a ghost loudspeaker + + + Parameters + ---------- + azi : float + Azimuth of vertex + ele : float + Elevation of vertex + is_ghost : bool + Whether the vertex is a ghost, default is False + """ + + def __init__(self, azi: float, ele: float, is_ghost: Optional[bool] = False): + self.azi, self.ele = wrap_angles(azi, ele) + self.pos = np.array( + [ + np.cos(np.deg2rad(azi)) * np.cos(np.deg2rad(ele)), + np.sin(np.deg2rad(azi)) * np.cos(np.deg2rad(ele)), + np.sin(np.deg2rad(ele)), + ] + ) + + idx_azi = np.round(np.abs(90 - np.abs(self.azi))) + idx_ele = 90 - np.round(np.abs(self.ele)) + self.index = ( + idx_azi + 181 * idx_ele + ) # vertices on the median plane have lowest index + + self.is_ghost = is_ghost + + def __str__(self): + str_ = f"a{self.azi}e{self.ele}" + if self.is_ghost: + str_ += "*" + return str_ + + def __lt__(self, other): + return self.index < other.index + + +class EFAP: + """ + EFAP data structure + + Initialise EFAP data for computing panning gains + + + Parameters + ---------- + azimuths : np.ndarray + Azimuth positions of the loudspeaker array + elevations : npndarray + Elevation postions of the loudspeaker array + + Examples + -------- + + >>> from EFAP import EFAP + >>> panner = EFAP([30, -30, 0, 110, -110], [0, 0, 0, 0, 0]]) + >>> panner.pan(15, 45, intensity_panning=False) + array([0.66742381, 0.19069252, 0.66742381, 0.19069252, 0.19069252]) + + """ + + _EFAP_HULL_TOL = 1e-4 # tolerance for a point to be added to the convex hull + _EFAP_MAX_AZI_GAP = 160 # maximum allowed angular gap in the middle layer + _EFAP_POLAR_ELE = 90 # elevation of north / south poles (zenith / nadir) + _EFAP_THRESH_COPLANAR = 1e-3 # tolerance for points to be considered coplanar + _EFAP_THRESH_MID_LAYER = 45 # elevation threshold for loudspeakers to be considered as in the middle layer + _EFAP_THRESH_POLES = 1e-6 # tolerance for a vertex to be considered polar + _EFAP_THRESH_TRI = 1e-10 # tolerance for a point to be inside a triangle + + def __init__( + self, azimuths: Union[list, np.ndarray], elevations: Union[list, np.ndarray] + ): + # validation + azimuths = np.array(azimuths) + elevations = np.array(elevations) + if np.squeeze(azimuths).ndim > 1: + raise ValueError("Too many dimensions for loudspeaker azimuth array") + if np.squeeze(elevations).ndim > 1: + raise ValueError("Too many dimensions for loudspeaker elevations 
array") + if azimuths.shape != elevations.shape: + raise ValueError("Mismatch between loudspeaker azimuths and elevations") + + # initialise vertices and add ghost loudspeakers if needed + self.verts = np.array( + [EfapVertex(azi, ele) for azi, ele in zip(azimuths, elevations)] + ) + self._add_ghost_speakers() + + # formulate initial tetrahedron for the convex hull + self._init_simplex() + + # add the remaining vertices to the convex hull in order of their index + for i in np.argsort(self.verts): + if self.verts[i] not in self.verts[self.tris]: + self._add_vertex_to_hull(i) + + # compute downmix matrix with remapped ghost speakers + self._remap_ghost_speakers() + + # set vertices near poles to have NaN azimuth + for v in self.verts: + if ( + v.ele > self._EFAP_POLAR_ELE - self._EFAP_THRESH_POLES + or v.ele < self._EFAP_THRESH_POLES - self._EFAP_POLAR_ELE + ): + v.azi = np.nan + + # combine triangles into polygons + self._tri2poly() + + def _add_ghost_speakers(self) -> None: + """ + Add ghost loudspeakers at the poles, or to fill large horizontal gaps + """ + ele = [v.ele for v in self.verts] + + # add ghost loudspeakers at the poles if necessary + if max(ele) < self._EFAP_POLAR_ELE: + self.verts = np.append(self.verts, EfapVertex(0, 90, True)) + if min(ele) > -self._EFAP_POLAR_ELE: + self.verts = np.append(self.verts, EfapVertex(0, -90, True)) + + # check for large gaps in the middle horizontal layer + mid_spkrs = [ + v.azi for v in self.verts if np.abs(v.ele) < self._EFAP_THRESH_MID_LAYER + ] + + # no speakers in middle layer; add a triangle of ghost speakers + if not mid_spkrs: + self.verts = np.append( + self.verts, + [ + EfapVertex(0, 0, True), + EfapVertex(180, 0, True), + EfapVertex(240, 0, True), + ], + ) + # only one speaker in the threshold; add two ghost speakers to form a triangle + elif len(mid_spkrs) == 1: + self.verts = np.append( + self.verts, + [ + EfapVertex(mid_spkrs[0] + 120, 0, True), + EfapVertex(mid_spkrs[0] + 240, 0, True), + ], + ) + # search for and fill gaps greater than MAX_AZI_GAP + else: + mid_spkrs = np.sort(mid_spkrs) + angle_diff = np.diff(np.concatenate([mid_spkrs, [mid_spkrs[0] + 360]])) + sectors = np.ceil(angle_diff / self._EFAP_MAX_AZI_GAP) + + for i, s in enumerate(sectors): + if s > 1: + new_diff = angle_diff[i] / s + num_new = s - 1 + for k in range(int(num_new)): + new_azi = mid_spkrs[i] + (k + 1) * new_diff + self.verts = np.append(self.verts, EfapVertex(new_azi, 0, True)) + + def _init_simplex(self) -> None: + """ + Create an initial tetrahedron / simplex for the convex hull from 4 vertices + """ + # take the first vertex as seed + t = [0] + + # attempt to form an edge with non-zero length + for i, v in enumerate(self.verts): + if ( + v.azi != self.verts[t[0]].azi or v.ele != self.verts[t[0]].ele + ) and i not in t: + t.append(i) + break + else: + raise ValueError("Vertices are conincident!") + + # attempt to form a triangle with non-zero area + for i, v in enumerate(self.verts): + if ( + np.linalg.norm( + np.cross( + self.verts[t[1]].pos - self.verts[t[0]].pos, + v.pos - self.verts[t[0]].pos, + ), + 2, + ) + > self._EFAP_HULL_TOL + and i not in t + ): + t.append(i) + break + else: + raise ValueError("Vertices are colinear!") + + # attempt to form a tetrahedron with non-zero volume + for i, v in enumerate(self.verts): + if ( + np.abs( + np.dot( + np.cross( + self.verts[t[1]].pos - self.verts[t[0]].pos, + self.verts[t[2]].pos - self.verts[t[0]].pos, + ), + v.pos - self.verts[t[0]].pos, + ) + ) + ) > self._EFAP_HULL_TOL and i not in t: + t.append(i) 
+ break + else: + raise ValueError("Vertices are coplanar!") + + # create a list of the triangles of the initial simplex / tetrahedron + t = np.array(t) + self.tris = np.array([t[[0, 1, 2]], t[[0, 1, 3]], t[[0, 2, 3]], t[[1, 2, 3]]]) + + # orient the triangle surface planes outwards from the centroid + self.centroid = np.mean([self.verts[i].pos for i in t], axis=0) + for i, tri in enumerate(self.tris): + self.tris[i, :] = self._flip_plane(tri) + + def _add_vertex_to_hull(self, idx_new_vert: int) -> None: + """ + Add a vertex to the convex hull and update the list of triangles in the hull + """ + # compute the centroid of the current convex hull + self.centroid = np.mean( + [self.verts[i].pos for i in np.unique(self.tris)], axis=0 + ) + + tris_new = [] + visible = [] + + # find which hull surfaces are visible from the new vertex + for i, tri in enumerate(self.tris): + if self._vertex_dist(tri, idx_new_vert) > -1e-6: + visible.append(i) + else: + tris_new.append(tri) + + tris_new = np.array(tris_new) + visible = np.array(visible, dtype=int) + + # find edges of the visible hull surfaces + max_vert = np.amax(self.tris[visible]) + 1 + counter = np.zeros([max_vert, max_vert]) + for i, tri in enumerate(self.tris[visible]): + surface = np.append(tri, tri[0]) + for n in range(3): + a = surface[n] + b = surface[n + 1] + counter[a, b] = counter[a, b] + 1 + + counter += counter.T + + edges = [] + for a in range(max_vert - 1): + for b in range(a + 1, max_vert): + if counter[a, b] == 1: + edges.append([a, b]) + edges = np.vstack(edges) + + # break the edges visible from the new vertex and add the new triangle + for e in edges: + tris_new = np.vstack( + [tris_new, self._flip_plane(np.append(e, idx_new_vert))] + ) + + # update the list of triangles in the convex hull + self.tris = tris_new + + def _remap_ghost_speakers(self) -> None: + """ + Remove unused ghost speakers and compute a downmix matrix for the rest + """ + # find ghosts that are not part of the convex hull + ghosts = [i for i, v in enumerate(self.verts) if v.is_ghost] + unused_ghosts = np.compress( + np.isin(ghosts, np.unique(self.tris), invert=True), ghosts + ) + + if unused_ghosts.size > 0: + # remove the unused ghosts from the triangle array and also adjust indices + self.tris[self.tris > unused_ghosts.min()] -= unused_ghosts.size + # delete them from the vertex array + self.verts = np.delete(self.verts, unused_ghosts) + + # generate initial sound energy distribution matrix + n_vtx = len(self.verts) + n_ghost = len(ghosts) - len(unused_ghosts) + + M = np.eye(n_vtx) + for i, v in enumerate(self.verts): + if v.is_ghost: + neighbours = self._get_neighbours(i) + M[:, i] = np.zeros(n_vtx) + M[neighbours, i] = np.ones(len(neighbours)) / len(neighbours) + + # re-distribute sound energy from ghosts + M2 = M.copy() + for i, v in enumerate(self.verts): + if v.is_ghost: + vec = M[:, i] + while np.sum(vec[-n_ghost:]) > 1e-4: + vec = M @ vec + M2[:, i] = vec + + # energy distribution for real LS and amplitude distribution for ghost LS + self.dmx_mat = M2[:-n_ghost, :] + self.dmx_mat[:, :-n_ghost] = np.sqrt(self.dmx_mat[:, :-n_ghost]) + + def _tri2poly(self) -> None: + """ + Merge hull triangles into polygons if they are coplanar + """ + polys = [] + + for tri in self.tris: + # find all vertices coplanar with this triangle (including those already in the triangle) + new_poly = np.array( + [ + i + for i, _ in enumerate(self.verts) + if np.abs(self._vertex_dist(tri, i)) < self._EFAP_THRESH_COPLANAR + ] + ) + + # check if we already found this polygon as a 
complete subset + is_subset = [ + i for i, poly in enumerate(polys) if np.all(np.isin(new_poly, poly)) + ] + is_superset = [ + i for i, poly in enumerate(polys) if np.all(np.isin(poly, new_poly)) + ] + + if is_subset: + continue + elif is_superset: + # remove the other polygon since it will be replaced by the superset polygon + polys_new = [p for i, p in enumerate(polys) if i not in is_superset] + polys = polys_new + + # orient the polygon plane in the same direction as the triangle + P1 = self.verts[tri[0]].pos + P2 = self.verts[tri[1]].pos + P3 = self.verts[tri[2]].pos + + # first base vector + U = P2 - P1 + U = U / np.linalg.norm(U) + + # second base vector + V = P3 - P2 + V = V - np.dot(U, V) * U + V = V / np.linalg.norm(V) + + # center of the first triangle + M = np.mean([P1, P2, P3], axis=0) + + # sort vertices + azi = np.zeros_like(new_poly, dtype=float) + for i, idx_v in enumerate(new_poly): + P = self.verts[idx_v].pos - M + X = np.dot(P, U) + Y = np.dot(P, V) + azi[i] = np.arctan2(Y, X) + + idx = np.argsort(azi) + new_poly = new_poly[idx] + + # add the polygon to the main list + polys.append(new_poly) + + self.polys = polys + + def _pan_EFAP_poly( + self, azimuth: float, elevation: float, poly: np.ndarray, mod: int + ) -> np.ndarray: + """ + Compute panning gains for each vertex in the given polygon + + + Parameters + ---------- + azimuth : float + Azimuth of requested panning position + elevation : float + Elevation of requested panning position + poly : np.ndarray + Array of vertices defining the polygon + + Returns + ------- + poly_gain: np.ndarray + Gains for each vertex in the polygon + """ + poly_gain = np.zeros_like(poly, dtype=float) + + P = np.array([azimuth, elevation]) + # search for the triangle of the polygon in which P belongs + for i in range(1, poly.size + 1): + A = np.array([self.verts[poly[i - 1]].azi, self.verts[poly[i - 1]].ele]) + for j in range(i, poly.size - 2 + i): + idx1 = 1 + (j % poly.size) + idx2 = 1 + (idx1 % poly.size) + B = np.array( + [self.verts[poly[idx1 - 1]].azi, self.verts[poly[idx1 - 1]].ele] + ) + C = np.array( + [self.verts[poly[idx2 - 1]].azi, self.verts[poly[idx2 - 1]].ele] + ) + + if mod: + A[0] %= mod + B[0] %= mod + C[0] %= mod + + if self._in_triangle(P, A, B, C): + N = np.transpose([B[1] - C[1], C[0] - B[0]]) + N = N / np.dot(N, B - A) + poly_gain[i - 1] = 1 - np.dot(P - A, N) + + """ DEBUGGING / TODO """ + # set gains <= -60dB to 0 + poly_gain[np.abs(poly_gain) < 1e-6] = 0 + + return poly_gain + + """ geometric / math helper functions """ + + def _get_neighbours(self, idx_vert: int) -> np.ndarray: + """ + Find triangles containing the given vertex index (neighbouring vertices) + """ + n = self.tris[np.any(np.isin(self.tris, idx_vert), axis=1)] + return np.unique(n[n != idx_vert]) + + def _get_azi_ele(self, idx_vert: int) -> Tuple[float, float]: + """ + Return a tuple of (azi, ele) for a vertex at the given index + """ + return self.verts[idx_vert].azi, self.verts[idx_vert].ele + + def _in_polygon( + self, azimuth: float, elevation: float, poly: np.ndarray + ) -> Tuple[bool, int]: + """ + Determine whether the panning position lies within the given polygon + by iteratively checking its triangles + + Parameters + ---------- + azimuth : float + Azimuth of requested panning position + elevation : float + Elevation of requested panning position + poly : np.ndarray + Array of vertices defining the polygon + + Returns + ------- + in_polygon, mod: Tuple[bool, int] + Flag indicating whether the point is inside the given polygon + Value of 
wrapping required if used + """ + azi = [self.verts[v].azi for v in poly] + + P = np.array([azimuth, elevation]) + + for tri in combinations(poly, 3): + A = np.array(self._get_azi_ele(tri[0])) + B = np.array(self._get_azi_ele(tri[1])) + C = np.array(self._get_azi_ele(tri[2])) + if self._in_triangle(P, A, B, C): + return True, None + + # if the azimuth difference is large, perform the 2D check again with azimuths wrapped to (-360, 0] and [0, 360) + # RuntimeWarning due to NaNs can be safely ignored, _in_triangle() accounts for them + if np.nanmax(azi) - np.nanmin(azi) > 180: + for tri in combinations(poly, 3): + A = np.array(self._get_azi_ele(tri[0])) + B = np.array(self._get_azi_ele(tri[1])) + C = np.array(self._get_azi_ele(tri[2])) + A[0] %= 360 + B[0] %= 360 + C[0] %= 360 + if self._in_triangle(P, A, B, C): + return True, 360 + + for tri in combinations(poly, 3): + A = np.array(self._get_azi_ele(tri[0])) + B = np.array(self._get_azi_ele(tri[1])) + C = np.array(self._get_azi_ele(tri[2])) + A[0] %= -360 + B[0] %= -360 + C[0] %= -360 + if self._in_triangle(P, A, B, C): + return True, -360 + + return False, None + + def _in_triangle( + self, P: np.ndarray, A: np.ndarray, B: np.ndarray, C: np.ndarray + ) -> bool: + """ + Determine whether the panning position lies within the given triangle + + Parameters + ---------- + P : float + Point under test + A : float + First vertex of the triangle + B : float + Second vertex of the triangle + C : float + Third vertex of the triangle + + + Returns + ------- + bool + Flag indicating whether the point is inside the given triangle + """ + if np.isnan(A[0]): + A[0] = P[0] + + if np.isnan(B[0]): + B[0] = P[0] + + if np.isnan(C[0]): + C[0] = P[0] + + tmpMat = np.transpose([B - A, C - A]) + if (1 / np.linalg.cond(tmpMat)) < self._EFAP_THRESH_TRI: + return False + + Minv = np.linalg.inv(tmpMat) + S = Minv @ (P - A) + + if ( + S[0] < -self._EFAP_THRESH_TRI + or S[1] < -self._EFAP_THRESH_TRI + or S[0] + S[1] > 1 + self._EFAP_THRESH_TRI + ): + return False + + return True + + def _vertex_dist(self, surface: np.ndarray, idx_vert: int) -> float: + """ + Compute the distance of a vertex from a given plane + + Parameters + ---------- + surface : np.ndarray + Array of 3 ordered vertices defining the plane and its orientation + idx_vert: int + Index of the vertex to compute the distance for + + Returns + ------- + float + Distance of the vertex from the given plane + """ + return self._point_plane_dist( + self.verts[surface[0]].pos, + self.verts[surface[1]].pos, + self.verts[surface[2]].pos, + self.verts[idx_vert].pos, + ) + + def _point_plane_dist( + self, P1: np.ndarray, P2: np.ndarray, P3: np.ndarray, X: np.ndarray + ) -> float: + """ + Compute the distance of a vertex from a plane defined by three points + + Parameters + ---------- + P1 : np.ndarray + Cartesian coordinates of the first point + P2 : np.ndarray + Cartesian coordinates of the second point + P3 : np.ndarray + Cartesian coordinates of the third point + X: np.ndarray + Cartesian coordinates of the vertex + + Returns + ------- + float + Distance of the vertex from the given plane + """ + + if np.all(X == P1) or np.all(X == P2) or np.all(X == P3): + return 0 + else: + N = np.cross(P1 - P2, P1 - P3) + return np.dot(X - P1, N / np.linalg.norm(N)) + + def _flip_plane(self, surface: np.ndarray) -> np.ndarray: + """ + Flip the orientation of a plane (invert normal vector) + + Parameters + ---------- + surface : np.ndarray + Array of 3 ordered vertices defining the plane and its orientation + + Returns + 
------- + surface : np.ndarray + Reordered vertices with plane normal pointing outwards from the hull centroid + """ + if ( + self._point_plane_dist( + self.verts[surface[0]].pos, + self.verts[surface[1]].pos, + self.verts[surface[2]].pos, + self.centroid, + ) + > 0 + ): + surface = np.flip(surface.copy()) + + return surface + + def _compute_gains_point( + self, azimuth: float, elevation: float, intensity_panning: bool = False + ) -> np.ndarray: + """ + Compute gains for the requested panning position + + + Parameters + ---------- + azimuth : float + Azimuth of requested panning position + elevation : float + Elevation of requested panning position + intensity_panning : bool + Flag whether to use intensity panning (Default is False == amplitude panning) + + Returns + ------- + gains: np.ndarray + Panning gains for the loudspeaker layout + """ + if np.isnan(azimuth) or np.isnan(elevation): + raise ValueError(f"Angles cannot be NaNs : ({azimuth}, {elevation})") + + azimuth, elevation = wrap_angles(azimuth, elevation) + + # find the polygon corresponding to the given point + found_poly = None + mod = None + for poly in self.polys: + in_poly, mod = self._in_polygon(azimuth, elevation, poly) + if in_poly: + found_poly = poly + break + else: + raise AssertionError("Unexpected error during panning") + + # compute gains for the polygon vertices + poly_gain = self._pan_EFAP_poly(azimuth, elevation, found_poly, mod) + + # downmix ghost loudspeakers + gains = np.zeros(self.verts.size) + gains[found_poly] = poly_gain / np.linalg.norm(poly_gain) + gains = gains @ self.dmx_mat.T + gains = gains / np.linalg.norm(gains) + + if intensity_panning: + gains = np.sqrt(gains / np.sum(gains)) + + return gains + + """ public functions """ + + def pan( + self, azimuths: float, elevations: float, intensity_panning: bool = False + ) -> np.ndarray: + """ + Compute gains for the requested panning position + + + Parameters + ---------- + azimuth : float + Azimuth of requested panning position + elevation : float + Elevation of requested panning position + intensity_panning : bool + Flag whether to use intensity panning (Default is False == amplitude panning) + + Returns + ------- + gains: np.ndarray + Panning gains for the loudspeaker layout + """ + azimuths = np.array(azimuths) + elevations = np.array(elevations) + if azimuths.size == 1 and elevations.size == 1: + return self._compute_gains_point(azimuths, elevations, intensity_panning) + elif np.squeeze(azimuths).ndim == 1 and np.squeeze(elevations).ndim == 1: + gains = [] + for a, e in zip(azimuths, elevations): + gains.append(self._compute_gains_point(a, e, intensity_panning)) + return np.vstack(gains) + else: + raise ValueError( + "Azimuth and Elevation arrays cannot have more than one dimension and must be of equal size" + ) + + +def main(args): + """ + Parses a speaker layout text file and prints the panning gains + for the requested position + + + Parameters + ---------- + args : tuple + Command line arguments + + """ + + speaker_positions = np.loadtxt( + os.path.abspath(args.input), delimiter=",", max_rows=2 + ) + panner = EFAP(speaker_positions[0, :], speaker_positions[1, :]) + print(panner.pan(args.azimuth, args.elevation, args.efip)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Edge-Fading Amplitude Panning") + parser.add_argument( + "-i", + "--input", + metavar="layout_file", + required=True, + type=str, + help="IVAS compatible loudspeaker layout file (Loudspeaker azimuths in first line, elevations in second, subsequent 
lines are ignored)", + ) + parser.add_argument( + "-efip", + "-intensity_panning", + default=False, + action="store_true", + help="Intensity panning mode (EFIP)", + ) + parser.add_argument( + "azimuth", + type=float, + help="Azimuth of direction to compute panning gains for (positive-left)", + ) + parser.add_argument( + "elevation", + type=float, + help="Elevation of direction to compute panning gains for (positive-up)", + ) + args = parser.parse_args() + main(args) diff --git a/scripts/pyaudio3dtools/__init__.py b/scripts/pyaudio3dtools/__init__.py index 553db5ff99..fa204bae82 100644 --- a/scripts/pyaudio3dtools/__init__.py +++ b/scripts/pyaudio3dtools/__init__.py @@ -45,8 +45,8 @@ from . import ( audiofile, binauralrenderer, hoadecoder, - prerenderer, spatialaudioconvert, spatialaudioformat, spatialmetadata, ) +from .EFAP import EFAP diff --git a/scripts/pyaudio3dtools/audio3dtools.py b/scripts/pyaudio3dtools/audio3dtools.py index ecdc317e98..99ae71351b 100644 --- a/scripts/pyaudio3dtools/audio3dtools.py +++ b/scripts/pyaudio3dtools/audio3dtools.py @@ -84,28 +84,36 @@ def main(): "--outformat", type=str, metavar="OUTFORMAT", - help="Output format (default = None, same as input format)", + help="Output format (default = %(default)s, same as input format)", default=None, ) parser.add_argument( "-s", "--infs", type=int, - help="Input sampling rate (Hz) (default = None, deduced for input file)", + help="Input sampling rate (Hz) (default = %(default)s, deduced for input file)", default=None, ) parser.add_argument( "-S", "--outfs", type=int, - help="Output sampling rate (Hz) (default = None, same as input)", + help="Output sampling rate (Hz) (default = %(default)s, same as input)", default=None, ) parser.add_argument( "-c", "--inchan", type=int, - help="Input number of channels (default = None, deduced for input file)", + help="Input number of channels (default = %(default)s, deduced for input file)", + default=None, + ) + parser.add_argument( + "-m", + "--metadata", + type=str, + nargs="+", + help="list of input metadata files (only relevant for ISM and MASA input)", default=None, ) parser.add_argument( @@ -119,14 +127,14 @@ def main(): "-fc", "--outfc", type=int, - help="Cut-off freq for eventual low-pass filtering (default = None)", + help="Cut-off freq for eventual low-pass filtering (default = %(default)s)", default=None, ) parser.add_argument( "-T", "--trajectory", type=str, - help="Head-tracking trajectory file (default = None)", + help="Head-tracking trajectory file (default = %(default)s)", default=None, ) parser.add_argument( @@ -134,20 +142,20 @@ def main(): "--normalize", default=None, type=int, - help="Normalize to given loudness with --LOUDNESS_TOOL (default = None)", + help="Normalize to given loudness with --LOUDNESS_TOOL (default = %(default)s)", ) """ Miscellaneous or meta arguments """ parser.add_argument( "-b", "--binaural", - help="Binauralize output in addition to converting to output format", + help="Binauralize output *in addition to converting to output format", action="store_true", ) parser.add_argument( "--binaural_dataset", type=str, - help="Dataset to use for binaural rendering", + help="Dataset to use for binaural rendering (default = %(default)s)", choices=["orange51", "orange52", "orange53", "orange54", "sadie"], default="orange53", ) @@ -234,12 +242,13 @@ def main(): outfile = outfile.replace(input_ext, ".out.wav") outfile = os.path.join(outdir, outfile) - out_format, out_fs = spatialaudioconvert.spatial_audio_convert( + 
spatialaudioconvert.spatial_audio_convert( infile, outfile, in_format=args.informat, in_fs=args.infs, in_nchans=args.inchan, + in_meta_files=args.metadata, in_ls_layout_file=args.layoutfile, out_format=args.outformat, out_fs=args.outfs, @@ -253,22 +262,28 @@ def main(): logger.info(f" Output {outfile}") if args.binaural: - if args.outformat.find("BINAURAL") > -1: + if args.outformat.startswith("BINAURAL"): raise SystemExit( "BINAURAL output format can not be binauralized again!" ) - out_format_config = spatialaudioformat.Format(in_format=out_format) - out_sig, out_fs = audiofile.readfile( - outfile, nchannels=out_format_config.nchannels, fs=out_fs - ) - bin_sig = binauralrenderer.binaural_rendering( - out_sig, out_format_config.name, fs=out_fs - ) _, output_ext = os.path.splitext(os.path.basename(outfile)) - output_binaural_wav = outfile.replace(output_ext, "_BINAURAL.wav") - audiofile.writefile(output_binaural_wav, bin_sig, out_fs) - logger.info(f" Output binaural {output_binaural_wav}") + outfile_bin = outfile.replace(output_ext, "_BINAURAL.wav") + logger.info(f" Output binaural {outfile_bin}") + + spatialaudioconvert.spatial_audio_convert( + in_file=outfile, + out_file=outfile_bin, + in_format=args.outformat, + in_fs=args.outfs, + in_meta_files=args.metadata, + in_ls_layout_file=args.layoutfile, + out_format="BINAURAL", + output_loudness=args.normalize, + loudness_tool=args.loudness_tool, + trajectory=args.trajectory, + binaural_dataset=args.binaural_dataset, + ) outfile = None else: diff --git a/scripts/pyaudio3dtools/audioarray.py b/scripts/pyaudio3dtools/audioarray.py index 5b26edf0f1..16569e1ec7 100644 --- a/scripts/pyaudio3dtools/audioarray.py +++ b/scripts/pyaudio3dtools/audioarray.py @@ -89,7 +89,7 @@ def convert( if out_nchans < in_nchans: y = y[:, 0:out_nchans] elif out_nchans > in_nchans: - y = np.append(y, np.zeros((y.shape[0], out_nchans - in_nchans)), axis=1) + y = np.append(y, np.zeros([y.shape[0], out_nchans - in_nchans]), axis=1) # adjust sampling rate y = resample(y, in_fs, out_fs) @@ -214,7 +214,7 @@ def cut(x: np.ndarray, limits: Tuple[int, int]) -> np.ndarray: if last_sample > in_samples: signal_end = in_samples insert_end = insert_end - last_sample + in_samples - y = np.zeros((total_samples, in_channels), dtype=x.dtype) + y = np.zeros([total_samples, in_channels], dtype=x.dtype) y[insert_start:insert_end, :] = x[signal_start:signal_end, :] return y @@ -313,7 +313,7 @@ def getdelay(x: np.ndarray, y: np.ndarray) -> int: if n_chan_x != n_chan_y: raise ValueError lags = np.arange(-n_samples_x + 1, n_samples_y) - lag = np.zeros((n_chan_x, 1), dtype=int) + lag = np.zeros([n_chan_x, 1], dtype=int) for chan in range(n_chan_x): correlation = sig.correlate(y[:, chan], x[:, chan], mode="full") lag[chan] = lags[np.argmax(correlation)] @@ -389,7 +389,7 @@ def limiter(x: np.ndarray, fs: int): fac = attack_constant ** (np.arange(1, framesize + 1, dtype=np.float32)) else: release_constant = 0.01 ** ( - 1.0 / (0.005 * (200.0 ** release_heuristic) * fs) + 1.0 / (0.005 * (200.0**release_heuristic) * fs) ) fac = release_constant ** ( np.arange(1, framesize + 1, dtype=np.float32) @@ -407,3 +407,26 @@ def limiter(x: np.ndarray, fs: int): fr_sig[idx_max] = 32767 idx_min = np.where(fr_sig < -32768) fr_sig[idx_min] = -32768 + + +def get_framewise(x: np.ndarray, chunk_size: int) -> np.ndarray: + """Generator to yield a signal frame by frame + If array size is not a multiple of chunk_size, last frame contains the remainder + + Parameters + ---------- + x: numpy array + Input reference array + 
chunk_size: int + Size of frames to yield + + Yields + ------- + frame : np.ndarray + One frame of the input audio signal + """ + n_frames = x.shape[0] // chunk_size + for i in range(n_frames): + yield x[i * chunk_size : (i + 1) * chunk_size, :] + if x.shape[0] % chunk_size: + yield x[n_frames * chunk_size :, :] diff --git a/scripts/pyaudio3dtools/audiofile.py b/scripts/pyaudio3dtools/audiofile.py index 8a87ea7523..d01d078245 100755 --- a/scripts/pyaudio3dtools/audiofile.py +++ b/scripts/pyaudio3dtools/audiofile.py @@ -172,7 +172,12 @@ def convertfile( print(f"Input file: {in_file}, sampling rate {str(in_fs)} size {str(x.shape)}") # Process - if in_file == out_file and in_nchans == out_nchans and in_fs == out_fs and in_len_samples == out_len_samples: + if ( + in_file == out_file + and in_nchans == out_nchans + and in_fs == out_fs + and in_len_samples == out_len_samples + ): if verbose: print("Convert file: nothing to be done") else: @@ -231,8 +236,8 @@ def concatenatefiles( x, in_fs = readfile(in_file, fs=in_fs) # pad with silence - pre = np.zeros((pad_pre, x.shape[1])) - post = np.zeros((pad_post, x.shape[1])) + pre = np.zeros([pad_pre, x.shape[1]]) + post = np.zeros([pad_post, x.shape[1]]) x = np.concatenate([pre, x, post]) if y is None: @@ -290,9 +295,7 @@ def combinefiles( x = x[: y.shape[0], :] elif y.shape[0] > x.shape[0]: y = y[: x.shape[0], :] - # x = np.append(x, np.zeros( - # (y.shape[0]-x.shape[0], x.shape[1])), axis=0) - y = np.column_stack((y, x)) + y = np.column_stack([y, x]) y = audioarray.resample(y, in_fs, out_fs) @@ -490,7 +493,7 @@ def loudnessinfo( Parameters ---------- - in_sig: str + in_sig: np.ndarray Input audio signal in_fs: Optional[int] Input sampling rate @@ -517,32 +520,25 @@ def loudnessinfo( null_file = "/dev/null" # check for binary - loudness_tool = shutil.which(loudness_tool) - if loudness_tool is None: + if shutil.which(loudness_tool) is None: raise FileNotFoundError(f"The binary {loudness_tool} was not found in path!") - in_format_config = spatialaudioformat.Format( + in_spfmt = spatialaudioformat.Format( in_format=in_format, ls_layout_file=in_ls_layout_file ) - if not ( - in_format_config.isheadphones - or in_format_config.isloudspeaker - or in_format_config.ambi_order > 1 - ): + if not (in_spfmt.isheadphones or in_spfmt.isloudspeaker or in_spfmt.ambi_order > 1): raise NotImplementedError( - f"{in_format_config.name} is currently unsupported with {loudness_tool}." + f"{in_spfmt.name} is currently unsupported with {loudness_tool}." ) - if in_sig.shape[1] != in_format_config.nchannels: + if in_sig.shape[1] != in_spfmt.nchannels: raise ValueError( f"Mismatch in number of channels in signal of shape {in_sig.shape} of spatial audio format {in_format}!" ) with TemporaryDirectory() as tmp_dir: - tmp_file = os.path.join( - tmp_dir, "tmp.pcm" - ) # TODO might interfere with multiprocessing however each instance would probably create its own tmp dir? 
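+        # NOTE: each call creates its own TemporaryDirectory, so concurrent instances use distinct temporary files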
+ tmp_file = os.path.join(tmp_dir, "tmp.pcm") if "bs1770demo" in loudness_tool: """ @@ -554,7 +550,7 @@ def loudnessinfo( cmd = [ loudness_tool, "-nchan", - str(in_format_config.nchannels), # input nchan + str(in_spfmt.nchannels), # input nchan "-lev", str(output_loudness), # level "-conf", @@ -562,19 +558,19 @@ def loudnessinfo( tmp_file, null_file, ] - if in_format_config.ambi_order > 0 or in_format_config.name == "MONO": + if in_spfmt.ambi_order > 0 or in_spfmt.name == "MONO": cmd[2] = "1" # -nchan cmd[6] = "0" # -conf - if in_format_config.isheadphones: + if in_spfmt.isheadphones: cmd[2] = "2" # -nchan cmd[6] = "00" # -conf - elif in_format_config.isloudspeaker: + elif in_spfmt.isloudspeaker: # if loudspeaker position fulfills the criteria, set the config string to 1 for that index conf_str = [ str(int(abs(e) < 30 and (abs(a) >= 60 and abs(a) <= 120))) - for a, e in zip(in_format_config.ls_azi, in_format_config.ls_ele) + for a, e in zip(in_spfmt.ls_azi, in_spfmt.ls_ele) ] - for lfe in in_format_config.lfe_index: + for lfe in in_spfmt.lfe_index: conf_str[lfe] = "L" cmd[6] = "".join(conf_str) @@ -583,7 +579,7 @@ def loudnessinfo( """ ITU-T P.56 """ - if not (in_format_config.ambi_order > 0 or in_format_config.name == "MONO"): + if not (in_spfmt.ambi_order > 0 or in_spfmt.name == "MONO"): raise ValueError( f"{in_format} is currently unsupported with {loudness_tool}" ) @@ -610,11 +606,11 @@ def loudnessinfo( ) # write temporary file - if in_format_config.ambi_order > 0 or in_format_config.name == "MONO": + if in_spfmt.ambi_order > 0 or in_spfmt.name == "MONO": writefile(tmp_file, in_sig[:, 0], in_fs) - elif in_format_config.isheadphones: + elif in_spfmt.isheadphones: writefile(tmp_file, in_sig[:, :2], in_fs) - elif in_format_config.isloudspeaker: + elif in_spfmt.isloudspeaker: writefile(tmp_file, in_sig, in_fs) # run command @@ -630,18 +626,21 @@ def loudnessinfo( measured_loudness = float(result.stdout.splitlines()[3].split(":")[1]) scale_factor = float(result.stdout.splitlines()[-3].split(":")[1]) elif "sv56demo" in loudness_tool: - measured_loudness = float( - result.stdout.splitlines()[14] - .replace("Active speech level: ..........", "") - .replace("[dBov]", "") - .strip() - ) - scale_factor = float( - result.stdout.splitlines()[6] - .replace("Norm factor desired is: .......", "") - .replace("[times]", "") - .strip() - ) + try: + measured_loudness = float( + result.stdout.splitlines()[14] + .replace("Active speech level: ..........", "") + .replace("[dBov]", "") + .strip() + ) + scale_factor = float( + result.stdout.splitlines()[6] + .replace("Norm factor desired is: .......", "") + .replace("[times]", "") + .strip() + ) + except Exception: + raise ValueError(f"Error parsing sv56demo output!\n{result.stdout}") else: raise ValueError(f"Unsupported tool {loudness_tool}") diff --git a/scripts/pyaudio3dtools/binauralRef.py b/scripts/pyaudio3dtools/binauralRef.py deleted file mode 100644 index da5adb6785..0000000000 --- a/scripts/pyaudio3dtools/binauralRef.py +++ /dev/null @@ -1,342 +0,0 @@ -#!/usr/bin/env python3 - -""" - (C) 2022 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB, - Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., - Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, - Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other - contributors to this repository. All Rights Reserved. 
- - This software is protected by copyright law and by international treaties. - The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB, - Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD., - Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange, - Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other - contributors to this repository retain full ownership rights in their respective contributions in - the software. This notice grants no license of any kind, including but not limited to patent - license, nor is any license granted by implication, estoppel or otherwise. - - Contributors are required to enter into the IVAS codec Public Collaboration agreement before making - contributions. - - This software is provided "AS IS", without any express or implied warranties. The software is in the - development stage. It is intended exclusively for experts who have experience with such software and - solely for the purpose of inspection. All implied warranties of non-infringement, merchantability - and fitness for a particular purpose are hereby disclaimed and excluded. - - Any dispute, controversy or claim arising under or in relation to providing this software shall be - submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in - accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and - the United Nations Convention on Contracts on the International Sales of Goods. -""" - -import logging -import os -from typing import Tuple - -import numpy as np -import scipy.interpolate as interp -import scipy.io as sio -import scipy.signal as sig - -import timeit - - -main_logger = logging.getLogger("__main__") -logger = main_logger.getChild(__name__) -logger.setLevel(logging.DEBUG) - - -def binaural_fftconv_framewise( - x: np.ndarray, - IR: np.array, - SourcePosition: np.array, - azi: np.array = None, - ele: np.array = None, - frame_len: int = 240, - interp_method="linear", - verbose=True, -) -> np.ndarray: - """Binauralization using fft convolution with frame-wise processing - supports rotation on trajectories with interpolation between measured Source - positions, reimplemented roughly along the lines of ConvBinauralRenderer.m - - Parameters - ---------- - x: np.array - input multi-channel array - IR: np.array - HRIRs array - SourcePosition: np.array - positions of the source in the measurements in IR - azi: np.array - azimuth angles for all frames - ele: np.array - elevation angles for all frames - frame_len: int - frame length, optional, default = 240 - interp_method: - interpolation method, optional, default = linear - - - Returns - ------- - y: np.ndarray - output binaural signal array - - """ - - sig_len = x.shape[0] - frame_len = 240 - N_frames = np.int(sig_len / frame_len) - - N_HRIR_taps = IR.shape[2] - - if azi is None or ele is None: - azi = np.repeat([0.0], N_frames) - ele = np.repeat([0.0], N_frames) - - iGs = np.zeros([N_frames], dtype=np.int) - mGs = np.zeros([N_frames], dtype=np.int) - - # store trajectory as a sequence of indices of source positions - # on the HRTF database in a compressed format such that, for - # each new measurement point the trajectory hits, the sample index - # is stored in mGs and the index of the measurement in iG - # the number of measurement points hit by the trajectory is nsp - isp = 0 - iGs[0] = 
FindFilter(SourcePosition, azi[0], ele[0]) - mGs[0] = 0 - for i_frame in range(1, N_frames): - iG = FindFilter(SourcePosition, azi[i_frame], ele[i_frame]) - if iG != iGs[isp]: - isp = isp + 1 - iGs[isp] = iG - mGs[isp] = i_frame * frame_len + 1 - nsp = isp + 1 - - # set last fence post explicitly - if mGs[nsp] < sig_len: - iGs[nsp] = iG - mGs[nsp] = sig_len - nsp = nsp + 1 - - T_rev = frame_len + N_HRIR_taps - 1 - N_rev = np.int(np.ceil(T_rev / frame_len)) - print(" N_rev = ", N_rev) - - fastcode = True - if N_rev > 5: - if verbose: - print( - " __ __ ___ ___ _ _ ___ _ _ ___ " - ) - print( - " \ \ / / / \ | _ \ | \| | |_ _| | \| | / __|" - ) - print( - " \ \/\/ / | - | | / | . | | | | . | | (_ |" - ) - print( - " \_/\_/ |_|_| |_|_\ |_|\_| |___| |_|\_| \___|" - ) - print( - " " - ) - print( - " You are using very long filters! This will be slooooow and use a lot of memory!" - ) - else: - fastcode = False - - if fastcode and verbose: - print( - " __ __ ___ ___ _ _ ___ _ _ ___ " - ) - print( - " \ \ / / / \ | _ \ | \| | |_ _| | \| | / __|" - ) - print( - " \ \/\/ / | - | | / | . | | | | . | | (_ |" - ) - print( - " \_/\_/ |_|_| |_|_\ |_|\_| |___| |_|\_| \___|" - ) - print( - " " - ) - print( - " To speed up the otherwise extremely slow calculation, we only calculate the " - ) - print( - " responses of the latest frame with the latest filters instead of the full " - ) - print( - " integrals. This is much faster but much more prone to clicks. Inspect your " - ) - print( - " output signals carefully! To change this behavior, go to binauralrenderer.py " - ) - print( - " and set fastcode to False. " - ) - - y = np.zeros((sig_len + T_rev, 2)) - y0 = np.zeros((N_rev, sig_len + T_rev, 2)) - - b = np.linspace(0.0, 1.0, frame_len) - a = 1.0 - b - - for i_ear in [0, 1]: - - Gs = IR[ - iGs[0:nsp], i_ear, : - ] # Green's function along the trajectory sampled by the measurement points - interp_G = interp.interp1d( - mGs[0:nsp], Gs, kind=interp_method, axis=0 - ) # interpolator for Green's function between those points - - G = interp_G(np.array(range(0, sig_len, frame_len))) - - t0 = timeit.default_timer() - - if fastcode: - for i_frame in range(N_frames): - - i1 = i_frame * frame_len - i2 = (i_frame + 1) * frame_len - i2p = i1 + T_rev - - a = np.linspace(0.0, 1.0, T_rev) - b = 1.0 - a - - for j_frame in [0, 1]: - G_n_m = G[min(j_frame + i_frame, N_frames - 1), :] - y0[j_frame, i1:i2p, i_ear] = sig.oaconvolve(x[i1:i2], G_n_m) - - y[i1:i2p, i_ear] += a * y0[0, i1:i2p, i_ear] + b * y0[1, i1:i2p, i_ear] - - t1 = timeit.default_timer() - fps = (i_frame + 1) / (t1 - t0) - eta = (2 * N_frames - (i_frame + 1) + i_ear * N_frames) / fps - - print( - " Frame {}/{} on ear {}/2 done at {: 3.1f} fps, ETA {: 6.0f} s ".format( - i_frame + 1, N_frames, i_ear + 1, fps, eta - ), - end="\r", - ) - - else: - for i_frame in range(N_frames): - - i1 = i_frame * frame_len - i2 = (i_frame + 1) * frame_len - i2p = i1 + T_rev - - y0[:] = 0.0 - for j_frame in range( - max(0, i_frame - N_rev), min(i_frame + 1, N_frames) - ): - - j1 = j_frame * frame_len - j2 = (j_frame + 1) * frame_len - j2p = j1 + T_rev - - G0 = G[i_frame] - G1 = G[min(i_frame + 1, N_frames - 1)] - - y0[0, j1:j2p, i_ear] += sig.oaconvolve(x[j1:j2], G0) - y0[1, j1:j2p, i_ear] += sig.oaconvolve(x[j1:j2], G1) - - y[i1:i2, i_ear] = a * y0[0, i1:i2, i_ear] + b * y0[1, i1:i2, i_ear] - - t1 = timeit.default_timer() - fps = (i_frame + 1) / (t1 - t0) - eta = (2 * N_frames - (i_frame + 1) + i_ear * N_frames) / fps - - print( - " Frame {}/{} on ear {}/2 done at {: 3.1f} fps, ETA 
{: 6.0f} s ".format( - i_frame + 1, N_frames, i_ear + 1, fps, eta - ), - end="\r", - ) - - print("") - - return y[0:sig_len] - - -def read_hrirs_from_mat_full( - hrirs_path: str = "/HRIRs_mat/SADIE_II_D2_48K_24bit_256tap_full.mat", -) -> Tuple[np.ndarray, np.ndarray]: - """Read HRIRs from Matlab dictionary file that contains - complete databases and includes the SourcePositions - - Parameters - ---------- - hrirs_path: str - HRTFs file name (.mat) - - Returns - ------- - hrirs: np.ndarray - IRs read from file with source positions in full format - SourcePositions: np.ndarray - Source positions corresponding to the IRs - - """ - script_path = os.path.dirname(os.path.abspath(__file__)) - hrirs_filename = script_path + hrirs_path - - mat_contents = sio.loadmat(hrirs_filename) - hrirs = mat_contents["IR"] - - SourcePositions = mat_contents["SourcePosition"] - logger.debug( - f"Loaded HRIRs: {hrirs_filename}, {hrirs.shape[0]} by {hrirs.shape[1]} (full DB)" - ) - - return hrirs, SourcePositions - - -def FindFilter(SourcePosition: np.array, azi: float, ele: float) -> int: - """Find measurement closest to the selected direction, - reimplemented roughly along the lines of ConvBinauralRenderer.m - - Parameters - ---------- - SourcePositions: np.ndarray - Source IR positions - azi: float - desired response azimuth - ele: float - desired response elevation - - Returns - ------- - i_dir: int - index of nearest SourcePosition - """ - if azi < 0: - azi = azi + 360.0 - - if ele < 0: - ele = ele + 360.0 - - delta_azi = np.deg2rad(np.abs(azi - SourcePosition[:, 0])) - dist = np.arccos( - np.sin(np.deg2rad(SourcePosition[:, 2])) * np.sin(np.deg2rad(ele)) - + np.cos(np.deg2rad(SourcePosition[:, 1])) - * np.cos(np.deg2rad(ele)) - * np.cos(delta_azi) - ) - - i_dir = np.argmin(dist) - - # print('Direction closest to {}, {} is {} with angles {}, {} and distance {}\n'.format( - # azi, ele, i_dir, SourcePosition[i_dir,0], SourcePosition[i_dir,1], dist[i_dir] - # ) - # ) - - return i_dir diff --git a/scripts/pyaudio3dtools/binauralrenderer.py b/scripts/pyaudio3dtools/binauralrenderer.py index 1040269970..ca56618a01 100644 --- a/scripts/pyaudio3dtools/binauralrenderer.py +++ b/scripts/pyaudio3dtools/binauralrenderer.py @@ -32,27 +32,29 @@ import logging import os +import timeit from typing import Tuple import numpy as np +import scipy.interpolate as interp import scipy.io as sio import scipy.signal as sig from pyaudio3dtools import audioarray, spatialaudioformat -from pyaudio3dtools.binauralRef import ( - binaural_fftconv_framewise, - read_hrirs_from_mat_full, -) +from pyaudio3dtools.constants import * from pyaudio3dtools.rotateHOA import rotateHOA from pyaudio3dtools.rotateISM import rotateISM +from pyaudio3dtools.rotateMC import rotateMC main_logger = logging.getLogger("__main__") logger = main_logger.getChild(__name__) logger.setLevel(logging.DEBUG) +"""" Helper functions """ + def read_hrirs_from_mat( - hrirs_path: str = "/HRIRs_mat/SADIE_II_D2_48K_24bit_256tap_CICP19.mat", + hrirs_path: str = "/HRIRs_mat/ORANGE_HRIR_53_48000_combined.mat", ) -> np.ndarray: """Read HRIRs from Matlab dictionary file mat @@ -63,21 +65,146 @@ def read_hrirs_from_mat( Returns ------- - hrirs: np.ndarray + IR: np.ndarray array of impulse responses + SourcePosition: np.ndarray + array of source positions corresponding to the impulse responses """ script_path = os.path.dirname(os.path.abspath(__file__)) hrirs_filename = script_path + hrirs_path mat_contents = sio.loadmat(hrirs_filename) - hrirs = mat_contents["IR"] + IR = 
mat_contents["IR"]
+    try:
+        SourcePosition = mat_contents["SourcePosition"]
+    except KeyError:
+        SourcePosition = None
+
+    logger.debug(f"Loaded HRIRs: {hrirs_filename}, {IR.shape[0]} by {IR.shape[1]}")
+
+    return IR, SourcePosition
+
+
+def get_IR(
+    in_format: spatialaudioformat.Format,
+    out_format: spatialaudioformat.Format,
+    dataset: str,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """get_IR
+
+    Parameters
+    ----------
+    in_format: spatialaudioformat.Format
+        input spatial audio format
+    out_format: spatialaudioformat.Format
+        output spatial audio format
+    dataset: str
+        name of the HRIRs or BRIRs dataset
+
+    Returns
+    -------
+    IR: np.ndarray
+        desired impulse response array
+    SourcePosition: np.ndarray
+        source positions of corresponding IRs
+
+    """
+    # override for BRIRs, currently only one option
+    if out_format.name == "BINAURAL_ROOM":
+        dataset = "mozart_iis"
+
+    # dataset file prefix
+    if dataset.lower().startswith("sadie"):
+        prefix = "/HRIRs_mat/SADIE_II_D2_48K_24bit_256tap"
+    elif dataset.lower().startswith("orange"):
+        prefix = f"/HRIRs_mat/ORANGE_HRIR_{dataset.replace('_full', '')[-2:]}_48000"
+    elif dataset.lower().startswith("mozart"):
+        prefix = "/BRIRs_mat/IIS_BRIR_officialMPEG_222UC"
+    else:
+        raise ValueError(f"Unsupported dataset '{dataset}' for HRIRs")
+
+    # dataset file suffix
+    if in_format.name.startswith("ISM") or in_format.name.startswith("CUSTOM_LS"):
+        suffix = "full.mat"
+    elif in_format.isloudspeaker and in_format.nchannels > 1:
+        suffix = "combined.mat"
+    elif in_format.ambi_order > 0 or in_format.name.upper() == "MONO":
+        suffix = "SBA3.mat"
+    else:
+        raise ValueError(
+            f"Unsupported format '{in_format.name}' for dataset '{dataset}' for HRIRs"
+        )
+
+    IR, SourcePosition = read_hrirs_from_mat("_".join([prefix, suffix]))
+
+    if in_format.name.startswith("MONO"):
+        IR = IR[:, :, :1]  # use omni/W from SBA
+    elif in_format.name.startswith("STEREO"):
+        IR = IR[:, :, :2]  # use L and R channels
+    elif in_format.isloudspeaker and not in_format.name.startswith("CUSTOM_LS"):
+        # extract positions from the combined file
+        tmpformat = spatialaudioformat.Format("COMBINED")
+        IR_tmp = IR.copy()
+        IR = np.zeros([IR_tmp.shape[0], IR_tmp.shape[1], in_format.nchannels])
+
+        ir_index = 0
+        for i in range(tmpformat.nchannels):
+            for j in range(in_format.nchannels):
+                if (
+                    tmpformat.ls_azi[i] == in_format.ls_azi[j]
+                    and tmpformat.ls_ele[i] == in_format.ls_ele[j]
+                ):
+                    if j != in_format.lfe_index[0]:
+                        IR[:, :, ir_index] = IR_tmp[:, :, i]
+                        ir_index += 1
+
+    return IR, SourcePosition
+
+
+def FindFilter(SourcePosition: np.ndarray, azi: float, ele: float) -> int:
+    """Find measurement closest to the selected direction,
+    reimplemented roughly along the lines of ConvBinauralRenderer.m
+
+    Parameters
+    ----------
+    SourcePosition: np.ndarray
+        Source IR positions
+    azi: float
+        desired response azimuth
+    ele: float
+        desired response elevation
 
-    logger.debug(
-        f"Loaded HRIRs: {hrirs_filename}, {hrirs.shape[0]} by {hrirs.shape[1]}"
+    Returns
+    -------
+    i_dir: int
+        index of nearest SourcePosition
+    """
+    if azi < 0:
+        azi = azi + 360.0
+
+    if ele < 0:
+        ele = ele + 360.0
+
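+    # angular distance to every measurement point via the spherical law of
+    # cosines, assuming SourcePosition columns are [azimuth, elevation, radius];
+    # the nearest measurement supplies the HRIR pair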
+    delta_azi = np.deg2rad(np.abs(azi - SourcePosition[:, 0]))
+    dist = np.arccos(
+        np.sin(np.deg2rad(SourcePosition[:, 1])) * np.sin(np.deg2rad(ele))
+        + np.cos(np.deg2rad(SourcePosition[:, 1]))
+        * np.cos(np.deg2rad(ele))
+        * np.cos(delta_azi)
     )
 
-    return hrirs
+    i_dir = np.argmin(dist)
+
+    return i_dir
+
+
+""" Core binaural rendering functions """
 
 
 def binaural_fftconv(
@@ -102,7 +229,7 @@
         output convolved signal array
 
     """
-    y = np.zeros((x.shape[0], 2))
+    y = np.zeros([x.shape[0], 2])
     for chan_idx in range(min(x.shape[1], nchannels)):
         if chan_idx not in lfe_index:
             y[:, 0] = np.add(
@@ -123,30 +250,37 @@
     return y
 
 
-def binaural_rendering(
+def binaural_fftconv_framewise(
     x: np.ndarray,
-    in_format_name: str,
-    dataset: str = "orange53",
-    fs: int = 48000,
-    include_LFE: bool = False,
-    trajectory: str = None,
-    LFE_gain: float = 1.0,
-    in_ls_layout_file: str = None,
-):
-    """Binaural rendering
+    IR: np.ndarray,
+    SourcePosition: np.ndarray,
+    azi: np.ndarray = None,
+    ele: np.ndarray = None,
+    frame_len: int = (IVAS_FRAME_LEN_MS // 4) * (48000 // 1000),  # 5 ms at 48 kHz
+    interp_method: str = "linear",
+    verbose: bool = False,
+) -> np.ndarray:
+    """Binauralization using fft convolution with frame-wise processing,
+    supports rotation on trajectories with interpolation between measured source
+    positions, reimplemented roughly along the lines of ConvBinauralRenderer.m
 
     Parameters
     ----------
-    x: np array
+    x: np.ndarray
         input multi-channel array
-    in_format_name: str
-        name of input spatial format
-    dataset: str
-        name of the HRIRs or BRIRs dataset
-    fs: int
-        input/output sampling-rate (default 48kHz)
-    trajectory: str
-        path to trajectory file
+    IR: np.ndarray
+        HRIRs array
+    SourcePosition: np.ndarray
+        positions of the source in the measurements in IR
+    azi: np.ndarray
+        azimuth angles for all frames
+    ele: np.ndarray
+        elevation angles for all frames
+    frame_len: int
+        frame length in samples, optional, default = 5 ms subframe at 48 kHz
+    interp_method: str
+        interpolation method, optional, default = "linear"
+
 
     Returns
    -------
@@ -155,175 +289,440 @@
 
     """
 
-    spformat = spatialaudioformat.Format(
-        in_format=in_format_name, ls_layout_file=in_ls_layout_file
-    )
+    sig_len = x.shape[0]
+    N_frames = int(sig_len / frame_len)
 
-    if trajectory is not None:
-        logger.info(
-            " performing rotation along trajectory from file {}".format(trajectory)
+    N_HRIR_taps = IR.shape[2]
+
+    if azi is None or ele is None:
+        azi = np.repeat([0.0], N_frames)
+        ele = np.repeat([0.0], N_frames)
+    elif len(azi) < N_frames or len(ele) < N_frames:
+        azi = np.concatenate(
+            [np.repeat(azi, N_frames // len(azi)), azi[: N_frames % len(azi)]]
         )
-    if not (
-        spformat.ambi_order > 0
-        or (spformat.isloudspeaker and spformat.nchannels > 2)
-        or spformat.name == "CUSTOM_LS"
-    ):
-        raise NotImplementedError(
-            "Head tracking only supported with [F|H]OA, LS, or CUSTOM_LS input!"
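+        # pad the per-frame angle arrays so that every processing frame has a
+        # target direction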
+ ele = np.concatenate( + [np.repeat(ele, N_frames // len(ele)), ele[: N_frames % len(ele)]] + ) + + iGs = np.zeros([N_frames], dtype=int) + mGs = np.zeros([N_frames], dtype=int) + + # store trajectory as a sequence of indices of source positions + # on the HRTF database in a compressed format such that, for + # each new measurement point the trajectory hits, the sample index + # is stored in mGs and the index of the measurement in iG + # the number of measurement points hit by the trajectory is nsp + isp = 0 + iGs[0] = FindFilter(SourcePosition, azi[0], ele[0]) + mGs[0] = 0 + for i_frame in range(1, N_frames): + iG = FindFilter(SourcePosition, azi[i_frame], ele[i_frame]) + if iG != iGs[isp]: + isp += 1 + iGs[isp] = iG + mGs[isp] = i_frame * frame_len + 1 + nsp = isp + 1 + + # set last fence post explicitly + if mGs[nsp] < sig_len: + iGs[nsp] = iG + mGs[nsp] = sig_len + nsp = nsp + 1 + + T_rev = frame_len + N_HRIR_taps - 1 + N_rev = int(np.ceil(T_rev / frame_len)) + + if verbose: + print(" N_rev = ", N_rev) + + fastcode = True + if N_rev > 5: + if verbose: + print( + " __ __ ___ ___ _ _ ___ _ _ ___ " ) + print( + r" \ \ / / / \ | _ \ | \| | |_ _| | \| | / __|" + ) + print( + r" \ \/\/ / | - | | / | . | | | | . | | (_ |" + ) + print( + r" \_/\_/ |_|_| |_|_\ |_|\_| |___| |_|\_| \___|" + ) + print( + " " + ) + print( + " You are using very long filters! This will be slooooow and use a lot of memory!" + ) + else: + fastcode = False - # get corresponding IR - IR, SourcePosition = get_IR(spformat, dataset, include_LFE, LFE_gain) + if fastcode and verbose: + print( + " __ __ ___ ___ _ _ ___ _ _ ___ " + ) + print( + r" \ \ / / / \ | _ \ | \| | |_ _| | \| | / __|" + ) + print( + r" \ \/\/ / | - | | / | . | | | | . | | (_ |" + ) + print( + r" \_/\_/ |_|_| |_|_\ |_|\_| |___| |_|\_| \___|" + ) + print( + " " + ) + print( + " To speed up the otherwise extremely slow calculation, we only calculate the " + ) + print( + " responses of the latest frame with the latest filters instead of the full " + ) + print( + " integrals. This is much faster but much more prone to clicks. Inspect your " + ) + print( + " output signals carefully! To change this behavior, go to binauralrenderer.py " + ) + print( + " and set fastcode to False. 
" + ) - if dataset.lower().endswith("full"): - ls_azi_all = spformat.ls_azi - ls_ele_all = spformat.ls_ele - lfe_index_all = spformat.lfe_index + y = np.zeros([sig_len + T_rev, 2]) + y0 = np.zeros([N_rev, sig_len + T_rev, 2]) - if spformat.name == "CUSTOM_LS": - logger.info(" Processing channels on custom LS layout") - logger.info("azi: {}".format(ls_azi_all)) - logger.info("ele: {}".format(ls_ele_all)) - logger.info("lfe_index: {}".format(lfe_index_all)) + b = np.linspace(0.0, 1.0, frame_len, endpoint=False) + a = 1.0 - b - y = audioarray.resample(x, fs, 48000) + for i_ear in [0, 1]: - frame_len = 240 - sig_len = y.shape[0] - N_frames = np.int(sig_len / frame_len) + Gs = IR[ + iGs[0:nsp], i_ear, : + ] # Green's function along the trajectory sampled by the measurement points + interp_G = interp.interp1d( + mGs[0:nsp], Gs, kind=interp_method, axis=0 + ) # interpolator for Green's function between those points - i_ls = 0 - verbose = True - y_all = np.zeros([sig_len, 2]) - for i_chan in range(y.shape[1]): + G = interp_G(np.arange(0, sig_len, frame_len)) - # skip LFE - if i_chan in lfe_index_all: - continue + t0 = timeit.default_timer() - # skip silent (or very low volume) channels - if np.allclose(y[:, i_chan], 0.0, atol=32.0): - continue + if fastcode: + for i_frame in range(N_frames): - ls_azi = np.repeat(ls_azi_all[i_ls], N_frames) - ls_ele = np.repeat(ls_ele_all[i_ls], N_frames) + i1 = i_frame * frame_len + i2 = (i_frame + 1) * frame_len + i2p = i1 + T_rev - azi, ele = rotateISM(ls_azi, ls_ele, trajectory=trajectory) + a = np.linspace(0.0, 1.0, T_rev, endpoint=False) + b = 1.0 - a - y_all += binaural_fftconv_framewise( - y[:, i_chan], - IR, - SourcePosition, - frame_len=frame_len, - azi=azi, - ele=ele, - verbose=verbose, - ) - i_ls += 1 - verbose = False + for j_frame in [0, 1]: + G_n_m = G[min(j_frame + i_frame, N_frames - 1), :] + y0[j_frame, i1:i2p, i_ear] = sig.oaconvolve( + np.squeeze(x[i1:i2]), G_n_m + ) + + y[i1:i2p, i_ear] += a * y0[0, i1:i2p, i_ear] + b * y0[1, i1:i2p, i_ear] + + t1 = timeit.default_timer() + fps = (i_frame + 1) / (t1 - t0) + eta = (2 * N_frames - (i_frame + 1) + i_ear * N_frames) / fps + + if verbose: + print( + " Frame {}/{} on ear {}/2 done at {: 3.1f} fps, ETA {: 6.0f} s ".format( + i_frame + 1, N_frames, i_ear + 1, fps, eta + ), + end="\r", + ) + + else: + for i_frame in range(N_frames): - y = audioarray.resample(y_all, 48000, fs) + i1 = i_frame * frame_len + i2 = (i_frame + 1) * frame_len + i2p = i1 + T_rev - return y_all + y0[:] = 0.0 + for j_frame in range( + max(0, i_frame - N_rev), min(i_frame + 1, N_frames) + ): + + j1 = j_frame * frame_len + j2 = (j_frame + 1) * frame_len + j2p = j1 + T_rev + + G0 = G[i_frame] + G1 = G[min(i_frame + 1, N_frames - 1)] + + y0[0, j1:j2p, i_ear] += sig.oaconvolve(np.squeeze(x[j1:j2]), G0) + y0[1, j1:j2p, i_ear] += sig.oaconvolve(np.squeeze(x[j1:j2]), G1) + + y[i1:i2, i_ear] = a * y0[0, i1:i2, i_ear] + b * y0[1, i1:i2, i_ear] + + t1 = timeit.default_timer() + fps = (i_frame + 1) / (t1 - t0) + eta = (2 * N_frames - (i_frame + 1) + i_ear * N_frames) / fps + + if verbose: + print( + " Frame {}/{} on ear {}/2 done at {: 3.1f} fps, ETA {: 6.0f} s ".format( + i_frame + 1, N_frames, i_ear + 1, fps, eta + ), + end="\r", + ) + + if verbose: + print("") - elif dataset.lower().startswith("sadie"): - if "cicp" in spformat.altname: - # TODO change LFE handling - if include_LFE: - IR_tmp = read_hrirs_from_mat( - hrirs_path="/HRIRs_mat/SADIE_II_D2_48K_24bit_256tap_MAGLS_SBA3.mat" - ) + return y[0:sig_len] - # copy omni response - for 
index in spformat.lfe_index:
-                    IR[:, :, index] = IR_tmp[:, :, 0]
-                    x[:, index] = LFE_gain * x[:, index]
-
-            spformat.lfe_index = []
-
+def binaural_render_LFE(
+    x: np.ndarray,
+    fs: int = 48000,
+    lfe_index: list = [3],
+    LFE_gain: float = 10 ** (5.5 / 20),
+    hrir_latency_smp: float = 0,
+) -> np.ndarray:
+    """Low-pass filter, delay-align and gain the LFE channel(s),
+    returned as a stereo signal to be added to the binaural output"""
+
+    lfe = x[:, lfe_index].copy()
+
+    # if there is more than one LFE sum them into one
+    if lfe.shape[1] > 1:
+        lfe = np.sum(lfe, axis=1, keepdims=True)
+
+    # 120 Hz low-pass filtering for LFE using IVAS filter coefficients
+    if fs == 48000:
+        lfe = sig.sosfilt(IVAS_LPF_4_BUTTER_48K_SOS, lfe, axis=0)
+    else:
+        raise NotImplementedError("Only 48 kHz supported at the moment!")
+
+    # 3.5ms LP filter delay from IVAS ROM
+    filter_delay = int(3.5 * fs / 1000)
+
+    # delay adjustment: LP filter delay plus the broadband latency of the
+    # HRIR-convolved channels (hrir_latency_smp defaults to 0)
+    delay = filter_delay + int(hrir_latency_smp)
+    lfe = np.roll(lfe, delay, axis=0)
+    lfe[:delay, :] = 0
+
+    # apply gain
+    lfe *= LFE_gain
+
+    # duplicate for each binaural channel
+    lfe = np.hstack([lfe, lfe])
+
+    return lfe
+
+
+""" Format specific wrapper functions """
+
+
+def render_custom_ls_binaural(
+    x: np.ndarray,
+    fs: int,
+    in_format: spatialaudioformat.Format,
+    IR: np.ndarray,
+    SourcePosition: np.ndarray,
+    trajectory: str,
+) -> np.ndarray:
+    if in_format.name != "CUSTOM_LS":
+        raise ValueError(
+            f"Unsupported format {in_format.name} for CUSTOM_LS binaural rendering!"
+        )
+
+    ls_azi_all = in_format.ls_azi
+    ls_ele_all = in_format.ls_ele
+    lfe_index_all = in_format.lfe_index
+
+    logger.info(" Processing channels on custom LS layout")
+    logger.info("azi: {}".format(ls_azi_all))
+    logger.info("ele: {}".format(ls_ele_all))
+    logger.info("lfe_index: {}".format(lfe_index_all))
 
     y = audioarray.resample(x, fs, 48000)
 
-    if trajectory is not None and spformat.ambi_order > 0:
-        y = rotateHOA(y, trajectory)
-    y = binaural_fftconv(y, IR, spformat.nchannels, spformat.lfe_index)
-    y = audioarray.resample(y, 48000, fs)
+    frame_len = (IVAS_FRAME_LEN_MS // 4) * (48000 // 1000)  # y is at 48 kHz here
+    sig_len = y.shape[0]
+    N_frames = int(sig_len / frame_len)
 
-    return y
+    i_ls = 0
+    y_all = np.zeros([sig_len, 2])
+    for i_chan in range(y.shape[1]):
 
+        # skip LFE
+        if i_chan in lfe_index_all:
+            continue
 
-def get_IR(
-    spformat: spatialaudioformat, dataset: str, include_LFE: bool, LFE_gain: float
-) -> Tuple[np.ndarray, np.ndarray]:
-    """ get_IR
+        # skip silent (or very low volume) channels
+        if np.allclose(y[:, i_chan], 0.0, atol=32.0):
+            continue
+
+        ls_azi = np.repeat(ls_azi_all[i_ls], N_frames)
+        ls_ele = np.repeat(ls_ele_all[i_ls], N_frames)
+
+        azi, ele = rotateISM(ls_azi, ls_ele, trajectory=trajectory)
+
+        y_all += binaural_fftconv_framewise(
+            y[:, i_chan],
+            IR,
+            SourcePosition,
+            frame_len=frame_len,
+            azi=azi,
+            ele=ele,
+            verbose=False,
+        )
+        i_ls += 1
+
+    y = audioarray.resample(y_all, 48000, fs)
+
+    return y
+
+
+def render_ism_binaural(
+    x: np.ndarray,
+    fs: int,
+    in_format: spatialaudioformat.Format,
+    IR: np.ndarray,
+    SourcePosition: np.ndarray,
+    trajectory: str,
+    in_pos: list,
+) -> np.ndarray:
+    y = audioarray.resample(x, fs, 48000)
+
+    frame_len = (IVAS_FRAME_LEN_MS // 4) * (48000 // 1000)  # y is at 48 kHz here
+    sig_len = y.shape[0]
+    N_frames = int(sig_len / frame_len)
+
+    # get ISM metadata and repeat it nsubframe times
+    pos_data = []
+    for pos in in_pos:
+        pos_data.extend(
+            [pos["azimuth"], pos["elevation"]] for _ in range(pos["use_for_frames"])
+        )
+    pos_data = np.array(pos_data)
+    # each 20 ms metadata frame covers four 5 ms subframes
+    pos_data = np.repeat(pos_data, 4, axis=0)
+
+    # extract positions only according to the audio duration
+    pos_data = pos_data[:N_frames, :]
+
+    y_all = np.zeros([sig_len, 2])
+
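+    # rotate the object trajectory by the head-tracking trajectory before
+    # selecting/interpolating HRIRs in the framewise convolution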
+    azi, ele = rotateISM(pos_data[:, 0], pos_data[:, 1], trajectory=trajectory)
+
+    y_all += binaural_fftconv_framewise(
+        y,
+        IR,
+        SourcePosition,
+        frame_len=frame_len,
+        azi=azi,
+        ele=ele,
+        verbose=False,
+    )
+
+    y = audioarray.resample(y_all, 48000, fs)
+
+    return y
+
+
+""" Wrapper function for generic binaural rendering """
+
+
+def binaural_rendering(
+    x: np.ndarray,
+    in_format: spatialaudioformat.Format,
+    out_format: spatialaudioformat.Format,
+    dataset: str = "orange53",
+    fs: int = 48000,
+    trajectory: str = None,
+    include_LFE: bool = False,
+    LFE_gain: float = 10 ** (5.5 / 20),
+    in_ls_layout_file: str = None,
+    in_pos: list = None,
+):
+    """Binaural rendering
 
     Parameters
     ----------
-    spformat: spatialaudioformat
-        input spatial audio format
+    x: np.ndarray
         input multi-channel array
-    in_format_name: str
-        name of input spatial format
+    in_format: spatialaudioformat.Format
+        input spatial audio format
+    out_format: spatialaudioformat.Format
+        output spatial audio format
     dataset: str
         name of the HRIRs or BRIRs dataset
     fs: int
         input/output sampling-rate (default 48kHz)
     trajectory: str
         path to trajectory file
+    include_LFE: bool
+        flag to include the LFE channel(s) in the rendering
+    LFE_gain: float
+        gain to be applied to the LFE channel(s)
+    in_ls_layout_file: str
+        path to a custom loudspeaker layout file
+    in_pos: list
+        ISM position metadata
 
     Returns
     -------
-    IR: np.ndarray
-        desired impulse response array
-    SourcePosition: np.ndarray
-        source positions of corresponding IRs
+    y: np.ndarray
+        output binaural signal array
 
     """
-    # dataset file prefix
-    if dataset.lower().startswith("sadie"):
-        prefix = "/HRIRs_mat/SADIE_II_D2_48K_24bit_256tap"
-    elif dataset.lower().startswith("orange"):
-        prefix = f"/HRIRs_mat/ORANGE_HRIR_{dataset.replace('_full', '')[-2:]}_48000"
-    elif dataset.lower().startswith("mozart"):
-        prefix = "/BRIRs_mat/IIS_BRIR_officialMPEG_222UC"
-    else:
-        raise ValueError(f"Unsupported dataset '{dataset}' for HRIRs")
 
-    # dataset file suffix
-    if dataset.endswith("full"):
-        suffix = "full.mat"
-    elif spformat.isloudspeaker and spformat.nchannels > 1:
-        suffix = "combined.mat"
-    elif spformat.ambi_order > 0 or spformat.name.upper() == "MONO":
-        suffix = "SBA3.mat"
-        if dataset.startswith("mozart"):
-            suffix = "VIRTUALLS_SBA3.mat"
-    else:
-        raise ValueError(
-            f"Unsupported format '{spformat.name}' for dataset '{dataset}' for HRIRs"
+    if trajectory is not None:
+        logger.info(
+            " performing rotation along trajectory from file {}".format(trajectory)
         )
 
-    if dataset.endswith("full"):
-        IR, SourcePosition = read_hrirs_from_mat_full("_".join([prefix, suffix]))
-    else:
-        IR = read_hrirs_from_mat("_".join([prefix, suffix]))
-        SourcePosition = None
+    # get IR corresponding to the input and output formats
+    IR, SourcePosition = get_IR(in_format, out_format, dataset)
 
-    if spformat.name.upper() == "MONO":
-        IR = IR[:, :, 0]  # use omni/W from SBA
-    elif spformat.name.upper() == "STEREO":
-        IR = IR[:, :, :2]  # use L and R channels
-    elif spformat.isloudspeaker and include_LFE:
-        # extract positions from the combined file
-        tmpformat = spatialaudioformat.Format("COMBINED")
-        IR_tmp = IR.copy()
-        IR = np.zeros((IR_tmp.shape[0], IR_tmp.shape[1], spformat.nchannels))
+    if in_format.name.startswith("CUSTOM_LS"):
+        return render_custom_ls_binaural(
+            x, fs, in_format, IR, SourcePosition, trajectory
+        )
+    elif in_format.name.startswith("ISM"):
+        if not in_pos:
+            raise ValueError("ISM metadata empty!")
+        return render_ism_binaural(
+            x,
+            fs,
+            in_format,
+            IR,
+            SourcePosition,
+            trajectory,
+            in_pos,
+        )
+    elif in_format.name.startswith("MASA"):
+        raise NotImplementedError("Rendering of MASA is not yet supported!")  # TODO
+        # return render_masa_binaural()
+    elif in_format.ambi_order > 0 or in_format.isloudspeaker:
+        y = audioarray.resample(x, fs, 48000)
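+
+        # loudspeaker / ambisonics path: optional scene rotation, separate LFE
+        # low-pass path, per-channel HRIR convolution, then delay alignment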
-        ir_index = 0
-        for i in range(tmpformat.nchannels):
-            for j in range(spformat.nchannels):
-                if (
-                    tmpformat.ls_azi[i] == spformat.ls_azi[j]
-                    and tmpformat.ls_ele[i] == spformat.ls_ele[j]
-                ):
-                    if j != spformat.lfe_index[0]:
-                        IR[:, :, ir_index] = IR_tmp[:, :, i]
-                        ir_index += 1
+        if trajectory is not None:
+            if in_format.ambi_order > 0:
+                y = rotateHOA(y, trajectory)
+            if in_format.isloudspeaker:
+                y = rotateMC(y, trajectory, in_format)
 
-    return IR, SourcePosition
+        # broadband HRIR latency, needed below for LFE alignment and for
+        # the final delay compensation
+        latency_smp = int(np.argmax(np.sum(np.abs(IR), axis=(1, 2))))
+
+        if include_LFE:
+            lfe = binaural_render_LFE(
+                y, 48000, in_format.lfe_index, LFE_gain, latency_smp
+            )
+
+        y = binaural_fftconv(y, IR, in_format.nchannels, in_format.lfe_index)
+
+        if include_LFE:
+            y += lfe
+
+        # HRTF delay compensation
+        y = np.roll(y, latency_smp, axis=0)
+        if latency_smp > 0:
+            y[:latency_smp, :] = 0
+
+        y = audioarray.resample(y, 48000, fs)
+
+        return y
+    else:
+        raise NotImplementedError(
+            f"{in_format.name} -> {out_format.name}: format conversion not implemented"
+        )
diff --git a/scripts/pyaudio3dtools/constants.py b/scripts/pyaudio3dtools/constants.py
new file mode 100644
index 0000000000..648efb01c7
--- /dev/null
+++ b/scripts/pyaudio3dtools/constants.py
@@ -0,0 +1,386 @@
+#!/usr/bin/env python3
+
+"""
+   (C) 2022 Baseline Development Group with portions copyright Dolby International AB, Ericsson AB,
+   Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+   Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies OY, Orange,
+   Panasonic Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation. All Rights Reserved.
+
+   This software is protected by copyright law and by international treaties.
+   The Baseline Development Group consisting of Dolby International AB, Ericsson AB,
+   Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+   Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies OY, Orange,
+   Panasonic Corporation, Qualcomm Technologies, Inc., and VoiceAge Corporation retain full ownership
+   rights in their respective contributions in the software. No license of any kind, including but not
+   limited to patent license, of any foregoing parties is hereby granted by implication, estoppel or
+   otherwise.
+
+   This software is provided "AS IS", without any express or implied warranties. The software is in the
+   development stage. It is intended exclusively for experts who have experience with such software and
+   solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+   and/or fitness for a particular purpose are hereby disclaimed and excluded.
+
+   Any dispute, controversy or claim arising under or in relation to providing this software shall be
+   submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+   accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+   the United Nations Convention on Contracts on the International Sales of Goods.
+""" + +import numpy as np + +IVAS_FRAME_LEN_MS = 20 + +IVAS_CICPX_TO_MONO = np.array( + [ + [ + 1, + 1, + np.sqrt(0.5), + np.sqrt(0.5), + 0.79999995, + 0.79999995, + 0.79999995, + 0.79999995, + 0.849999964, + 0.849999964, + 0.849999964, + 0.849999964, + ] + ] +).T + +IVAS_CICPX_TO_STEREO = np.array( + [ + [1, 0], + [0, 1], + [np.sqrt(0.5), np.sqrt(0.5)], + [np.sqrt(0.5), np.sqrt(0.5)], + [0.79999995, 0], + [0, 0.79999995], + [0.79999995, 0], + [0, 0.79999995], + [0.849999964, 0], + [0, 0.849999964], + [0.849999964, 0], + [0, 0.849999964], + ] +) + +# downmix matrices +IVAS_CICP12_TO_6 = np.zeros(8 * 6) +IVAS_CICP12_TO_6[[0, 7, 14, 21, 28, 35, 40, 47]] = 1 +IVAS_CICP12_TO_6 = IVAS_CICP12_TO_6.reshape(8, 6) + +IVAS_CICP14_TO_6 = np.zeros(8 * 6) +IVAS_CICP14_TO_6[[0, 7, 14, 21, 28, 35]] = 1 +IVAS_CICP14_TO_6[[36, 43]] = 0.849999964 +IVAS_CICP14_TO_6 = IVAS_CICP14_TO_6.reshape(8, 6) + +IVAS_CICP16_TO_6 = np.zeros(10 * 6) +IVAS_CICP16_TO_6[[0, 7, 14, 21, 28, 35]] = 1 +IVAS_CICP16_TO_6[[36, 43, 52, 59]] = 0.89999964 +IVAS_CICP16_TO_6 = IVAS_CICP16_TO_6.reshape(10, 6) + +IVAS_CICP16_TO_12 = np.zeros(10 * 8) +IVAS_CICP16_TO_12[[0, 9, 18, 27, 36, 45]] = 1 +IVAS_CICP16_TO_12[[48, 57, 68, 77]] = 0.89999964 +IVAS_CICP16_TO_12 = IVAS_CICP16_TO_12.reshape(10, 8) + +IVAS_CICP16_TO_14 = np.zeros(10 * 8) +IVAS_CICP16_TO_14[[0, 11, 22, 33, 44, 55, 66, 77]] = 1 +IVAS_CICP16_TO_14[[48, 59]] = 0.899999964 +IVAS_CICP16_TO_14 = IVAS_CICP16_TO_14.reshape(10, 8) + +IVAS_CICP19_TO_6 = np.zeros(12 * 6) +IVAS_CICP19_TO_6[[0, 7, 14, 21, 28, 35]] = 1 +IVAS_CICP19_TO_6[[36, 43]] = 0.367322683 +IVAS_CICP19_TO_6[[48, 55, 64, 71]] = 0.849999964 +IVAS_CICP19_TO_6[[40, 47]] = 0.930093586 +IVAS_CICP19_TO_6 = IVAS_CICP19_TO_6.reshape(12, 6) + +IVAS_CICP19_TO_12 = np.zeros(12 * 8) +IVAS_CICP19_TO_12[[0, 9, 18, 27, 38, 47]] = 1 +IVAS_CICP19_TO_12[[48, 57]] = 0.367322683 +IVAS_CICP19_TO_12[[64, 73, 84, 93]] = 0.849999964 +IVAS_CICP19_TO_12[[52, 61]] = 0.930093586 +IVAS_CICP19_TO_12 = IVAS_CICP19_TO_12.reshape(12, 8) + +IVAS_CICP19_TO_14 = np.zeros(12 * 8) +IVAS_CICP19_TO_14[[0, 9, 18, 27, 36, 45, 70, 79]] = 1 +IVAS_CICP19_TO_14[[48, 57]] = 0.367322683 +IVAS_CICP19_TO_14[[84, 93]] = 0.849999964 +IVAS_CICP19_TO_14[[52, 61]] = 0.930093586 +IVAS_CICP19_TO_14 = IVAS_CICP19_TO_14.reshape(12, 8) + +IVAS_CICP19_TO_16 = np.zeros(12 * 10) +IVAS_CICP19_TO_16[[0, 11, 22, 33, 44, 55, 86, 97, 108, 119]] = 1 +IVAS_CICP19_TO_16[[60, 71]] = 0.367322683 +IVAS_CICP19_TO_16[[64, 75]] = 0.930093586 +IVAS_CICP19_TO_16 = IVAS_CICP19_TO_16.reshape(12, 10) + +# upmix matrices +IVAS_MONO_TO_CICPX = np.zeros([1, 12]) +IVAS_MONO_TO_CICPX[0, 2] = 1 + +IVAS_STEREO_TO_CICPX = np.zeros([2, 12]) +IVAS_STEREO_TO_CICPX[0, 0] = 1 +IVAS_STEREO_TO_CICPX[1, 1] = 1 + +IVAS_CICP12_TO_14 = np.zeros(8 * 8) +IVAS_CICP12_TO_14[[0, 9, 18, 27, 36, 45, 52, 61]] = 1 +IVAS_CICP12_TO_14 = IVAS_CICP12_TO_14.reshape(8, 8) + +IVAS_CICP12_TO_16 = np.zeros(8 * 10) +IVAS_CICP12_TO_16[[0, 11, 22, 33, 44, 55, 64, 75]] = 1 +IVAS_CICP12_TO_16 = IVAS_CICP12_TO_16.reshape(8, 10) + +IVAS_CICP14_TO_19 = np.zeros(8 * 12) +IVAS_CICP14_TO_19[[0, 13, 26, 39, 52, 65, 80, 93]] = 1 +IVAS_CICP14_TO_19 = IVAS_CICP14_TO_19.reshape(8, 12) + +IVAS_CICP16_TO_19 = np.zeros(10 * 12) +IVAS_CICP16_TO_19[[0, 13, 26, 39, 52, 65, 80, 93, 106, 119]] = 1 +IVAS_CICP16_TO_19 = IVAS_CICP16_TO_19.reshape(10, 12) + +# mapping dict +IVAS_MC_CONVERSION = { + "MONO": { + # upmix + "5_1": IVAS_MONO_TO_CICPX[:, :6], + "7_1": IVAS_MONO_TO_CICPX[:, :8], + "5_1_2": IVAS_MONO_TO_CICPX[:, :8], + "5_1_4": IVAS_MONO_TO_CICPX[:, 
:10], + "7_1_4": IVAS_MONO_TO_CICPX[:, :12], + }, + "STEREO": { + # upmix + "5_1": IVAS_STEREO_TO_CICPX[:, :6], + "7_1": IVAS_STEREO_TO_CICPX[:, :8], + "5_1_2": IVAS_STEREO_TO_CICPX[:, :8], + "5_1_4": IVAS_STEREO_TO_CICPX[:, :10], + "7_1_4": IVAS_STEREO_TO_CICPX[:, :12], + }, + "5_1": { + # downmix + "MONO": IVAS_CICPX_TO_MONO[:6, :], + "STEREO": IVAS_CICPX_TO_STEREO[:6, :], + # upmix + "7_1": np.pad(np.eye(6), [[0, 0], [0, 2]]), + "5_1_2": np.pad(np.eye(6), [[0, 0], [0, 2]]), + "5_1_4": np.pad(np.eye(6), [[0, 0], [0, 4]]), + "7_1_4": np.pad(np.eye(6), [[0, 0], [0, 6]]), + }, + "7_1": { + # downmix + "MONO": IVAS_CICPX_TO_MONO[:8, :], + "STEREO": IVAS_CICPX_TO_STEREO[:8, :], + "5_1": IVAS_CICP12_TO_6, + # upmix + "5_1_2": IVAS_CICP12_TO_14, + "5_1_4": IVAS_CICP12_TO_16, + "7_1_4": np.pad(np.eye(8), [[0, 0], [0, 4]]), + }, + "5_1_2": { + # downmix + "MONO": np.vstack([IVAS_CICPX_TO_MONO[:6, :], IVAS_CICPX_TO_MONO[-2:, :]]), + "STEREO": np.vstack( + [IVAS_CICPX_TO_STEREO[:6, :], IVAS_CICPX_TO_STEREO[-2:, :]] + ), + "5_1": IVAS_CICP14_TO_6, + "7_1": np.pad(IVAS_CICP14_TO_6, [[0, 0], [0, 2]]), + # upmix + "5_1_4": np.pad(np.eye(8), [[0, 0], [0, 2]]), + "7_1_4": IVAS_CICP14_TO_19, + }, + "5_1_4": { + # downmix + "MONO": IVAS_CICPX_TO_MONO[:10, :], + "STEREO": IVAS_CICPX_TO_STEREO[:10, :], + "5_1": IVAS_CICP16_TO_6, + "7_1": IVAS_CICP16_TO_12, + "5_1_2": IVAS_CICP16_TO_14, + # upmix + "7_1_4": IVAS_CICP16_TO_19, + }, + "7_1_4": { + # downmix + "MONO": IVAS_CICPX_TO_MONO, + "STEREO": IVAS_CICPX_TO_STEREO, + "5_1": IVAS_CICP19_TO_6, + "7_1": IVAS_CICP19_TO_12, + "5_1_2": IVAS_CICP19_TO_14, + "5_1_4": IVAS_CICP19_TO_16, + }, +} + +# LFE 120 Hz LPF filter coefficients +IVAS_LPF_4_BUTTER_48K_SOS = np.array( + [ + [ + 5.12617881476274e-09, + 1.02523584294987e-08, + 5.12617879059970e-09, + 1, + -1.96875982668433, + 0.969044914826862, + ], + [ + 1, + 1.99999984394358, + 1.00000000471366, + 1, + -1.98677297369091, + 0.987060670205863, + ], + ] +) + +T_DESIGN_11_AZI = np.array( + [ + 132.927291884332, + -83.9349499672527, + 8.47410038634525, + -113.340833834572, + -103.265909909537, + -33.2370360923825, + 21.8564347471830, + -156.539486489880, + -64.2647531387317, + 165.779530068738, + -25.2028339893249, + -97.0037973959711, + 27.8546391256925, + 153.214218975132, + -155.061608694663, + -11.8421354925543, + 80.5387312016125, + -42.0561606270165, + -31.2233262205060, + 38.8379041944063, + 93.7606877469492, + -84.7560200078398, + 7.75536818082863, + -122.276883381108, + 46.8012705252113, + -24.7686335284573, + 99.8904719062334, + -134.783996960185, + -83.0880230164493, + 60.1281736000420, + 152.644656278084, + 29.7576658909417, + 40.7793187974476, + 110.183927562412, + 165.652065916454, + -12.9926632105736, + 79.7359893585681, + -50.5245271190884, + 118.923930267733, + 47.2202861862577, + 171.925276523721, + -62.5145800558502, + -11.1156697680531, + 132.018041099963, + -135.355486412425, + 102.370921576708, + 112.739282398012, + -178.304963670831, + -122.319932198534, + 59.0763464570905, + 151.704200334501, + 21.3763364190503, + -169.005476417779, + 118.980811786769, + -116.089295979010, + 9.64767870353308, + 60.8933243657771, + -156.021526862757, + -63.4602993325163, + 174.929787427393, + -175.288768596346, + -105.951907934032, + -50.1928304519800, + 131.358266702971, + -136.296815007542, + 93.5644603506407, + -97.0840116473627, + -169.158278888619, + -44.1323835471345, + 81.4795403841382, + ] +) + +T_DESIGN_11_ELE = np.array( + [ + 7.69254738757899, + -23.7300652200871, + 23.5127556185301, + 
70.4225940747938, + -9.89694439538752, + -70.7513316063095, + -26.4618527647561, + 47.7764936689044, + -7.72047049524459, + 44.5343602375216, + 26.3897904767450, + -44.6578850137166, + 9.76703456924600, + -47.7053318175498, + 7.45302934155972, + -23.5901209534773, + 23.7194484034707, + 70.4382693912270, + -9.83541588740259, + -70.4980825105727, + -26.2949218109204, + 47.6148028805222, + -7.51718499746626, + 44.2862347125773, + 26.6442619674660, + -44.5693707254340, + 9.91271928508000, + -47.9599550372574, + 7.29679922953795, + -23.3445981426306, + 23.6415261666079, + 70.6843143997832, + -9.58140351749889, + -70.3934534122902, + -26.4258159091605, + 47.7510668062369, + -7.30853603036844, + 44.2632768570349, + 26.7140614474957, + -44.3149733480527, + 9.75899721561506, + -48.0361913333593, + 7.43965099805872, + -23.3326075548841, + 23.3868959687598, + 70.8219078016791, + -9.48596399169388, + -70.5801867828491, + -26.6740262349265, + 47.9978414043199, + -7.38276167631068, + 44.4970603752708, + 26.5024990214418, + -44.2461913308458, + 9.51845076548334, + -47.8281351088411, + 7.68427447425834, + -23.5706842106942, + 23.3074499244045, + 70.6586472132300, + -9.68088860263008, + -70.8026785673948, + -26.6963451935976, + 48.0136296461397, + -7.63734823159200, + 44.6651234222196, + 26.3023490002159, + -44.4576351865647, + 9.52341455917443, + -47.6242211091394, + ] +) diff --git a/scripts/pyaudio3dtools/hoadecoder.py b/scripts/pyaudio3dtools/hoadecoder.py index 9296a58f90..331e86f52f 100644 --- a/scripts/pyaudio3dtools/hoadecoder.py +++ b/scripts/pyaudio3dtools/hoadecoder.py @@ -32,49 +32,68 @@ import logging import os +from typing import Optional import numpy as np -import scipy.io as sio from scipy.special import lpmv + from pyaudio3dtools import spatialaudioformat -from typing import Optional +from pyaudio3dtools.constants import T_DESIGN_11_AZI, T_DESIGN_11_ELE +from pyaudio3dtools.EFAP import EFAP main_logger = logging.getLogger("__main__") logger = main_logger.getChild(__name__) logger.setLevel(logging.DEBUG) -def read_hoa_mtx_from_mat(ambi_order: int, spkrlayout: str) -> np.ndarray: - script_path = os.path.dirname(os.path.abspath(__file__)) - mtx_filename = os.path.join( - script_path, - f"hoa_decoder_mat/mtx_hoa3_decoder_allrad_{spkrlayout}.mat", - ) - - mat_contents = sio.loadmat(mtx_filename) - mtx_hoa_dec = mat_contents["mtx_hoa_decoder"] - - logger.debug( - f"Loaded hoa dec mtx: {os.path.basename(mtx_filename)}, {mtx_hoa_dec.shape[0]} by {mtx_hoa_dec.shape[1]}" - ) - - return mtx_hoa_dec - - -def get_hoa_mtx(ambi_order: int, spkrlayout: str) -> np.ndarray: +def get_hoa_mtx( + ambi_order: int, + spkrlayout: spatialaudioformat, + norm: Optional[str] = "sn3d", + rE_weight: Optional[bool] = False, + intensity_panning: Optional[bool] = True, +) -> np.ndarray: nharm = spatialaudioformat.Format.nchannels_from_ambiorder(ambi_order) - if spkrlayout.upper() == "MONO": - mtx_hoa_dec = np.zeros((1, nharm)) + if spkrlayout.name == "MONO": + mtx_hoa_dec = np.zeros([1, nharm]) mtx_hoa_dec[0, 0] = 1 - elif spkrlayout.upper() == "STEREO": - mtx_hoa_dec = np.zeros((2, nharm)) + elif spkrlayout.name == "STEREO": + mtx_hoa_dec = np.zeros([2, nharm]) # Cardioids +/- 90 degrees mtx_hoa_dec[0, 0] = 0.5 mtx_hoa_dec[0, 1] = 0.5 mtx_hoa_dec[1, 0] = 0.5 - mtx_hoa_dec[0, 1] = -0.5 + mtx_hoa_dec[1, 1] = -0.5 + elif spkrlayout.isloudspeaker: + Y_td = getRSH(T_DESIGN_11_AZI, T_DESIGN_11_ELE, ambi_order, norm="ortho") + Y_td *= np.sqrt(4 * np.pi) + + n_ls_woLFE = spkrlayout.nchannels - len(spkrlayout.lfe_index) + 
ls_azi_woLFE = np.delete(spkrlayout.ls_azi, spkrlayout.lfe_index).astype(float)
+        ls_ele_woLFE = np.delete(spkrlayout.ls_ele, spkrlayout.lfe_index).astype(float)
+
+        panner = EFAP(ls_azi_woLFE, ls_ele_woLFE)
+        G_td = panner.pan(T_DESIGN_11_AZI, T_DESIGN_11_ELE, intensity_panning)
+
+        mtx_hoa_dec = (G_td.T @ Y_td.T) / T_DESIGN_11_AZI.size
+
+        if norm == "sn3d":
+            mtx_hoa_dec = mtx_hoa_dec @ np.diag(sn2n(ambi_order))
+        elif norm == "ortho":
+            mtx_hoa_dec *= np.sqrt(4 * np.pi)
+
+        if rE_weight:
+            # max-rE weighting (Zotter/Frank): the per-degree weight
+            # P_l(cos(137.9 deg / (ambi_order + 1.51))) is repeated over the
+            # (2l + 1) components of each degree l
+            rE = np.cos(np.deg2rad(137.9 / (ambi_order + 1.51)))
+            a_n = np.repeat(
+                [lpmv(0, l, rE) for l in range(ambi_order + 1)],
+                [2 * l + 1 for l in range(ambi_order + 1)],
+            )
+            nrg_pre = np.sqrt(n_ls_woLFE / np.sum(a_n**2))
+            mtx_hoa_dec = mtx_hoa_dec @ np.diag(a_n) * nrg_pre
+
+        mtx_hoa_dec = np.insert(
+            mtx_hoa_dec, spkrlayout.lfe_index, np.zeros(nharm), axis=0
+        )
     else:
-        mtx_hoa_dec = read_hoa_mtx_from_mat(ambi_order, spkrlayout.upper())[:, :nharm]
+        raise ValueError(
+            f"Unsupported spatial audio format for ALLRAD: {spkrlayout.name}"
+        )
 
     return mtx_hoa_dec
 
@@ -130,7 +149,7 @@
 
     LM = np.array([(l, m) for l in range(ambi_order + 1) for m in range(-l, l + 1)])
 
-    response = np.zeros((LM.shape[0], azi.shape[0]))
+    response = np.zeros([LM.shape[0], azi.shape[0]])
 
     # trig_term * legendre * uncondon
     for i, (l, m) in enumerate(LM):
diff --git a/scripts/pyaudio3dtools/prerenderer.py b/scripts/pyaudio3dtools/prerenderer.py
deleted file mode 100644
index 7b85de27bb..0000000000
--- a/scripts/pyaudio3dtools/prerenderer.py
+++ /dev/null
@@ -1,255 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-    (C) 2022 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
-    Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-    Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-    Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-    contributors to this repository. All Rights Reserved.
-
-    This software is protected by copyright law and by international treaties.
-    The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
-    Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
-    Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
-    Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
-    contributors to this repository retain full ownership rights in their respective contributions in
-    the software. This notice grants no license of any kind, including but not limited to patent
-    license, nor is any license granted by implication, estoppel or otherwise.
-
-    Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
-    contributions.
-
-    This software is provided "AS IS", without any express or implied warranties. The software is in the
-    development stage. It is intended exclusively for experts who have experience with such software and
-    solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
-    and fitness for a particular purpose are hereby disclaimed and excluded.
-
-    Any dispute, controversy or claim arising under or in relation to providing this software shall be
-    submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
-    accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
-    the United Nations Convention on Contracts on the International Sales of Goods.
-""" - -import logging -import os -import subprocess as sp -import sys - -import numpy as np - -from pyaudio3dtools.audiofile import readfile, writefile - -main_logger = logging.getLogger("__main__") -logger = main_logger.getChild(__name__) -logger.setLevel(logging.DEBUG) - - -def _abs_path(rel_path): - return os.path.abspath( - os.path.join(os.path.dirname(__file__), *(rel_path.split("/"))) - ) - - -_prerenderer_exec_path = _abs_path("../prerenderer/IVAS_prerenderer") -if sys.platform.startswith("win"): - _prerenderer_exec_path += ".exe" -elif sys.platform.startswith("linux"): - pass -elif sys.platform.startswith("darwin"): - _prerenderer_exec_path += "_darwin" - -_tmp_dir = _abs_path("tmp") - - -def render_from_file( - audio_in_path: str, - input_config: str, - output_config: str, - audio_out_path: str = None, - frame_size_ms: int = 20, - fs: int = 48000, - prerenderer_bin: str = None, -) -> np.ndarray: - """ - Renders a 3D audio scene from a wav/pcm file based on given config file/string - - Parameters - ---------- - audio_in_path : str - path to input wav file - - input_config : str - path to config file or - "sba" for single ambisonics input or - "mc" for single multichannel input - - output_config : str - for ambisonics of order N: "sbaN" - for multichannel cicp index X: "cicpX" - - frame_size_ms : int - processing frame size in milliseconds - - fs : int - input and processing sampling rate - - Returns - ------- - Numpy array - with processed samples - """ - - if not audio_out_path: - if not os.path.exists(_tmp_dir): - os.mkdir(_tmp_dir) - audio_out_path = os.path.join(_tmp_dir, "prerenderer_out.wav") - - if prerenderer_bin is None: - prerenderer_bin = _prerenderer_exec_path - - if not os.path.exists(prerenderer_bin): - raise FileNotFoundError( - f"The IVAS prerenderer binary was not found at the given path: {os.path.abspath(prerenderer_bin)}" - ) - - cmd = [ - prerenderer_bin, - "-ia", - audio_in_path, - "-fs", - str(fs // 1000), - "-fr", - str(frame_size_ms), - "-oc", - output_config, - "-of", - audio_out_path, - ] - - if os.path.exists(input_config): - cmd.extend(["-if", input_config]) - elif input_config.lower().startswith("cicp") or input_config.lower().startswith( - "mc" - ): - cmd.extend(["-if-mc", input_config]) - elif input_config.lower().startswith("sba"): - cmd.extend(["-if-sba", input_config]) - else: - raise Exception("Invalid input config for prerenderer") - - logger.debug(" ".join(cmd)) - logger.info( - f" Prerenderer {input_config.upper()} -> {output_config.upper()} : {os.path.join(os.path.basename(os.path.dirname(audio_out_path)) ,os.path.basename(audio_out_path))}" - ) - try: - result = sp.run(cmd, check=True, capture_output=True, text=True) - except sp.CalledProcessError as e: - logger.debug(f"Command returned non-zero exit status : {e.returncode}") - logger.debug(e.stderr) - logger.debug(e.stdout) - raise SystemError( - f"Command returned non-zero exit status ({e.returncode}): {' '.join(e.cmd)}\n{e.stderr}\n{e.stdout}" - ) - - logger.debug(result.stdout) - logger.debug(result.stderr) - - return readfile(audio_out_path, fs=fs)[0] - - -def render_from_array( - audio_in_array: np.ndarray, - input_config: str, - output_config: str, - audio_out_path: str = None, - frame_size_ms: int = 20, - fs: int = 48000, - prerenderer_bin: str = None, -): - """ - Renders a 3D audio scene from a NumPy array based on given config file/string - - Parameters - ---------- - audio_in_array : np.ndarray - NumPy array containing input audio - - input_config : str - path to config file or - 
"sba" for single ambisonics input or - "mc" for single multichannel input - - output_config : str - for ambisonics of order N: "sbaN" - for multichannel cicp index X: "cicpX" - - frame_size_ms : int - processing frame size in milliseconds - - fs : int - input and processing sampling rate - - Returns - ------- - Numpy array - with processed samples - """ - if audio_out_path is not None: - _, output_ext = os.path.splitext(os.path.basename(audio_out_path)) - in_path = audio_out_path.replace(output_ext, "_in.wav") - else: - if not os.path.exists(_tmp_dir): - os.mkdir(_tmp_dir) - in_path = os.path.join(_tmp_dir, "prerenderer_in.wav") - - writefile(in_path, audio_in_array, fs) - - return render_from_file( - in_path, - input_config, - output_config, - audio_out_path=audio_out_path, - frame_size_ms=frame_size_ms, - fs=fs, - prerenderer_bin=prerenderer_bin, - ) - - -def render_from_metadata_object( - metadata_obj, output_config, audio_out_path: str = None, frame_size_ms=20, fs=48000 -): - """ - Renders a 3D audio scene from an object of class pyaudio3dtools.spatialmetadata - - Parameters - ---------- - metadata_obj : pyaudio3dtools.spatialmetadata - metadata containing an input audio path and 3D scene description - - output_config : str - for ambisonics of order N: "sbaN", - for multichannel cicp index X: "cicpX" - - frame_size_ms : int - processing frame size - - fs : int - input and processing sampling rate - - Returns - ------- - Numpy array - with processed samples - """ - - config_path = os.path.join(_tmp_dir, "prerenderer_metadata.txt") - metadata_obj.write_metadata(metadata_path=config_path, metadata_format="iis") - - return render_from_file( - metadata_obj.audio_wav[0], - config_path, - output_config, - audio_out_path=audio_out_path, - frame_size_ms=frame_size_ms, - fs=fs, - ) diff --git a/scripts/pyaudio3dtools/quaternions/functions.py b/scripts/pyaudio3dtools/quaternions/functions.py index a58cb79427..a36c6cd026 100644 --- a/scripts/pyaudio3dtools/quaternions/functions.py +++ b/scripts/pyaudio3dtools/quaternions/functions.py @@ -30,10 +30,11 @@ the United Nations Convention on Contracts on the International Sales of Goods. 
""" +from typing import Tuple import numpy as np -def Quat2Euler(quat: np.array, degrees: bool = True): +def Quat2Euler(quat: np.ndarray, degrees: bool = True): "Convert Quaternion to Euler angles" sinr = +2.0 * (quat[..., 0] * quat[..., 1] + quat[..., 2] * quat[..., 3]) @@ -55,7 +56,7 @@ def Quat2Euler(quat: np.array, degrees: bool = True): return ypr -def Euler2Quat(ypr: np.array, degrees: bool = True): +def Euler2Quat(ypr: np.ndarray, degrees: bool = True): "Convert Euler angles to Quaternion" if degrees: @@ -89,7 +90,7 @@ def Euler2Quat(ypr: np.array, degrees: bool = True): return quat -def Quat2RotMat(quat: np.array): +def Quat2RotMat(quat: np.ndarray): "Convert quaternion to rotation matrix" R = np.zeros([3, 3]) @@ -157,3 +158,26 @@ def Quat2RotMat(quat: np.array): R[2, 2] = c1 * c2 return R + + +def rotateAziEle( + azi: float, ele: float, R: np.ndarray, is_planar: bool = False +) -> Tuple[float, float]: + w = np.cos(np.deg2rad(ele)) + dv = np.array( + [ + w * np.cos(np.deg2rad(azi)), + w * np.sin(np.deg2rad(azi)), + np.sin(np.deg2rad(ele)), + ] + ) + + dv_rot = R @ dv + + azi = np.rad2deg(np.arctan2(dv_rot[1], dv_rot[0])) + if is_planar: + ele = 0 + else: + ele = np.rad2deg(np.arctan2(dv_rot[2], np.sqrt(np.sum(dv_rot[:2] ** 2)))) + + return azi, ele diff --git a/scripts/pyaudio3dtools/rotateHOA.py b/scripts/pyaudio3dtools/rotateHOA.py index e658027a04..cd11be2e59 100644 --- a/scripts/pyaudio3dtools/rotateHOA.py +++ b/scripts/pyaudio3dtools/rotateHOA.py @@ -32,6 +32,7 @@ import numpy as np +from pyaudio3dtools.constants import * from pyaudio3dtools.quaternions.functions import Quat2RotMat ######################################################################### @@ -43,7 +44,7 @@ from pyaudio3dtools.quaternions.functions import Quat2RotMat def SHrot_p( - i: int, l: int, a: int, b: int, SHrotmat: np.array, R_lm1: np.array + i: int, l: int, a: int, b: int, SHrotmat: np.ndarray, R_lm1: np.ndarray ) -> float: """Helper function to calculate the ps""" @@ -67,12 +68,12 @@ def SHrot_p( return p -def SHrot_u(l: int, m: int, n: int, SHrotmat: np.array, R_lm1: np.array) -> float: +def SHrot_u(l: int, m: int, n: int, SHrotmat: np.ndarray, R_lm1: np.ndarray) -> float: """Helper function to calculate the us""" return SHrot_p(0, l, m, n, SHrotmat, R_lm1) -def SHrot_v(l: int, m: int, n: int, SHrotmat: np.array, R_lm1: np.array) -> float: +def SHrot_v(l: int, m: int, n: int, SHrotmat: np.ndarray, R_lm1: np.ndarray) -> float: """Helper function to calculate the vs""" if m == 0: @@ -92,7 +93,7 @@ def SHrot_v(l: int, m: int, n: int, SHrotmat: np.array, R_lm1: np.array) -> floa return p0 * (1.0 - d) + p1 * np.sqrt(1.0 + d) -def SHrot_w(l: int, m: int, n: int, SHrotmat: np.array, R_lm1: np.array) -> float: +def SHrot_w(l: int, m: int, n: int, SHrotmat: np.ndarray, R_lm1: np.ndarray) -> float: """Helper function to calculate the w""" if m == 0: raise ValueError("ERROR should not be called\n") @@ -111,12 +112,12 @@ def SHrot_w(l: int, m: int, n: int, SHrotmat: np.array, R_lm1: np.array) -> floa # SHD rotation matrix calculation # translated from ivas_rotation.c ######################################## -def SHrotmatgen(R: np.array, order: int = 3) -> np.array: +def SHrotmatgen(R: np.ndarray, order: int = 3) -> np.ndarray: """Calculate SHD roatation matrix from that in real space Parameters: ---------- - R: np.array + R: np.ndarray real-space rotation matrix order: Optional[int] @@ -124,7 +125,7 @@ def SHrotmatgen(R: np.array, order: int = 3) -> np.array: Returns: ---------- - SHrotmat: np.array + SHrotmat: 
np.ndarray
         SHD rotation matrix
 
     """
 
@@ -196,20 +197,20 @@
     return SHrotmat
 
 
-def rotateHOA(x: np.array, trajectory: str) -> np.array:
+def rotateHOA(x: np.ndarray, trajectory: str) -> np.ndarray:
     """Rotate HOA signal by applying a rotation matrix calculated from the current quaternion
     in each subframe
 
     Parameters:
     ----------
-    x: np.array
+    x: np.ndarray
         input signal upto HOA3
     trajectory: str
         path to trajectory file
 
     Returns:
     ----------
-    y: np.array
+    y: np.ndarray
         rotated HOA signal
     """
 
@@ -218,16 +219,19 @@
 
     sig_len = x.shape[0]
     sig_dim = x.shape[1]
-    frame_len = 240
-    N_frames = np.int(sig_len / frame_len)
+    frame_len = (IVAS_FRAME_LEN_MS // 4) * (48000 // 1000)  # 5 ms subframes at 48 kHz
+    N_frames = int(sig_len / frame_len)
 
     if sig_dim not in [4, 9, 16]:
         raise ValueError("rotateHOA can only handle FOA, HOA2 or HOA3 signals!")
 
     y = np.zeros([sig_len, sig_dim])
 
-    R1 = np.eye(sig_dim)
-    R2 = np.eye(sig_dim)
+    a = np.linspace(0, 1.0, frame_len, endpoint=False)[:, np.newaxis]
+    b = 1.0 - a
+
+    R = np.eye(sig_dim)
+    R_old = np.eye(sig_dim)
 
     for i_frame in range(N_frames):
         i1 = i_frame * frame_len
@@ -235,19 +239,13 @@
         q1 = trj_data[i_frame % trj_frames, 1:]
 
         R_r = Quat2RotMat(q1)
-        R1[:, :] = SHrotmatgen(R_r, order=np.int(np.sqrt(sig_dim)) - 1)
+        R[:, :] = SHrotmatgen(R_r, order=int(np.sqrt(sig_dim)) - 1)
 
         frame_in = x[i1:i2, :]
         frame_out = y[i1:i2, :]
 
-        frame_out1 = np.matmul(frame_in, R1.T)
-        frame_out2 = np.matmul(frame_in, R2.T)
-
-        a = np.linspace(0, 1.0, 240)
-        b = 1.0 - a
-
-        frame_out[:, :] = frame_out2 * b[:, np.newaxis] + frame_out1 * a[:, np.newaxis]
+        frame_out[:, :] = (frame_in @ R_old.T) * b + (frame_in @ R.T) * a
 
-        R2[:, :] = R1[:, :]
+        R_old[:, :] = R.copy()
 
     return y
diff --git a/scripts/pyaudio3dtools/rotateISM.py b/scripts/pyaudio3dtools/rotateISM.py
index 37cbfd5eaf..4ed88f43ba 100644
--- a/scripts/pyaudio3dtools/rotateISM.py
+++ b/scripts/pyaudio3dtools/rotateISM.py
@@ -32,12 +32,12 @@
 
 import numpy as np
 
-from pyaudio3dtools.quaternions.functions import Quat2RotMat
+from pyaudio3dtools.quaternions.functions import Quat2RotMat, rotateAziEle
 
 
 def rotateISM(
-    azi: np.array,
-    ele: np.array,
+    azi: np.ndarray,
+    ele: np.ndarray,
     trajectory: str = None,
 ) -> tuple:
 
@@ -54,26 +54,10 @@
     azi_rot = np.zeros([N_frames])
     ele_rot = np.zeros([N_frames])
 
-    R_r = np.zeros([3, 3])
     for i_frame in range(N_frames):
-
-        azi1 = azi[i_frame]
-        ele1 = ele[i_frame]
-
-        r = np.array(
-            [-np.sin(azi1) * np.cos(ele1), np.cos(azi1) * np.cos(ele1), np.sin(ele1)]
+        q = trj_data[i_frame % trj_frames, 1:]
+        azi_rot[i_frame], ele_rot[i_frame] = rotateAziEle(
+            azi[i_frame], ele[i_frame], Quat2RotMat(q)
         )
-        q1 = trj_data[i_frame % trj_frames, 1:]
-        R_r[:, :] = Quat2RotMat(q1)
-
-        r = np.matmul(R_r, r)
-
-        r_xy = np.sqrt(r[0] * r[0] + r[1] * r[1])
-        azi_rot1 = np.arctan2(-r[0], r[1]) / np.pi * 180.0
-        ele_rot1 = np.arctan2(r[2], r_xy) / np.pi * 180.0
-
-        azi_rot[i_frame] = azi_rot1
-        ele_rot[i_frame] = ele_rot1
-
     return azi_rot, ele_rot
diff --git a/scripts/pyaudio3dtools/rotateMC.py b/scripts/pyaudio3dtools/rotateMC.py
new file mode 100644
index 0000000000..65b47b9b18
--- /dev/null
+++ b/scripts/pyaudio3dtools/rotateMC.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+"""
+   (C) 2022 Baseline Development Group with portions copyright Dolby International AB, Ericsson AB,
+   Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+   Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies OY, Orange,
+   Panasonic Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation. All Rights Reserved.
+
+   This software is protected by copyright law and by international treaties.
+   The Baseline Development Group consisting of Dolby International AB, Ericsson AB,
+   Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
+   Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies OY, Orange,
+   Panasonic Corporation, Qualcomm Technologies, Inc., and VoiceAge Corporation retain full ownership
+   rights in their respective contributions in the software. No license of any kind, including but not
+   limited to patent license, of any foregoing parties is hereby granted by implication, estoppel or
+   otherwise.
+
+   This software is provided "AS IS", without any express or implied warranties. The software is in the
+   development stage. It is intended exclusively for experts who have experience with such software and
+   solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
+   and/or fitness for a particular purpose are hereby disclaimed and excluded.
+
+   Any dispute, controversy or claim arising under or in relation to providing this software shall be
+   submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
+   accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
+   the United Nations Convention on Contracts on the International Sales of Goods.
+"""
+
+import numpy as np
+
+from pyaudio3dtools import EFAP, spatialaudioformat
+from pyaudio3dtools.constants import *
+from pyaudio3dtools.quaternions.functions import Quat2RotMat, rotateAziEle
+
+
+def rotateMC(
+    x: np.ndarray, trajectory: str, layout: spatialaudioformat.Format
+) -> np.ndarray:
+    """Rotate MC signal by applying a rotation matrix calculated from the current quaternion
+    in each subframe
+
+    Parameters:
+    ----------
+    x: np.ndarray
+        input multichannel signal
+    trajectory: str
+        path to trajectory file
+    layout: spatialaudioformat.Format
+        loudspeaker layout of the input signal
+
+    Returns:
+    ----------
+    y: np.ndarray
+        rotated multichannel signal
+    """
+
+    # TODO needs optimization, currently slow
+    trj_data = np.genfromtxt(trajectory, delimiter=",")
+    trj_frames = trj_data.shape[0]
+
+    sig_len = x.shape[0]
+    sig_dim = x.shape[1]
+    frame_len = (IVAS_FRAME_LEN_MS // 4) * (48000 // 1000)  # 5 ms subframes at 48 kHz
+    N_frames = int(sig_len / frame_len)
+
+    y = np.zeros([sig_len, sig_dim])
+
+    panner = EFAP.EFAP(layout.ls_azi, layout.ls_ele)
+
+    a = np.linspace(0, 1.0, frame_len, endpoint=False)[:, np.newaxis]
+    b = 1.0 - a
+
+    R = np.eye(layout.nchannels)
+    R_old = np.eye(layout.nchannels)
+
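+    # per subframe: rotate every loudspeaker position, re-pan it with EFAP,
+    # and crossfade between the previous and current panning matrices to
+    # avoid discontinuities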
+    for i_frame in range(N_frames):
+
+        start = i_frame * frame_len
+        end = (i_frame + 1) * frame_len
+
+        q = trj_data[i_frame % trj_frames, 1:]
+
+        rotated_pos = np.array(
+            [
+                rotateAziEle(azi_ls, ele_ls, Quat2RotMat(q))
+                for azi_ls, ele_ls in zip(layout.ls_azi, layout.ls_ele)
+            ]
+        )
+        R = panner.pan(rotated_pos[:, 0], rotated_pos[:, 1])
+        R[:, layout.lfe_index] = np.zeros([layout.nchannels, 1])
+        R[layout.lfe_index, layout.lfe_index] = 1
+
+        frame_in = x[start:end, :]
+        frame_out = y[start:end, :]
+
+        frame_out[:, :] = (frame_in @ R_old) * b + (frame_in @ R) * a
+
+        R_old = R.copy()
+
+    return y
diff --git a/scripts/pyaudio3dtools/spatialaudioconvert.py b/scripts/pyaudio3dtools/spatialaudioconvert.py
index 1e78def724..3d71dc0e6f 100644
--- a/scripts/pyaudio3dtools/spatialaudioconvert.py
+++ b/scripts/pyaudio3dtools/spatialaudioconvert.py
@@ -32,17 +32,20 @@
 
 import logging
 import os
-from typing import Optional
+from typing import Optional, Tuple
+
+import numpy as np
 
 from pyaudio3dtools import (
+    EFAP,
     audioarray,
     audiofile,
     binauralrenderer,
     hoadecoder,
-    prerenderer,
     spatialaudioformat,
     spatialmetadata,
 )
+from pyaudio3dtools.constants import *
 
 main_logger = logging.getLogger("__main__")
 logger = main_logger.getChild(__name__)
@@ -55,6 +58,7 @@ def spatial_audio_convert(
     in_format: Optional[str] = None,
     in_fs: Optional[int] = None,
     in_nchans: Optional[int] = None,
+    in_meta_files: Optional[list] = None,
     in_ls_layout_file: Optional[str] = None,
     out_format: Optional[str] = None,
     out_fs: Optional[int] = None,
@@ -62,13 +66,12 @@
     output_loudness: Optional[int] = None,
     loudness_tool: Optional[str] = None,
     limit_output: Optional[bool] = False,
-    prerenderer_bin: Optional[str] = None,
     cut_preamble_s: Optional[int] = None,
     trajectory: Optional[str] = None,
     bin_rend_include_LFE: Optional[bool] = False,
-    bin_rend_LFE_gain: Optional[float] = 1.0,
+    bin_rend_LFE_gain: Optional[float] = 10 ** (5.5 / 20),
     binaural_dataset: Optional[str] = "orange53",
-) -> tuple:
+) -> Tuple[np.ndarray, int]:
     """
     Spatial audio conversion between various formats
 
@@ -105,8 +108,6 @@
     limit_output: Optional[bool]
         flag whether to apply limiting to the output
 
-    prerenderer_bin: Optional[str]
-        path to prerenderer binary
 
     cut_preamble_s: Optional[int]
         preamble to cut in seconds
@@ -121,9 +122,9 @@
 
     Returns
    -------
-    out_format_config.name:
-        output spatial audio format name
+    out_sig: np.ndarray
+        output audio signal array
-    out_fs:
+    out_fs: int
         output sampling frequency
     """
     # Input is either waveform file (.pcm or .wav) or iis metadata (.txt)
@@ -138,13 +139,13 @@
             raise Exception("Input and output fs not defined.")
         if in_nchans is None:
             if in_format is not None:
-                in_format_config = spatialaudioformat.Format(
+                in_spfmt = spatialaudioformat.Format(
                     in_format=in_format, ls_layout_file=in_ls_layout_file
                 )
-                in_nchans = in_format_config.nchannels
+                in_nchans = in_spfmt.nchannels
             elif out_format is not None:
-                out_format_config = spatialaudioformat.Format(in_format=out_format)
-                in_nchans = out_format_config.nchannels
+                out_spfmt = spatialaudioformat.Format(in_format=out_format)
+                in_nchans = out_spfmt.nchannels
            else:
                 raise Exception(
                     "Number of input channels not defined and can't be deduced."
@@ -153,17 +154,19 @@ def spatial_audio_convert( elif input_ext == ".wav": in_sig, in_fs = audiofile.readfile(in_file) if in_format is not None: - in_format_config = spatialaudioformat.Format( + in_spfmt = spatialaudioformat.Format( in_format=in_format, ls_layout_file=in_ls_layout_file ) # Adjust number of channels if case of HOA, zeroed vert channels if planar - if in_format_config.ambi_order > 0: - in_sig = audioarray.convert( - in_sig, out_nchans=in_format_config.nchannels - ) + if in_spfmt.ambi_order > 0: + in_sig = audioarray.convert(in_sig, out_nchans=in_spfmt.nchannels) elif input_ext == ".txt": metadata_obj = spatialmetadata.Metadata(in_file, audio_fs=in_fs) in_sig, in_fs = metadata_obj.get_audio_array() + if in_format != "META": + logger.info( + f" {in_format} specified with .txt input file: overriding to META format" + ) in_format = "META" else: raise Exception(f"Not supported file {input_ext}") @@ -176,56 +179,57 @@ def spatial_audio_convert( else: logger.info(f" Input spatial audio format: {in_format}") - """ convert metadata to output format """ - if in_format == "META": + """ convert metadata based formats directly to output format """ + if in_format.startswith("META") or in_format.startswith("ISM"): if out_format is None: - raise Exception("out format must be specified for META (.txt) input") - - run_prerenderer = True - - # for other LS conversions -> pre-renderer - _, output_ext = os.path.splitext(os.path.basename(out_file)) - prerenderer_out_path = out_file.replace(output_ext, "_prerender_meta.wav") - out_format_config = spatialaudioformat.Format(in_format=out_format) - # in case of Binaural rendering output format force to HOA3 as intermediate format - if out_format_config.name == "BINAURAL" or ( - out_format_config.name == "BINAURAL_ROOM" and trajectory is not None - ): - in_format_config = spatialaudioformat.Format("HOA3") - if in_ls_layout_file() is not None: - raise NotImplementedError( - "Custom layout files only supported for BINURAL[_ROOM]_REF output" + raise Exception("out format must be specified for META (.txt) or ISM input") + + if in_format.startswith("ISM"): + if in_meta_files is None: + raise ValueError( + f"Please specify a list of metadata files for {in_format}" ) - elif out_format_config.name == "BINAURAL_ROOM": - # in case of Binaural rendering output format force to 7_1_4 as intermediate format - in_format_config = spatialaudioformat.Format("7_1_4") - if in_ls_layout_file() is not None: - raise NotImplementedError( - "Custom layout files only supported for BINURAL[_ROOM]_REF output" + if len(in_meta_files) != int(in_format[-1]): + raise ValueError( + f"Mismatch between number of streams and number of specified metadata files for {in_format}" ) - elif out_format_config.name == "BINAURAL_REF": - run_prerenderer = False - raise NotImplementedError("BINAURAL_REF not implemented for META input") - elif out_format_config.name == "BINAURAL_ROOM_REF": - run_prerenderer = False - raise NotImplementedError( - "BINAURAL_ROOM_REF not implemented for META input" - ) - else: - in_format_config = out_format_config - if run_prerenderer: - logger.info(f" {in_format} -> {in_format_config.name}") - in_format = in_format_config.name - in_sig = prerenderer.render_from_array( - in_sig, - in_file, - in_format_config.altname, - audio_out_path=prerenderer_out_path, - frame_size_ms=20, - fs=in_fs, - prerenderer_bin=prerenderer_bin, - ) + # initialise metadata object for ISM + metadata_obj = spatialmetadata.Metadata() + metadata_obj.init_for_ism(in_file, in_fs, in_meta_files) + 
+            # TODO alternative paths for binaural rendering for now
+            if out_format.startswith("BINAURAL_ROOM"):
+                in_format = "7_1_4"
+            elif out_format.startswith("BINAURAL"):
+                in_format = "HOA3"
+            else:
+                in_format = out_format
+
+            in_spfmt = spatialaudioformat.Format(in_format)
+            out_spfmt = spatialaudioformat.Format(out_format)
+
+        else:
+            # set input format to output format
+            # render_meta() handles all conversions
+            out_spfmt = spatialaudioformat.Format(out_format)
+
+            in_format = out_format
+            in_spfmt = out_spfmt
+
+        in_sig = render_meta(
+            metadata_obj,
+            in_spfmt,
+            dataset=binaural_dataset,
+            fs=in_fs,
+            trajectory=trajectory,
+            in_ls_layout_file=in_ls_layout_file,
+            include_LFE=bin_rend_include_LFE,
+            LFE_gain=bin_rend_LFE_gain,
+        )
+    elif in_format.startswith("MASA"):
+        raise NotImplementedError("Rendering of MASA is not yet supported!")
 
     """ cut preamble """
     if cut_preamble_s is not None:
@@ -235,126 +239,61 @@
         in_sig = audioarray.cut(in_sig, (samples_to_cut, -1))
 
     """ get spatial input and audio format configurations """
-    in_format_config = spatialaudioformat.Format(
+    in_spfmt = spatialaudioformat.Format(
         in_format=in_format, ls_layout_file=in_ls_layout_file
     )
     if out_format is None:
         out_format = in_format
-    out_format_config = spatialaudioformat.Format(in_format=out_format)
+    out_spfmt = spatialaudioformat.Format(in_format=out_format)
 
     """ zero non-planar input ambisonics channels """
-    if in_format_config.ambi_order > 0 and in_format_config.isplanar:
+    if in_spfmt.ambi_order > 0 and in_spfmt.isplanar:
         in_sig = spatialaudioformat.Format.zero_vert_hoa_channels(in_sig)
 
     """ Spatial audio format conversion """
     out_sig = in_sig
-    if out_format != in_format:
-        logger.info(f"  {in_format_config.name} -> {out_format_config.name}")
+    if (out_format != in_format) and not (
+        in_spfmt.isheadphones and out_spfmt.isheadphones
+    ):
+        logger.info(f"  {in_spfmt.name} -> {out_spfmt.name}")
 
-        # HOA -> LS
-        if in_format_config.ambi_order > 0 and out_format_config.isloudspeaker:
-            mtx_hoa_dec = hoadecoder.get_hoa_mtx(
-                in_format_config.ambi_order, out_format_config.name
-            )
-            out_sig = hoadecoder.hoa_linear_decoding(in_sig, mtx_hoa_dec)
-        # HOA/MC -> BINAURAL
-        elif (
-            (in_format_config.ambi_order > 0) or (in_format_config.isloudspeaker)
-        ) and (out_format_config.name == "BINAURAL"):
+        # binaural output
+        if out_spfmt.name.startswith("BINAURAL"):
             out_sig = binauralrenderer.binaural_rendering(
                 in_sig,
-                in_format_config.name,
+                in_spfmt,
+                out_spfmt,
                 dataset=binaural_dataset,
                 fs=in_fs,
                 trajectory=trajectory,
+                in_ls_layout_file=in_ls_layout_file,
                 include_LFE=bin_rend_include_LFE,
                 LFE_gain=bin_rend_LFE_gain,
             )
-        # HOA/MC -> BINAURAL_ROOM
-        elif (
-            (in_format_config.ambi_order > 0) or (in_format_config.isloudspeaker)
-        ) and (out_format_config.name == "BINAURAL_ROOM"):
-            out_sig = binauralrenderer.binaural_rendering(
-                in_sig,
-                in_format_config.name,
-                dataset="mozart_iis",
-                fs=in_fs,
-                trajectory=trajectory,
-                include_LFE=bin_rend_include_LFE,
-                LFE_gain=bin_rend_LFE_gain,
-            )
-        # HOA -> HOA
-        elif (in_format_config.ambi_order > 0) and (out_format_config.ambi_order > 0):
-            out_sig = audioarray.convert(
-                in_sig, in_fs=in_fs, out_nchans=out_format_config.nchannels
-            )
-        # for other LS conversions -> pre-renderer
-        elif in_format_config.isloudspeaker and (
-            out_format_config.altname.startswith("sba")
-            or out_format_config.altname.startswith("cicp")
-        ):
-            _, output_ext = os.path.splitext(os.path.basename(out_file))
-            prerenderer_out_path = out_file.replace(output_ext, "_prerender_mc.wav")
-
-            out_sig = prerenderer.render_from_array(
-                in_sig,
-                "mc",
-                out_format_config.altname,
-                audio_out_path=prerenderer_out_path,
-                frame_size_ms=20,
-                fs=in_fs,
-                prerenderer_bin=prerenderer_bin,
-            )
-        # BINAURAL passthrough, nothing to be done
-        elif (in_format_config.isheadphones is True) and (
-            out_format_config.isheadphones is True
-        ):
-            out_sig = in_sig
-        # BINAURAL_REF rendering
-        elif out_format_config.name == "BINAURAL_REF":
-            if (
-                in_format_config.isloudspeaker and in_format_config.nchannels > 2
-            ) or in_format_config.name == "CUSTOM_LS":
-                out_sig = binauralrenderer.binaural_rendering(
-                    in_sig,
-                    in_format_config.name,
-                    dataset="orange53_full",
-                    fs=in_fs,
-                    trajectory=trajectory,
-                    in_ls_layout_file=in_ls_layout_file,
-                    include_LFE=bin_rend_include_LFE,
-                    LFE_gain=bin_rend_LFE_gain,
-                )
-            else:
-                raise NotImplementedError(
-                    f"{in_format_config.name} -> {out_format_config.name}: format conversion not implemented"
-                )
-        # BINAURAL_ROOM_REF rendering
-        elif out_format_config.name == "BINAURAL_ROOM_REF":
-            if (
-                in_format_config.isloudspeaker and in_format_config.nchannels > 2
-            ) or in_format_config.name == "CUSTOM_LS":
-                out_sig = binauralrenderer.binaural_rendering(
-                    in_sig,
-                    in_format_config.name,
-                    dataset="mozart_iis_full",
-                    fs=in_fs,
-                    trajectory=trajectory,
-                    in_ls_layout_file=in_ls_layout_file,
-                    include_LFE=bin_rend_include_LFE,
-                    LFE_gain=bin_rend_LFE_gain,
-                )
-            else:
-                raise NotImplementedError(
-                    f"{in_format_config.name} -> {out_format_config.name}: format conversion not implemented"
-                )
+        # non-binaural outputs
+        # HOA conversion
+        elif in_spfmt.ambi_order > 0:
+            out_sig = convert_sba(in_sig, in_spfmt, out_spfmt)
+
+        # MC conversion
+        elif in_spfmt.isloudspeaker:
+            out_sig = convert_mc(in_sig, in_spfmt, out_spfmt)
+
+        # ISM conversion
+        # NOTE: raw ISM input is already rendered via render_meta() above and
+        # convert_ism() would also need the object positions, so this branch
+        # should be unreachable
+        elif in_spfmt.name.startswith("ISM"):
+            raise AssertionError("Shouldn't execute!")  # TODO remove
+
+        # MASA conversion
+        elif in_spfmt.name.startswith("MASA"):
+            raise AssertionError("Shouldn't execute!")  # TODO remove
+            out_sig = convert_masa(in_sig, in_spfmt, out_spfmt)
         else:
             raise NotImplementedError(
-                f"{in_format_config.name} -> {out_format_config.name}: format conversion not implemented"
+                f"{in_spfmt.name} -> {out_spfmt.name}: format conversion not implemented"
             )
 
     """ zero non-planar output ambisonics channels """
-    if out_format_config.ambi_order > 0 and out_format_config.isplanar:
+    if out_spfmt.ambi_order > 0 and out_spfmt.isplanar:
         out_sig = spatialaudioformat.Format.zero_vert_hoa_channels(out_sig)
 
     """ resampling """
@@ -386,4 +325,215 @@ def spatial_audio_convert(
 
         audiofile.writefile(out_file, out_sig, out_fs)
 
-    return out_format_config.name, out_fs
+    return out_sig, out_fs
+
+
+def convert_sba(
+    in_sig: np.ndarray,
+    in_spfmt: spatialaudioformat.Format,
+    out_spfmt: spatialaudioformat.Format,
+) -> np.ndarray:
+    """Convert an ambisonics signal to the requested output format"""
+    # HOA -> LS
+    if out_spfmt.isloudspeaker:
+        HOA2LS = hoadecoder.get_hoa_mtx(in_spfmt.ambi_order, out_spfmt)
+        return hoadecoder.hoa_linear_decoding(in_sig, HOA2LS)
+    # HOA -> HOA
+    elif out_spfmt.ambi_order > 0:
+        # pure channel-count change; no resampling involved
+        return audioarray.convert(in_sig, in_fs=None, out_nchans=out_spfmt.nchannels)
+    else:
+        raise NotImplementedError(
+            f"{in_spfmt.name} -> {out_spfmt.name}: format conversion not implemented"
+        )
+
+
+def convert_mc(
+    in_sig: np.ndarray,
+    in_spfmt: spatialaudioformat.Format,
+    out_spfmt: spatialaudioformat.Format,
+) -> np.ndarray:
+    """Convert a multichannel signal to the requested output format"""
+    # MC -> LS
+    if out_spfmt.isloudspeaker:
+        try:
+            MC2LS = IVAS_MC_CONVERSION[in_spfmt.name][out_spfmt.name]
+        except KeyError:
+            panner = EFAP.EFAP(in_spfmt.ls_azi, in_spfmt.ls_ele)
+            MC2LS = np.vstack(
+                [panner.pan(a, e) for a, e in zip(out_spfmt.ls_azi, out_spfmt.ls_ele)]
+            ).T
+
+            # pass-through for LFE
+            MC2LS[in_spfmt.lfe_index, :] = 0
+            MC2LS[in_spfmt.lfe_index, out_spfmt.lfe_index] = 1
+
+        return in_sig @ MC2LS
+    # MC -> HOA
+    elif out_spfmt.ambi_order > 0:
+        # SH response for loudspeaker positions
+        MC2HOA = np.hstack(
+            [
+                hoadecoder.getRSH([a], [e], out_spfmt.ambi_order)
+                for a, e in zip(in_spfmt.ls_azi, in_spfmt.ls_ele)
+            ]
+        ).T
+
+        # do not add LFE to output
+        MC2HOA[in_spfmt.lfe_index] = 0
+
+        return in_sig @ MC2HOA
+    else:
+        raise NotImplementedError(
+            f"{in_spfmt.name} -> {out_spfmt.name}: format conversion not implemented"
+        )
+
+
+def convert_ism(
+    in_sig: np.ndarray,
+    in_fs: int,
+    in_pos: list,
+    in_spfmt: spatialaudioformat.Format,
+    out_spfmt: spatialaudioformat.Format,
+) -> np.ndarray:
+    """Convert an ISM signal to the requested output format"""
+    pos_data = []
+    for pos in in_pos:
+        pos_data.extend(
+            [pos["azimuth"], pos["elevation"]] for _ in range(pos["use_for_frames"])
+        )
+    pos_data = np.array(pos_data)
+    pos_frames = pos_data.shape[0]
+
+    sig_len = in_sig.shape[0]
+    frame_len = IVAS_FRAME_LEN_MS * (in_fs // 1000)
+
+    out_sig = np.zeros([sig_len, out_spfmt.nchannels])
+
+    fade_in = np.linspace(0, 1.0, frame_len, endpoint=False)[:, np.newaxis]
+    fade_out = 1.0 - fade_in
+
+    if out_spfmt.isloudspeaker:
+        ls_azi_woLFE = np.delete(out_spfmt.ls_azi, out_spfmt.lfe_index)
+        ls_ele_woLFE = np.delete(out_spfmt.ls_ele, out_spfmt.lfe_index)
+        panner = EFAP.EFAP(ls_azi_woLFE, ls_ele_woLFE)
+
+    gains_old = None
+
+    for i_frame, (in_frame, out_frame) in enumerate(
+        zip(
+            audioarray.get_framewise(in_sig, frame_len),
+            audioarray.get_framewise(out_sig, frame_len),
+        )
+    ):
+        pos = EFAP.wrap_angles(*pos_data[i_frame % pos_frames, :], clip_ele=True)
+
+        # ISM -> MC
+        if out_spfmt.isloudspeaker:
+            gains = panner.pan(pos[0], pos[1])
+            gains = np.insert(gains, out_spfmt.lfe_index, 0)
+            gains = gains[:, np.newaxis]
+        # ISM -> HOA
+        elif out_spfmt.ambi_order > 0:
+            gains = hoadecoder.getRSH([pos[0]], [pos[1]], out_spfmt.ambi_order)
+        else:
+            raise NotImplementedError(
+                f"{in_spfmt.name} -> {out_spfmt.name}: format conversion not implemented"
+            )
+
+        if gains_old is None:
+            gains_old = gains.copy()
+
+        # crossfade between the previous and current frame gains
+        out_frame[:] = (in_frame @ gains.T) * fade_in + (
+            in_frame @ gains_old.T
+        ) * fade_out
+
+        gains_old = gains.copy()
+
+    return out_sig
+
+
+def convert_masa(
+    in_sig: np.ndarray,
+    in_spfmt: spatialaudioformat.Format,
+    out_spfmt: spatialaudioformat.Format,
+) -> np.ndarray:
+    """Convert a MASA signal to the requested output format"""
+    # MASA -> LS
+    if out_spfmt.isloudspeaker:
+        # TODO
+        raise NotImplementedError(
+            f"{in_spfmt.name} -> {out_spfmt.name}: format conversion not implemented"
+        )
+    # MASA -> HOA
+    elif out_spfmt.ambi_order > 0:
+        # TODO
+        raise NotImplementedError(
+            f"{in_spfmt.name} -> {out_spfmt.name}: format conversion not implemented"
+        )
+    else:
+        raise NotImplementedError(
+            f"{in_spfmt.name} -> {out_spfmt.name}: format conversion not implemented"
+        )
+
+
+def render_meta(
+    metadata_obj: spatialmetadata.Metadata,
+    dest_fmt: spatialaudioformat.Format,
+    dataset: str = "orange53",
+    fs: int = None,
+    trajectory: str = None,
+    in_ls_layout_file: str = None,
+    include_LFE: bool = False,
+    LFE_gain: float = 10 ** (5.5 / 20),
+) -> np.ndarray:
+    """Render mixed scene metadata to the desired format"""
+
+    # the defaults keep the two-argument calls in evs.py / ivas.py valid;
+    # fall back to the sampling rate of the metadata audio
+    if fs is None:
+        fs = metadata_obj.audio_fs
+
+    logger.info(f"  META -> {dest_fmt.name}")
+
+    out_sig = np.zeros([metadata_obj.audio_array.shape[0], dest_fmt.nchannels])
+
+    for obj in metadata_obj.objects:
+        # extract object signal
+        start = obj["track_index"]
+        stop = start + obj["nb_tracks"]
+        obj_sig = metadata_obj.audio_array[:, start:stop]
+
+        if dest_fmt.name.startswith("BINAURAL"):
+            if obj["input_type"] == "ism":
+                src_format = spatialaudioformat.Format("ISM")
+                positions = obj["positions"]
+            elif obj["input_type"] == "sba":
+                src_format = spatialaudioformat.Format(f"SBA{obj['order']}")
+                positions = None
+            elif obj["input_type"] == "mc":
+                src_format = spatialaudioformat.Format(f"CICP{obj['cicp_index']}")
+                positions = None
+
+            out_sig += binauralrenderer.binaural_rendering(
+                obj_sig,
+                src_format,
+                dest_fmt,
+                dataset=dataset,
+                fs=fs,
+                trajectory=trajectory,
+                include_LFE=include_LFE,
+                LFE_gain=LFE_gain,
+                in_ls_layout_file=in_ls_layout_file,
+                in_pos=positions,
+            )
+        else:
+            if obj["input_type"] == "ism":
+                src_format = spatialaudioformat.Format("ISM")
+                out_sig += convert_ism(
+                    obj_sig, fs, obj["positions"], src_format, dest_fmt
+                )
+            elif obj["input_type"] == "sba":
+                src_format = spatialaudioformat.Format(f"SBA{obj['order']}")
+                out_sig += convert_sba(obj_sig, src_format, dest_fmt)
+            elif obj["input_type"] == "mc":
+                src_format = spatialaudioformat.Format(f"CICP{obj['cicp_index']}")
+                out_sig += convert_mc(obj_sig, src_format, dest_fmt)
+
+    return out_sig
diff --git a/scripts/pyaudio3dtools/spatialaudioformat.py b/scripts/pyaudio3dtools/spatialaudioformat.py
index 9bd832f794..83eb4b6802 100644
--- a/scripts/pyaudio3dtools/spatialaudioformat.py
+++ b/scripts/pyaudio3dtools/spatialaudioformat.py
@@ -40,6 +40,8 @@ _format_configs = {
         "nchannels": 1,
         "isloudspeaker": True,
         "isheadphones": False,
+        "ls_azi": [0],
+        "ls_ele": [0],
         "lfe_index": [],
         "altname": "HOA0",
     },
@@ -50,6 +52,8 @@ _format_configs = {
         "nchannels": 2,
         "isloudspeaker": True,
         "isheadphones": False,
+        "ls_azi": [30, -30],
+        "ls_ele": [0, 0],
         "lfe_index": [],
         "altname": "cicp2",
     },
@@ -162,7 +166,23 @@ _format_configs = {
         "nchannels": 15,
         "isloudspeaker": True,
         "isheadphones": False,
-        "ls_azi": [30, -30, 0, 135, -135, 110, -110, 90, -90, 30, -30, 110, -110, 135, -135],
+        "ls_azi": [
+            30,
+            -30,
+            0,
+            135,
+            -135,
+            110,
+            -110,
+            90,
+            -90,
+            30,
+            -30,
+            110,
+            -110,
+            135,
+            -135,
+        ],
         "ls_ele": [0, 0, 0, 0, 0, 0, 0, 0, 0, 35, 35, 35, 35, 35, 35],
         "lfe_index": None,
         "altname": "combined",
@@ -365,12 +385,22 @@ class Format:
 
         if self.name == "CUSTOM_LS" and ls_layout_file is not None:
             with open(ls_layout_file, "r") as f_ls:
-                self.ls_azi = [float(x) for x in f_ls.readline().split(",").strip()]
-                self.ls_ele = [float(x) for x in f_ls.readline().split(",").strip()]
-                try:
-                    self.lfe_index = [int(x) for x in f_ls.readline().split(",").strip()]
-                except:
-                    self.lfe_index = []
+                self.ls_azi = [
+                    float(x.strip()) for x in f_ls.readline().strip().split(",")
+                ]
+                self.ls_ele = [
+                    float(x.strip()) for x in f_ls.readline().strip().split(",")
+                ]
+                try:
+                    self.lfe_index = [
+                        int(x.strip()) for x in f_ls.readline().strip().split(",")
+                    ]
+                except ValueError:
+                    self.lfe_index = []
+
+            if self.lfe_index:
+                # insert dummy directions at the LFE positions to keep the
+                # layout lists aligned with channel indices
+                for i in self.lfe_index:
+                    self.ls_azi.insert(i, 0.0)
+                    self.ls_ele.insert(i, 0.0)
 
     def get_nchannels(self):
         return self.nchannels
diff --git a/scripts/pyaudio3dtools/spatialmetadata.py
b/scripts/pyaudio3dtools/spatialmetadata.py index a4d378eded..829bd298bc 100644 --- a/scripts/pyaudio3dtools/spatialmetadata.py +++ b/scripts/pyaudio3dtools/spatialmetadata.py @@ -37,7 +37,7 @@ from typing import Optional, TextIO import numpy as np -from pyaudio3dtools import audioarray, audiofile +from pyaudio3dtools import audioarray, audiofile, spatialaudioformat main_logger = logging.getLogger("__main__") logger = main_logger.getChild(__name__) @@ -53,7 +53,7 @@ class Metadata: audio_fs: Optional[int] = 48000, ): """ - initialization + Spatial Metadata Parameters ---------- @@ -86,7 +86,7 @@ class Metadata: self.nb_objects = 0 # Number of objects self.nb_tracks = 0 # Number of tracks self.audio_wav = [] # list of wav files - self.audio_array = np.zeros((1, 0)) + self.audio_array = np.zeros([1, 0]) self.nb_frames = 0 # Number of frames def read_metadata( @@ -165,7 +165,7 @@ class Metadata: elif metadata_format == "ivas_ism": outfilename, output_ext = os.path.splitext(os.path.basename(metadata_path)) - x = np.zeros((1, 0)) + x = np.zeros([1, 0]) for object_index in range(self.nb_objects): if self.objects[object_index]["input_type"] == "ism": @@ -240,6 +240,21 @@ class Metadata: self.nb_tracks = self.nb_tracks + x.shape[1] self.nb_frames = math.ceil(50.0 * self.audio_array.shape[0] / self.audio_fs) + # init with list of ISM metadata files + def init_for_ism( + self, + in_file: str, + in_fs: int, + metadata_files: list, + ) -> None: + self.audio_wav.append(in_file) + + for csv in metadata_files: + self.objects.append(read_ism_ivas_data(csv, object_index=self.nb_objects)) + self.objects[-1]["track_index"] = self.nb_objects + self._append_audio_array(self.audio_wav[-1], fs=in_fs) + self.nb_objects += 1 + # Get audio array with sampling rate def get_audio_array(self): return self.audio_array, self.audio_fs @@ -273,7 +288,7 @@ def read_ism_input(file_handle: TextIO, dirname: str) -> dict: Returns ------- dict - ISM dictionnary with positions + ISM dictionary with positions """ ism = {"input_type": "ism"} ism["track_index"] = int(file_handle.readline()) - 1 @@ -364,7 +379,7 @@ def write_ism_input( def read_sba_input(file_handle: TextIO) -> dict: sba = {"input_type": "sba"} - sba["track_index"] = int(file_handle.readline()) + sba["track_index"] = int(file_handle.readline()) - 1 sba["order"] = int(file_handle.readline()) sba["nb_tracks"] = (sba["order"] + 1) ** 2 return sba @@ -373,23 +388,25 @@ def read_sba_input(file_handle: TextIO) -> dict: def write_sba_input(file_handle: TextIO, sba_dict: dict) -> None: file_handle.write("SBA\n") track_index = sba_dict["track_index"] - file_handle.write(f"{str(track_index)}\n") + file_handle.write(f"{str(track_index + 1)}\n") order = sba_dict["order"] file_handle.write(f"{str(order)}\n") def read_mc_input(file_handle: TextIO) -> dict: mc = {"input_type": "mc"} - mc["track_index"] = int(file_handle.readline()) - mc["cicp_index"] = int(file_handle.readline()) - mc["nb_tracks"] = 12 + mc["track_index"] = int(file_handle.readline()) - 1 + mc["cicp_index"] = int( + file_handle.readline() + ) # TODO try to support custom LS files? 
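+    # channel count now follows the CICP layout table instead of the previous
+    # hard-coded value of 12 (e.g. CICP6 -> 6 channels, CICP19 -> 12 channels)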
+    mc["nb_tracks"] = spatialaudioformat.Format(f"CICP{mc['cicp_index']}").nchannels
     return mc
 
 
 def write_mc_input(file_handle: TextIO, mc_dict: dict) -> None:
     file_handle.write("MC\n")
     track_index = mc_dict["track_index"]
-    file_handle.write(f"{str(track_index)}\n")
-    order = mc_dict["order"]
-    file_handle.write(f"{str(order)}\n")
+    file_handle.write(f"{str(track_index + 1)}\n")
+    # read_mc_input() stores the CICP index, not an ambisonics order
+    cicp_index = mc_dict["cicp_index"]
+    file_handle.write(f"{str(cicp_index)}\n")
 
 
 @@ -405,15 +422,25 @@ def read_ism_ivas_data(metadata_path: str, object_index: int = 0) -> None:
     ism["positions"] = []
     pos_idx = 0
 
-    with open(metadata_path) as file_handle:
-        for line in file_handle:
-            current_values = line.strip().split(",")
-            pos = {}
-            pos["use_for_frames"] = 1
-            pos["azimuth"] = float(current_values[1])
-            pos["elevation"] = float(current_values[2])
-            ism["positions"].append(pos)
-            pos_idx += 1
+
+    try:
+        with open(metadata_path) as file_handle:
+            for line in file_handle:
+                current_values = line.strip().split(",")
+                pos = {}
+                pos["use_for_frames"] = 1
+                pos["azimuth"] = float(current_values[1])
+                pos["elevation"] = float(current_values[2])
+                ism["positions"].append(pos)
+                pos_idx += 1
+    except FileNotFoundError:
+        # TODO in case of NULL metadata we can also spread the objects spatially
+        pos = {}
+        pos["use_for_frames"] = 1
+        pos["azimuth"] = 0.0
+        pos["elevation"] = 0.0
+        ism["positions"].append(pos)
+        pos_idx += 1
 
     ism["num_positions"] = pos_idx
     return ism
diff --git a/scripts/pyprocessing/evs.py b/scripts/pyprocessing/evs.py
index f04e81e7b8..3c7f8e1918 100644
--- a/scripts/pyprocessing/evs.py
+++ b/scripts/pyprocessing/evs.py
@@ -33,7 +33,8 @@ import logging
 
 import os
 
-from pyaudio3dtools import audiofile, prerenderer, spatialaudioformat, spatialmetadata
+from pyaudio3dtools import audiofile, spatialaudioformat, spatialmetadata
+from pyaudio3dtools.spatialaudioconvert import render_meta
 from pyprocessing import utils
 from pyprocessing.processing import Processing
 
@@ -103,15 +104,7 @@ class EVS(Processing):
             )
         else:
             in_sig, fs = metadata_obj.get_audio_array()
-            # pre-render to desired input format
-            in_sig = prerenderer.render_from_array(
-                in_sig,
-                input_path,
-                self.in_format_config.altname,
-                audio_out_path=input_path.replace(".txt", "_prerenderer.wav"),
-                frame_size_ms=20,
-                fs=metadata_obj.audio_fs,
-            )
+            # mixed scene is now rendered in Python (pre-renderer binary removed)
+            in_sig = render_meta(metadata_obj, self.in_spfmt)
             audiofile.writefile(input_multi_channels, in_sig, self.in_fs)
         elif input_ext == ".wav" or input_ext == ".pcm":
             input_multi_channels = input_path
diff --git a/scripts/pyprocessing/ivas.py b/scripts/pyprocessing/ivas.py
index 4ee07f3cef..3533e7a0bc 100644
--- a/scripts/pyprocessing/ivas.py
+++ b/scripts/pyprocessing/ivas.py
@@ -34,7 +34,8 @@ import logging
 import os
 from typing import Optional
 
-from pyaudio3dtools import audiofile, prerenderer, spatialaudioformat, spatialmetadata
+from pyaudio3dtools import audiofile, spatialaudioformat, spatialmetadata
+from pyaudio3dtools.spatialaudioconvert import render_meta
 from pyprocessing import utils
 from pyprocessing.processing import Processing
 
@@ -117,16 +118,7 @@ class IVAS(Processing):
                 self.in_format.name = "ISM" + str(len(metadata_files))
                 self.in_format = spatialaudioformat.Format(self.in_format.name[:4])
             else:
-                in_sig, fs = metadata_obj.get_audio_array()
-                # pre-render to desired input format
-                in_sig = prerenderer.render_from_array(
-                    in_sig,
-                    input_path,
-                    self.in_format_config.altname,
-                    audio_out_path=input_path.replace(".txt", "_prerenderer.wav"),
-                    frame_size_ms=20,
-                    fs=metadata_obj.audio_fs,
-                )
+                in_sig = render_meta(metadata_obj, self.in_spfmt)
                 audiofile.writefile(input_pcm, in_sig, self.enc_fs)
         else:
             raise ValueError(f"IVAS: invalid audio input extension: {input_ext}")
diff --git a/scripts/pyprocessing/prepost_processing.py b/scripts/pyprocessing/prepost_processing.py
index 4834b7fba2..dee95b77a6 100644
--- a/scripts/pyprocessing/prepost_processing.py
+++ b/scripts/pyprocessing/prepost_processing.py
@@ -58,7 +58,6 @@ class PreProcessing(Processing):
         out_format: str,
         out_fs: int = 48000,
         out_fc: Optional[int] = None,
-        prerenderer_bin: str = "../IVAS_prerenderer",
         output_loudness: Optional[int] = None,
         loudness_tool: Optional[str] = "bs1770demo",
     ):
@@ -66,7 +65,6 @@ class PreProcessing(Processing):
         self.out_format = out_format
         self.out_fs = out_fs
         self.fc = out_fc
-        self.prerenderer_bin = utils.get_exec_path(prerenderer_bin)
         self.output_loudness = output_loudness
         self.loudness_tool = loudness_tool
 
@@ -82,7 +80,6 @@ class PreProcessing(Processing):
             tmp_path,
             out_format=self.out_format,
             out_fs=self.out_fs,
-            prerenderer_bin=self.prerenderer_bin,
             output_loudness=self.output_loudness,
             loudness_tool=self.loudness_tool,
         )
@@ -111,24 +108,22 @@ class PostProcessing(Processing):
         limit_output: bool = False,
         cut_preamble: float = 0.0,
         split_file_path: str = "",
-        prerenderer_bin: str = "../IVAS_prerenderer",
         bin_rend_include_LFE: bool = False,
-        bin_rend_LFE_gain: float = 1.0,
+        bin_rend_LFE_gain: float = 10 ** (5.5 / 20),
         binaural_dataset: Optional[str] = "orange53",
         output_loudness: Optional[int] = None,
         loudness_tool: Optional[str] = "bs1770demo",
     ):
         super().__init__()
-        self.in_format_config = spatialaudioformat.Format(in_format=in_format)
-        self.out_format_config = spatialaudioformat.Format(in_format=out_format)
-
+        self.in_spfmt = spatialaudioformat.Format(in_format=in_format)
+        self.out_spfmt = spatialaudioformat.Format(in_format=out_format)
+
         self.in_fs = in_fs
         self.out_fs = out_fs
         self.fc = out_fc
         self.binaural_rendered = binaural_rendered
         self.cut_preamble = cut_preamble
         self.split_file_path = split_file_path
-        self.prerenderer_bin = utils.get_exec_path(prerenderer_bin)
         self.bin_rend_include_LFE = bin_rend_include_LFE
         self.bin_rend_LFE_gain = bin_rend_LFE_gain
         self.binaural_dataset = binaural_dataset
@@ -139,20 +134,19 @@ class PostProcessing(Processing):
     def process(self, input_path: str, output_path: str, tmp_path: str):
         output_nickname = utils.get_nickname(output_path)
         logger.info(
-            f"  Post Processing: {self.in_format_config.name} -> {self.out_format_config.name} : {output_nickname}"
+            f"  Post Processing: {self.in_spfmt.name} -> {self.out_spfmt.name} : {output_nickname}"
         )
 
         # Spatial audio format conversion
         spatialaudioconvert.spatial_audio_convert(
             input_path,
             tmp_path,
-            in_format=self.in_format_config.name,
-            out_format=self.out_format_config.name,
+            in_format=self.in_spfmt.name,
+            out_format=self.out_spfmt.name,
             in_fs=self.in_fs,
             out_fs=self.out_fs,
             out_fc=self.fc,
             cut_preamble_s=self.cut_preamble,
-            prerenderer_bin=self.prerenderer_bin,
             limit_output=self.limit_output,
             bin_rend_include_LFE=self.bin_rend_include_LFE,
             bin_rend_LFE_gain=self.bin_rend_LFE_gain,
@@ -163,18 +157,18 @@ class PostProcessing(Processing):
         shutil.move(tmp_path, output_path)
 
         # Binaural rendering
-        if self.binaural_rendered and "BINAURAL" not in self.out_format_config.name:
+        if self.binaural_rendered and "BINAURAL" not in self.out_spfmt.name:
             out_sig, fs = audiofile.readfile(output_path)
+            # assumes the reworked binaural_rendering() signature, which takes
+            # input and output Format objects (see spatialaudioconvert.py)
             bin_sig = binauralrenderer.binaural_rendering(
                 out_sig,
-                self.out_format_config.name,
+                self.out_spfmt,
+                spatialaudioformat.Format("BINAURAL"),
+                dataset=self.binaural_dataset,
                 fs=fs,
                 include_LFE=self.bin_rend_include_LFE,
                 LFE_gain=self.bin_rend_LFE_gain,
             )
 
             output_binaural_wav = output_path.replace(".wav", "_BINAURAL.wav")
             logger.info(
-                f"  Rendering {self.out_format_config.name} -> BINAURAL : {output_nickname[:-4]}_BINAURAL.wav"
+                f"  Rendering {self.out_spfmt.name} -> BINAURAL : {output_nickname[:-4]}_BINAURAL.wav"
             )
             if self.limit_output:
                 logger.info(f"  limiting")
@@ -209,7 +203,7 @@ class PostProcessing(Processing):
                 split_sig = audioarray.cut(out_sig, (start, stop))
                 audiofile.writefile(output_path_split, split_sig, self.out_fs)
                 if (self.binaural_rendered is True) and (
-                    self.out_format_config.name != "BINAURAL"
+                    self.out_spfmt.name != "BINAURAL"
                 ):
                     output_bin_wav_split = output_binaural_wav.replace(
                         ".wav", f"_split{split_idx}.wav"
diff --git a/scripts/pyprocessing/processing_configs.py b/scripts/pyprocessing/processing_configs.py
index a1594259b5..d7619230dd 100644
--- a/scripts/pyprocessing/processing_configs.py
+++ b/scripts/pyprocessing/processing_configs.py
@@ -215,8 +215,8 @@ class test_config:
         # binaural rendering
         "binaural_rendered": False,
         "bin_rend_include_LFE": False,
-        "bin_rend_LFE_gain": 1.0,
-        "binaural_dataset" : "orange53",
+        "bin_rend_LFE_gain": 10 ** (5.5 / 20),
+        "binaural_dataset": "orange53",
         # apply limiter in the postprocessing
         "limit_output": False,
         # loudness adjustments
-- 
GitLab


From d5c9a967a5f60c0badec25f48b5f4e6e13053078 Mon Sep 17 00:00:00 2001
From: Archit Tamarapu
Date: Fri, 15 Jul 2022 16:48:00 +0200
Subject: [PATCH 2/2] fix 5_1_4 to 5_1_2 downmix matrix in pyaudio3dtools

---
 scripts/pyaudio3dtools/constants.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/pyaudio3dtools/constants.py b/scripts/pyaudio3dtools/constants.py
index 648efb01c7..7a68725451 100644
--- a/scripts/pyaudio3dtools/constants.py
+++ b/scripts/pyaudio3dtools/constants.py
@@ -87,8 +87,8 @@ IVAS_CICP16_TO_12[[48, 57, 68, 77]] = 0.89999964
 IVAS_CICP16_TO_12 = IVAS_CICP16_TO_12.reshape(10, 8)
 
 IVAS_CICP16_TO_14 = np.zeros(10 * 8)
-IVAS_CICP16_TO_14[[0, 11, 22, 33, 44, 55, 66, 77]] = 1
-IVAS_CICP16_TO_14[[48, 59]] = 0.899999964
+IVAS_CICP16_TO_14[[0, 9, 18, 27, 36, 45, 54, 63]] = 1
+IVAS_CICP16_TO_14[[68, 77]] = 0.899999964
 IVAS_CICP16_TO_14 = IVAS_CICP16_TO_14.reshape(10, 8)
 
 IVAS_CICP19_TO_6 = np.zeros(12 * 6)
-- 
GitLab
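
The corrected indices in PATCH 2/2 follow directly from flat indexing of the
10x8 matrix: with 8 columns, the main diagonal lies at stride 9 (not 11), and
flat indices 68 and 77 address entries (8, 4) and (9, 5). A quick standalone
check (a sketch, not part of the patch; only numpy is assumed, and reading
columns 4/5 as the surround pair follows the CICP channel order):

    import numpy as np

    # rebuild the corrected IVAS_CICP16_TO_14 downmix matrix from constants.py
    m = np.zeros(10 * 8)
    m[[0, 9, 18, 27, 36, 45, 54, 63]] = 1  # stride 9 -> diagonal of a 10x8 matrix
    m[[68, 77]] = 0.899999964              # flat 68 -> (8, 4), flat 77 -> (9, 5)
    m = m.reshape(10, 8)

    # the first eight 5.1.4 channels pass through to 5.1.2 unchanged...
    assert np.array_equal(m[:8, :8], np.eye(8))
    # ...and the two remaining height channels fold into the surround pair
    assert m[8, 4] == m[9, 5] == 0.899999964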