Commit 0420a498 authored by vaclav's avatar vaclav
Browse files

Merge branch 'main' into 360-handling-of-objects-being-speech-noise

parents bddfcdb1 c2007202
Loading
Loading
Loading
Loading
Loading
+16 −7
Original line number Diff line number Diff line
@@ -1640,8 +1640,18 @@ static void usage_enc( void )
    fprintf( stdout, "                                                *VBR mode (average bitrate),\n" );
    fprintf( stdout, "                      for AMR-WB IO modes R =  (6600, 8850, 12650, 14250, 15850, 18250,\n" );
    fprintf( stdout, "                                                19850, 23050, 23850) \n" );
#ifdef ISM_HIGHEST_BITRATE
    fprintf( stdout, "                      for IVAS stereo R =      (13200, 16400, 24400, 32000, 48000, 64000, 80000, \n" );
    fprintf( stdout, "                                                96000, 128000, 160000, 192000, 256000) \n" );
    fprintf( stdout, "                      for IVAS ISM R =          13200 for 1 ISM, 16400 for 1 ISM and 2 ISM, \n" );
    fprintf( stdout, "                                               (24400, 32000, 48000, 64000, 80000, 96000, 128000) \n" );
    fprintf( stdout, "                                                for 2 ISM, 3 ISM and 4 ISM also 160000, 192000, 256000) \n" );
    fprintf( stdout, "                                                for 3 ISM and 4 ISM also 384000 \n" );
    fprintf( stdout, "                                                for 4 ISM also 512000 \n" );
#else
    fprintf( stdout, "                      for IVAS stereo & ISm R =(13200, 16400, 24400, 32000, 48000, 64000, 80000, \n" );
    fprintf( stdout, "                                                96000, 128000, 160000, 192000, 256000) \n" );
#endif
    fprintf( stdout, "                      for IVAS SBA, MASA, MC R=(13200, 16400, 24400, 32000, 48000, 64000, 80000, \n" );
    fprintf( stdout, "                                                96000, 128000, 160000, 192000, 256000, 384000, 512000) \n" );
    fprintf( stdout, "                      Alternatively, R can be a bitrate switching file which consists of R values\n" );
@@ -1657,16 +1667,16 @@ static void usage_enc( void )
    fprintf( stdout, "EVS mono is default, for IVAS choose one of the following: -stereo, -ism, -sba, -masa, -mc\n" );
    fprintf( stdout, "-stereo [Mode]      : Stereo format, default is unified stereo \n" );
    fprintf( stdout, "                      optional for Mode: 1: DFT Stereo, 2: TD Stereo, 3: MDCT Stereo\n" );
    fprintf( stdout, "-ism Channels Files : ISm format \n" );
    fprintf( stdout, "                      where Channels specifies the number of ISms (1-4)\n" );
    fprintf( stdout, "-ism Channels Files : ISM format \n" );
    fprintf( stdout, "                      where Channels specifies the number of ISMs (1-4)\n" );
    fprintf( stdout, "                      and Files specify input files containing metadata, one file per object\n" );
    fprintf( stdout, "                      (use NULL for no input metadata)\n" );
    fprintf( stdout, "-sba +/-Order       : Scene Based Audio input format (Ambisonics ACN/SN3D),\n" );
    fprintf( stdout, "                      where Order specifies the Ambisionics order (1-3),\n" );
    fprintf( stdout, "                      where positive (+) means full 3D and negative (-) only 2D/planar components to be coded\n" );
    fprintf( stdout, "-masa Ch File       : MASA format \n" );
    fprintf( stdout, "                      where Ch specifies the number of input/transport channels (1 or 2): \n" );
    fprintf( stdout, "                      and File specifies input file containing parametric metadata \n" );
    fprintf( stdout, "-masa Channels File : MASA format \n" );
    fprintf( stdout, "                      where Channels specifies the number of input/transport channels (1 or 2): \n" );
    fprintf( stdout, "                      and File specifies input file containing parametric MASA metadata \n" );
    fprintf( stdout, "-mc InputConf       : Multi-channel format\n" );
    fprintf( stdout, "                      where InputConf specifies the channel configuration: 5_1, 7_1, 5_1_2, 5_1_4, 7_1_4\n" );
    fprintf( stdout, "                      Loudspeaker positions are assumed to have azimuth and elevation as per \n" );
@@ -1676,8 +1686,7 @@ static void usage_enc( void )
    fprintf( stdout, "                      where 0 = adaptive, 3-100 = fixed in number of frames,\n" );
    fprintf( stdout, "                      default is deactivated\n" );
    fprintf( stdout, "-dtx                : Activate DTX mode with a SID update rate of 8 frames\n" );
    fprintf( stdout, "                      Note: DTX is currently supported in EVS, stereo, 1 ISm, \n" );
    fprintf( stdout, "                      SBA (up to 128kbps) and MASA (up to 128kbps)\n" );
    fprintf( stdout, "                      Note: DTX is supported in EVS, stereo, ISM, SBA up to 80kbps and MASA up to 128kbps \n" );
    fprintf( stdout, "-rf p o             : Activate channel-aware mode for WB and SWB signal at 13.2kbps, \n" );
    fprintf( stdout, "                      where FEC indicator, p: LO or HI, and FEC offset, o: 2, 3, 5, or 7 in number of frames.\n" );
    fprintf( stdout, "                      Alternatively p and o can be replaced by a rf configuration file with each line  \n" );
+1 −0
Original line number Diff line number Diff line
@@ -163,6 +163,7 @@
#define BINAURALIZATION_DELAY_REPORT                    /* VA: Issue 255 - Changes the way the decoder delay is reported */
#define FIX_351_HRTF_COMMAND                            /* VA: Issue 354 - improve "-hrtf" command-line option */
#define FIX_94_VERIFY_WAV_NUM_CHANNELS                  /* FhG: Issue 94 - Check if number of channels in input wav file matches encoder/renderer configuration */
#define ISM_HIGHEST_BITRATE                             /* VA: Issue 284: Update highest bitrate limit in ISM format */
#define TUNE_360_OBJECT_WITH_NOISE                      /* VA: issue 360: consider objects being speech+noise for active speech coding */


+20 −3
Original line number Diff line number Diff line
@@ -1985,24 +1985,41 @@ static ivas_error sanitizeBandwidth(
static ivas_error sanitizeBitrateISM(
    const ENCODER_CONFIG_HANDLE hEncoderConfig )
{
#ifdef ISM_HIGHEST_BITRATE
    if ( hEncoderConfig->ivas_total_brate > IVAS_128k && hEncoderConfig->nchan_inp == 1 )
    {
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too high bitrate for 1 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
    }

    if ( hEncoderConfig->ivas_total_brate > IVAS_256k && hEncoderConfig->nchan_inp == 2 )
    {
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too high bitrate for 2 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
    }

    if ( hEncoderConfig->ivas_total_brate > IVAS_384k && hEncoderConfig->nchan_inp == 3 )
    {
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too high bitrate for 3 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
    }
#else
    if ( hEncoderConfig->ivas_total_brate > IVAS_256k )
    {
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too high bitrate for ISm specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
    }
#endif

    if ( hEncoderConfig->ivas_total_brate < IVAS_16k4 && hEncoderConfig->nchan_inp == 2 )
    {
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 2 ISm specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 2 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
    }

    if ( hEncoderConfig->ivas_total_brate < IVAS_24k4 && hEncoderConfig->nchan_inp == 3 )
    {
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 3 ISm specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 3 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
    }

    if ( hEncoderConfig->ivas_total_brate < IVAS_24k4 && hEncoderConfig->nchan_inp == 4 )
    {
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 4 ISm specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
        return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 4 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
    }

    return IVAS_ERR_OK;
+74 −53
Original line number Diff line number Diff line
@@ -160,8 +160,13 @@ R : Bitrate in bps,
                                                *VBR mode (average bitrate),
                      for AMR-WB IO modes R =  (6600, 8850, 12650, 14250, 15850, 18250,
                                                19850, 23050, 23850)
                      for IVAS stereo & ISm R =(13200, 16400, 24400, 32000, 48000, 64000, 80000,
                      for IVAS stereo R =      (13200, 16400, 24400, 32000, 48000, 64000, 80000,
											    96000, 128000, 160000, 192000, 256000)
					  for IVAS ISM R =          13200 for 1 ISM, 16400 for 1 ISM and 2 ISM,
											   (24400, 32000, 48000, 64000, 80000, 96000, 128000)                                                
                                                for 2 ISM, 3 ISM and 4 ISM also 160000, 192000, 256000
                                                for 3 ISM and 4 ISM also 384000
                                                for 4 ISM also 512000											
                      for IVAS SBA, MASA, MC R=(13200, 16400, 24400, 32000, 48000, 64000, 80000,
                                                96000, 128000, 160000, 192000, 256000, 384000, 512000)
                      Alternatively, R can be a bitrate switching file which consists of R values
@@ -176,16 +181,16 @@ Options:
EVS mono is default, for IVAS choose one of the following: -stereo, -ism, -sba, -masa, -mc
-stereo [Mode]      : Stereo format, default is unified stereo
                      optional for Mode: 1: DFT Stereo, 2: TD Stereo, 3: MDCT Stereo
-ism Channels Files : ISm format
                      where Channels specifies the number of ISms (1-4)
-ism Channels Files : ISM format
                      where Channels specifies the number of ISMs (1-4)
                      and Files specify input files containing metadata, one file per object
                      (use NULL for no input metadata)
-sba +/-Order       : Scene Based Audio input format (Ambisonics ACN/SN3D),
                      where Order specifies the Ambisonics order (1-3),
                      where positive (+) means full 3D and negative (-) only 2D/planar components to be coded
-masa Ch File       : MASA format
                      where Ch specifies the number of input/transport channels (1 or 2):
                      and File specifies input file containing parametric metadata
-masa Channels File : MASA format
                      where Channels specifies the number of input/transport channels (1 or 2):
                      and File specifies input file containing parametric MASA metadata
-mc InputConf       : Multi-channel format
                      where InputConf specifies the channel configuration: 5_1, 7_1, 5_1_2, 5_1_4, 7_1_4
                      Loudspeaker positions are assumed to have azimuth and elevation as per
@@ -195,8 +200,7 @@ EVS mono is default, for IVAS choose one of the following: -stereo, -ism, -sba,
                      where 0 = adaptive, 3-100 = fixed in number of frames,
                      default is deactivated
-dtx                : Activate DTX mode with a SID update rate of 8 frames
                      Note: DTX is currently supported in EVS, stereo, 1 ISm,
                      SBA (up to 128kbps) and MASA (up to 128kbps)
                      Note: DTX is supported in EVS, stereo, ISM, SBA up to 80kbps and MASA up to 128kbps
-rf p o             : Activate channel-aware mode for WB and SWB signal at 13.2kbps,
                      where FEC indicator, p: LO or HI, and FEC offset, o: 2, 3, 5, or 7 in number of frames.
                      Alternatively p and o can be replaced by a rf configuration file with each line
@@ -292,6 +296,7 @@ The output channel ordering is 0, 1, ... N-1. The third row contains an index "L
specifying the output channel to which the LFE input will be routed if present. If the third row is 
omitted, the LFE input is downmixed to all channels with a factor of 1/N. Position is not considered for
the LFE channel.
An example custom loudspeaker layout file is available: ls_setup_16ch_8+4+4.txt

                       

@@ -306,55 +311,56 @@ points or complete coverage.

Documentation on the self_test.py can be found as a part of scripts/README.md.

Note: Running the self_test.py requires the input vectors in the folder scripts/testv. The
audio files could unfortunately not be shared, and they need to be replaced in order to
run the self_test.py. To complement the test vector set, please replace the empty *.pcm-files
in the self_test folder with 16 bit PCM files following the specification below.

stv1ISM48s.pcm  - 1 channel (1 audio object), 48000 Hz, 1440000 samples
stv2ISM48s.pcm  - 2 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
stv2OA32c.pcm   - 9 channels (2nd order Ambisonics ACN/SN3D), 32000 Hz 
stv2OA48c.pcm   - 9 channels (2nd order Ambisonics ACN/SN3D), 48000 Hz
stv3ISM48s.pcm  - 3 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
stv3OA32c.pcm   - 16 channels (3rd order Ambisonics ACN/SN3D), 32000 Hz, 288939 samples per channel
stv3OA48c.pcm   - 16 channels (3rd order Ambisonics ACN/SN3D), 48000 Hz, 433408 samples per channel
stv4ISM48s.pcm  - 4 channel (discrete audio objects), 48000 Hz, 1440000 samples per channel
stv8c.pcm       - 1 channel, 8000 Hz, clean speech/audio
stv8n.pcm       - 1 channel, 8000 Hz, noisy speech
stv16c.pcm      - 1 channel, 16000 Hz, 610307 samples, clean speech 
stv16n.pcm      - 1 channel, 16000 Hz, 257024 samples, noisy speech
stv32c.pcm      - 1 channel, 32000 Hz, 1220613 samples, clean speech/audio
stv32n.pcm      - 1 channel, 32000 Hz, 514048 samples, noisy speech
stv48c.pcm      - 1 channel, 48000 Hz, 1830919 samples, clean speech/audio
stv51MC48c.pcm  - 6 channels (5.1 1..6 where 4th channel is LFE), 3231233 samples per channel, 48000 Hz, movie excerpt
stv512MC48c.pcm - 8 channels (5.1+2 1..8 where 4th channel is LFE), 144000 samples per channel, 48000 Hz, movie excerpt
stv714MC48c.pcm - 12 channels (7.1+4 1..12 where 4th channel is LFE), 144000 samples per channel, 48000 Hz, movie excerpt
stvFOA16c.pcm   - 4 channels (1st order Ambisonics ACN/SN3D), 16000 Hz,
stvFOA32c.pcm   - 4 channels (1st order Ambisonics ACN/SN3D), 32000 Hz, 288939 samples per channel
stvFOA48c.pcm   - 4 channels (1st order Ambisonics ACN/SN3D), 48000 Hz, 433408 samples per channel
stvST16c.pcm    - 2 channels, 16000 Hz, 329601 samples per channel, clean speech/audio
stvST16n.pcm    - 2 channels, 16000 Hz, 310401 samples per channel, noisy speech
stvST32c.pcm    - 2 channels, 32000 Hz, 659200 samples per channel, clean speech/audio
stvST32n.pcm    - 2 channels, 32000 Hz, 620800 samples per channel, noisy speech
stvST48c.pcm    - 2 channels, 48000 Hz, 988800 samples per channel, clean speech/audio
stvST48n.pcm    - 2 channels, 48000 Hz, 931200 samples per channel, noisy speech
stv_IVASMASA_1dir1TC.pcm     - 1 channel (1 MASA transport channel), 48000 Hz, 48000 Hz, 144000 samples 
stv_IVASMASA_1dir1TC_DTX.pcm - 1 channel (1 MASA transport channel), 48000 Hz, 48000 Hz, 963840 samples
stv_IVASMASA_1dir2TC.pcm     - 2 channels (2 MASA transport channel), 48000 Hz, 48000 Hz, 288000 samples per channel
stv_IVASMASA_1dir2TC_DTX.pcm - 2 channels (2 MASA transport channel), 48000 Hz, 48000 Hz, 963840 samples per channel
stv_IVASMASA_2dir1TC.pcm     - 1 channel (1 MASA transport channel), 48000 Hz, 48000 Hz, 288000
stv_IVASMASA_2dir2TC.pcm     - 2 channels (2 MASA transport channel), 48000 Hz, 48000 Hz, 144000 samples per channel
Note: Running the self_test.py requires the input vectors in the folder scripts/testv. 

stv1ISM48s.wav     - 1 channel (1 audio object), 48000 Hz, 1440000 samples
stv2ISM48s.wav     - 2 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
stv2OA32c.wav      - 9 channels (2nd order Ambisonics ACN/SN3D), 32000 Hz 
stv2OA48c.wav      - 9 channels (2nd order Ambisonics ACN/SN3D), 48000 Hz
stv3ISM48s.wav     - 3 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
stv3OA32c.wav      - 16 channels (3rd order Ambisonics ACN/SN3D), 32000 Hz, 288939 samples per channel
stv3OA48c.wav      - 16 channels (3rd order Ambisonics ACN/SN3D), 48000 Hz, 433408 samples per channel
stv4ISM48s.wav     - 4 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
stv4ISM48n.wav     - 4 channels (discrete audio objects), 48000 Hz, noisy speech
stv8c.wav          - 1 channel, 8000 Hz, clean speech/audio
stv8n.wav          - 1 channel, 8000 Hz, noisy speech
stv16c.wav         - 1 channel, 16000 Hz, 610307 samples, clean speech 
stv16n.wav         - 1 channel, 16000 Hz, 257024 samples, noisy speech
stv32c.wav         - 1 channel, 32000 Hz, 1220613 samples, clean speech/audio
stv32n.wav         - 1 channel, 32000 Hz, 514048 samples, noisy speech
stv48c.wav         - 1 channel, 48000 Hz, 960000 samples, clean speech/audio
stv48n.wav         - 1 channel, 48000 Hz, 931200 samples, noisy speech
stv51MC48c.wav     - 6 channels (5.1 1..6 where 4th channel is LFE), 960000 samples per channel, 48000 Hz
stv512MC48c.wav    - 8 channels (5.1+2 1..8 where 4th channel is LFE), 144000 samples per channel, 48000 Hz
stv514MC48c.wav    - 10 channels (5.1+4 1..10 where 4th channel is LFE), 144000 samples per channel, 48000 Hz
stv71MC48c.wav     - 8 channels (7.1 1..8 where 4th channel is LFE), 144000 samples per channel, 48000 Hz
stv714MC48c.wav    - 12 channels (7.1+4 1..12 where 4th channel is LFE), 144000 samples per channel, 48000 Hz
stvFOA16c.wav      - 4 channels (1st order Ambisonics ACN/SN3D), 16000 Hz
stvFOA32c.wav      - 4 channels (1st order Ambisonics ACN/SN3D), 32000 Hz, 288939 samples per channel
stvFOA48c.wav      - 4 channels (1st order Ambisonics ACN/SN3D), 48000 Hz, 433408 samples per channel
stvST16c.wav       - 2 channels, 16000 Hz, 329601 samples per channel, clean speech/audio
stvST16n.wav       - 2 channels, 16000 Hz, 310401 samples per channel, noisy speech
stvST32c.wav       - 2 channels, 32000 Hz, 659200 samples per channel, clean speech/audio
stvST32n.wav       - 2 channels, 32000 Hz, 620800 samples per channel, noisy speech
stvST48c.wav       - 2 channels, 48000 Hz, 988800 samples per channel, clean speech/audio
stvST48n.wav       - 2 channels, 48000 Hz, 931200 samples per channel, noisy speech
stv1MASA1TC48c.wav - 1 channel (1 MASA transport channel), 48000 Hz, 144000 samples
stv1MASA1TC48n.wav - 1 channel (1 MASA transport channel), 48000 Hz, 963840 samples
stv1MASA2TC48c.wav - 2 channels (2 MASA transport channels), 48000 Hz, 288000 samples per channel
stv1MASA2TC48n.wav - 2 channels (2 MASA transport channels), 48000 Hz, 963840 samples per channel
stv2MASA1TC48c.wav - 1 channel (1 MASA transport channel), 48000 Hz, 288000 samples
stv2MASA2TC48c.wav - 2 channels (2 MASA transport channels), 48000 Hz, 144000 samples per channel


For the MASA operation modes, in addition the following metadata files
are required:

stv_IVASMASA_1dir1TC.met
stv_IVASMASA_1dir1TC_DTX.met
stv_IVASMASA_1dir2TC.met
stv_IVASMASA_1dir2TC_DTX.met
stv_IVASMASA_2dir1TC.met
stv_IVASMASA_2dir2TC.met
stv1MASA1TC48c.met
stv1MASA1TC48n.met
stv1MASA2TC48c.met
stv1MASA2TC48n.met
stv2MASA1TC48c.met
stv2MASA2TC48c.met

It is strongly recommended to align these files to the corresponding
PCM audio files. The MASA metadata files can be generated with the
@@ -389,6 +395,21 @@ with the following meaning:
-----------------------------------------------------------------------------------


For the Head rotation operation modes, external trajectory files are available:

headrot.csv 
headrot_case00_3000_q.csv 
headrot_case01_3000_q.csv 
headrot_case02_3000_q.csv 
headrot_case03_3000_q.csv


For the renderer configuration option, external configuration files are available:

rend_config_hospital_patientroom.cfg
config_recreation.cfg
config_renderer.cfg


                       ADDITIONAL SCRIPTS
                       ==================
+21 −30
Original line number Diff line number Diff line
@@ -1828,10 +1828,7 @@
                    64000,
                    80000,
                    96000,
                    128000,
                    160000,
                    192000,
                    256000
                    128000
                ],
                "swb": [
                    13200,
@@ -1842,10 +1839,7 @@
                    64000,
                    80000,
                    96000,
                    128000,
                    160000,
                    192000,
                    256000
                    128000
                ],
                "fb": [
                    32000,
@@ -1853,10 +1847,7 @@
                    64000,
                    80000,
                    96000,
                    128000,
                    160000,
                    192000,
                    256000
                    128000
                ]
            }
        },
@@ -1892,10 +1883,7 @@
                    64000,
                    80000,
                    96000,
                    128000,
                    160000,
                    192000,
                    256000
                    128000
                ],
                "swb": [
                    13200,
@@ -1906,10 +1894,7 @@
                    64000,
                    80000,
                    96000,
                    128000,
                    160000,
                    192000,
                    256000
                    128000
                ],
                "fb": [
                    32000,
@@ -1917,10 +1902,7 @@
                    64000,
                    80000,
                    96000,
                    128000,
                    160000,
                    192000,
                    256000
                    128000
                ]
            }
        }
@@ -2021,7 +2003,8 @@
                    128000,
                    160000,
                    192000,
                    256000
                    256000,
					384000
                ],
                "swb": [
                    24400,
@@ -2033,7 +2016,8 @@
                    128000,
                    160000,
                    192000,
                    256000
                    256000,
					384000
                ],
                "fb": [
                    32000,
@@ -2044,7 +2028,8 @@
                    128000,
                    160000,
                    192000,
                    256000
                    256000,
					384000
                ]
            }
        },
@@ -2118,7 +2103,9 @@
                    128000,
                    160000,
                    192000,
                    256000
                    256000,
					384000,
					512000
                ],
                "swb": [
                    24400,
@@ -2130,7 +2117,9 @@
                    128000,
                    160000,
                    192000,
                    256000
                    256000,
					384000,
					512000
                ],
                "fb": [
                    32000,
@@ -2141,7 +2130,9 @@
                    128000,
                    160000,
                    192000,
                    256000
                    256000,
					384000,
					512000
                ]
            }
        },
Loading