Merge branch 'main' into 360-handling-of-objects-being-speech-noise (0420a498) · Commits · IVAS Codec Public Collaboration / IVAS Codec

apps/encoder.c

+16 −7

Original line number	Diff line number	Diff line
		@@ -1640,8 +1640,18 @@ static void usage_enc( void )
		fprintf( stdout, " *VBR mode (average bitrate),\n" );
		fprintf( stdout, " for AMR-WB IO modes R = (6600, 8850, 12650, 14250, 15850, 18250,\n" );
		fprintf( stdout, " 19850, 23050, 23850) \n" );
		#ifdef ISM_HIGHEST_BITRATE
		fprintf( stdout, " for IVAS stereo R = (13200, 16400, 24400, 32000, 48000, 64000, 80000, \n" );
		fprintf( stdout, " 96000, 128000, 160000, 192000, 256000) \n" );
		fprintf( stdout, " for IVAS ISM R = 13200 for 1 ISM, 16400 for 1 ISM and 2 ISM, \n" );
		fprintf( stdout, " (24400, 32000, 48000, 64000, 80000, 96000, 128000) \n" );
		fprintf( stdout, " for 2 ISM, 3 ISM and 4 ISM also 160000, 192000, 256000) \n" );
		fprintf( stdout, " for 3 ISM and 4 ISM also 384000 \n" );
		fprintf( stdout, " for 4 ISM also 512000 \n" );
		#else
		fprintf( stdout, " for IVAS stereo & ISm R =(13200, 16400, 24400, 32000, 48000, 64000, 80000, \n" );
		fprintf( stdout, " 96000, 128000, 160000, 192000, 256000) \n" );
		#endif
		fprintf( stdout, " for IVAS SBA, MASA, MC R=(13200, 16400, 24400, 32000, 48000, 64000, 80000, \n" );
		fprintf( stdout, " 96000, 128000, 160000, 192000, 256000, 384000, 512000) \n" );
		fprintf( stdout, " Alternatively, R can be a bitrate switching file which consists of R values\n" );
		@@ -1657,16 +1667,16 @@ static void usage_enc( void )
		fprintf( stdout, "EVS mono is default, for IVAS choose one of the following: -stereo, -ism, -sba, -masa, -mc\n" );
		fprintf( stdout, "-stereo [Mode] : Stereo format, default is unified stereo \n" );
		fprintf( stdout, " optional for Mode: 1: DFT Stereo, 2: TD Stereo, 3: MDCT Stereo\n" );
		fprintf( stdout, "-ism Channels Files : ISm format \n" );
		fprintf( stdout, " where Channels specifies the number of ISms (1-4)\n" );
		fprintf( stdout, "-ism Channels Files : ISM format \n" );
		fprintf( stdout, " where Channels specifies the number of ISMs (1-4)\n" );
		fprintf( stdout, " and Files specify input files containing metadata, one file per object\n" );
		fprintf( stdout, " (use NULL for no input metadata)\n" );
		fprintf( stdout, "-sba +/-Order : Scene Based Audio input format (Ambisonics ACN/SN3D),\n" );
		fprintf( stdout, " where Order specifies the Ambisionics order (1-3),\n" );
		fprintf( stdout, " where positive (+) means full 3D and negative (-) only 2D/planar components to be coded\n" );
		fprintf( stdout, "-masa Ch File : MASA format \n" );
		fprintf( stdout, " where Ch specifies the number of input/transport channels (1 or 2): \n" );
		fprintf( stdout, " and File specifies input file containing parametric metadata \n" );
		fprintf( stdout, "-masa Channels File : MASA format \n" );
		fprintf( stdout, " where Channels specifies the number of input/transport channels (1 or 2): \n" );
		fprintf( stdout, " and File specifies input file containing parametric MASA metadata \n" );
		fprintf( stdout, "-mc InputConf : Multi-channel format\n" );
		fprintf( stdout, " where InputConf specifies the channel configuration: 5_1, 7_1, 5_1_2, 5_1_4, 7_1_4\n" );
		fprintf( stdout, " Loudspeaker positions are assumed to have azimuth and elevation as per \n" );
		@@ -1676,8 +1686,7 @@ static void usage_enc( void )
		fprintf( stdout, " where 0 = adaptive, 3-100 = fixed in number of frames,\n" );
		fprintf( stdout, " default is deactivated\n" );
		fprintf( stdout, "-dtx : Activate DTX mode with a SID update rate of 8 frames\n" );
		fprintf( stdout, " Note: DTX is currently supported in EVS, stereo, 1 ISm, \n" );
		fprintf( stdout, " SBA (up to 128kbps) and MASA (up to 128kbps)\n" );
		fprintf( stdout, " Note: DTX is supported in EVS, stereo, ISM, SBA up to 80kbps and MASA up to 128kbps \n" );
		fprintf( stdout, "-rf p o : Activate channel-aware mode for WB and SWB signal at 13.2kbps, \n" );
		fprintf( stdout, " where FEC indicator, p: LO or HI, and FEC offset, o: 2, 3, 5, or 7 in number of frames.\n" );
		fprintf( stdout, " Alternatively p and o can be replaced by a rf configuration file with each line \n" );

lib_com/options.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -163,6 +163,7 @@
		#define BINAURALIZATION_DELAY_REPORT /* VA: Issue 255 - Changes the way the decoder delay is reported */
		#define FIX_351_HRTF_COMMAND /* VA: Issue 354 - improve "-hrtf" command-line option */
		#define FIX_94_VERIFY_WAV_NUM_CHANNELS /* FhG: Issue 94 - Check if number of channels in input wav file matches encoder/renderer configuration */
		#define ISM_HIGHEST_BITRATE /* VA: Issue 284: Update highest bitrate limit in ISM format */
		#define TUNE_360_OBJECT_WITH_NOISE /* VA: issue 360: consider objects being speech+noise for active speech coding */

lib_enc/lib_enc.c

+20 −3

Original line number	Diff line number	Diff line
		@@ -1985,24 +1985,41 @@ static ivas_error sanitizeBandwidth(
		static ivas_error sanitizeBitrateISM(
		const ENCODER_CONFIG_HANDLE hEncoderConfig )
		{
		#ifdef ISM_HIGHEST_BITRATE
		if ( hEncoderConfig->ivas_total_brate > IVAS_128k && hEncoderConfig->nchan_inp == 1 )
		{
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too high bitrate for 1 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		}

		if ( hEncoderConfig->ivas_total_brate > IVAS_256k && hEncoderConfig->nchan_inp == 2 )
		{
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too high bitrate for 2 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		}

		if ( hEncoderConfig->ivas_total_brate > IVAS_384k && hEncoderConfig->nchan_inp == 3 )
		{
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too high bitrate for 3 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		}
		#else
		if ( hEncoderConfig->ivas_total_brate > IVAS_256k )
		{
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too high bitrate for ISm specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		}
		#endif

		if ( hEncoderConfig->ivas_total_brate < IVAS_16k4 && hEncoderConfig->nchan_inp == 2 )
		{
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 2 ISm specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 2 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		}

		if ( hEncoderConfig->ivas_total_brate < IVAS_24k4 && hEncoderConfig->nchan_inp == 3 )
		{
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 3 ISm specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 3 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		}

		if ( hEncoderConfig->ivas_total_brate < IVAS_24k4 && hEncoderConfig->nchan_inp == 4 )
		{
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 4 ISm specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		return IVAS_ERROR( IVAS_ERR_INVALID_BITRATE, "Too low bitrate for 4 ISM specified in IVAS: %d", hEncoderConfig->ivas_total_brate );
		}

		return IVAS_ERR_OK;

readme.txt

+74 −53

Original line number	Diff line number	Diff line
		@@ -160,8 +160,13 @@ R : Bitrate in bps,
		*VBR mode (average bitrate),
		for AMR-WB IO modes R = (6600, 8850, 12650, 14250, 15850, 18250,
		19850, 23050, 23850)
		for IVAS stereo & ISm R =(13200, 16400, 24400, 32000, 48000, 64000, 80000,
		for IVAS stereo R = (13200, 16400, 24400, 32000, 48000, 64000, 80000,
		96000, 128000, 160000, 192000, 256000)
		for IVAS ISM R = 13200 for 1 ISM, 16400 for 1 ISM and 2 ISM,
		(24400, 32000, 48000, 64000, 80000, 96000, 128000) for 1 ISM to 4 ISM,
		for 2 ISM, 3 ISM and 4 ISM also 160000, 192000, 256000
		for 3 ISM and 4 ISM also 384000
		for 4 ISM also 512000
		for IVAS SBA, MASA, MC R=(13200, 16400, 24400, 32000, 48000, 64000, 80000,
		96000, 128000, 160000, 192000, 256000, 384000, 512000)
		Alternatively, R can be a bitrate switching file which consists of R values
		@@ -176,16 +181,16 @@ Options:
		EVS mono is default, for IVAS choose one of the following: -stereo, -ism, -sba, -masa, -mc
		-stereo [Mode] : Stereo format, default is unified stereo
		optional for Mode: 1: DFT Stereo, 2: TD Stereo, 3: MDCT Stereo
		-ism Channels Files : ISm format
		where Channels specifies the number of ISms (1-4)
		-ism Channels Files : ISM format
		where Channels specifies the number of ISMs (1-4)
		and Files specify input files containing metadata, one file per object
		(use NULL for no input metadata)
		-sba +/-Order : Scene Based Audio input format (Ambisonics ACN/SN3D),
		where Order specifies the Ambisonics order (1-3),
		where positive (+) means full 3D and negative (-) only 2D/planar components to be coded
		-masa Ch File : MASA format
		where Ch specifies the number of input/transport channels (1 or 2):
		and File specifies input file containing parametric metadata
		-masa Channels File : MASA format
		where Channels specifies the number of input/transport channels (1 or 2):
		and File specifies input file containing parametric MASA metadata
		-mc InputConf : Multi-channel format
		where InputConf specifies the channel configuration: 5_1, 7_1, 5_1_2, 5_1_4, 7_1_4
		Loudspeaker positions are assumed to have azimuth and elevation as per
		@@ -195,8 +200,7 @@ EVS mono is default, for IVAS choose one of the following: -stereo, -ism, -sba,
		where 0 = adaptive, 3-100 = fixed in number of frames,
		default is deactivated
		-dtx : Activate DTX mode with a SID update rate of 8 frames
		Note: DTX is currently supported in EVS, stereo, 1 ISm,
		SBA (up to 128kbps) and MASA (up to 128kbps)
		Note: DTX is supported in EVS, stereo, ISM, SBA up to 80kbps and MASA up to 128kbps
		-rf p o : Activate channel-aware mode for WB and SWB signal at 13.2kbps,
		where FEC indicator, p: LO or HI, and FEC offset, o: 2, 3, 5, or 7 in number of frames.
		Alternatively p and o can be replaced by a rf configuration file with each line
		@@ -292,6 +296,7 @@ The output channel ordering is 0, 1, ... N-1. The third row contains an index "L
		specifying the output channel to which the LFE input will be routed if present. If the third row is
		omitted, the LFE input is downmixed to all channels with a factor of 1/N. Position is not considered for
		the LFE channel.
		An example custom loudspeaker layout file is available: ls_setup_16ch_8+4+4.txt



		@@ -306,55 +311,56 @@ points or complete coverage.

		Documentation on the self_test.py can be found as a part of scripts/README.md.

		Note: Running the self_test.py requires the input vectors in the folder scripts/testv. The
		audio files could unfortunately not be shared, and they need to be replaced in order to
		run the self_test.py. To complement the test vector set, please replace the empty *.pcm-files
		in the self_test folder with 16 bit PCM files following the specification below.

		stv1ISM48s.pcm - 1 channel (1 audio object), 48000 Hz, 1440000 samples
		stv2ISM48s.pcm - 2 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
		stv2OA32c.pcm - 9 channels (2nd order Ambisonics ACN/SN3D), 32000 Hz
		stv2OA48c.pcm - 9 channels (2nd order Ambisonics ACN/SN3D), 48000 Hz
		stv3ISM48s.pcm - 3 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
		stv3OA32c.pcm - 16 channels (3rd order Ambisonics ACN/SN3D), 32000 Hz, 288939 samples per channel
		stv3OA48c.pcm - 16 channels (3rd order Ambisonics ACN/SN3D), 48000 Hz, 433408 samples per channel
		stv4ISM48s.pcm - 4 channel (discrete audio objects), 48000 Hz, 1440000 samples per channel
		stv8c.pcm - 1 channel, 8000 Hz, clean speech/audio
		stv8n.pcm - 1 channel, 8000 Hz, noisy speech
		stv16c.pcm - 1 channel, 16000 Hz, 610307 samples, clean speech
		stv16n.pcm - 1 channel, 16000 Hz, 257024 samples, noisy speech
		stv32c.pcm - 1 channel, 32000 Hz, 1220613 samples, clean speech/audio
		stv32n.pcm - 1 channel, 32000 Hz, 514048 samples, noisy speech
		stv48c.pcm - 1 channel, 48000 Hz, 1830919 samples, clean speech/audio
		stv51MC48c.pcm - 6 channels (5.1 1..6 where 4th channel is LFE), 3231233 samples per channel, 48000 Hz, movie excerpt
		stv512MC48c.pcm - 8 channels (5.1+2 1..8 where 4th channel is LFE), 144000 samples per channel, 48000 Hz, movie excerpt
		stv714MC48c.pcm - 12 channels (7.1+4 1..12 where 4th channel is LFE), 144000 samples per channel, 48000 Hz, movie excerpt
		stvFOA16c.pcm - 4 channels (1st order Ambisonics ACN/SN3D), 16000 Hz,
		stvFOA32c.pcm - 4 channels (1st order Ambisonics ACN/SN3D), 32000 Hz, 288939 samples per channel
		stvFOA48c.pcm - 4 channels (1st order Ambisonics ACN/SN3D), 48000 Hz, 433408 samples per channel
		stvST16c.pcm - 2 channels, 16000 Hz, 329601 samples per channel, clean speech/audio
		stvST16n.pcm - 2 channels, 16000 Hz, 310401 samples per channel, noisy speech
		stvST32c.pcm - 2 channels, 32000 Hz, 659200 samples per channel, clean speech/audio
		stvST32n.pcm - 2 channels, 32000 Hz, 620800 samples per channel, noisy speech
		stvST48c.pcm - 2 channels, 48000 Hz, 988800 samples per channel, clean speech/audio
		stvST48n.pcm - 2 channels, 48000 Hz, 931200 samples per channel, noisy speech
		stv_IVASMASA_1dir1TC.pcm - 1 channel (1 MASA transport channel), 48000 Hz, 48000 Hz, 144000 samples
		stv_IVASMASA_1dir1TC_DTX.pcm - 1 channel (1 MASA transport channel), 48000 Hz, 48000 Hz, 963840 samples
		stv_IVASMASA_1dir2TC.pcm - 2 channels (2 MASA transport channel), 48000 Hz, 48000 Hz, 288000 samples per channel
		stv_IVASMASA_1dir2TC_DTX.pcm - 2 channels (2 MASA transport channel), 48000 Hz, 48000 Hz, 963840 samples per channel
		stv_IVASMASA_2dir1TC.pcm - 1 channel (1 MASA transport channel), 48000 Hz, 48000 Hz, 288000
		stv_IVASMASA_2dir2TC.pcm - 2 channels (2 MASA transport channel), 48000 Hz, 48000 Hz, 144000 samples per channel
		Note: Running the self_test.py requires the input vectors in the folder scripts/testv.

		stv1ISM48s.wav - 1 channel (1 audio object), 48000 Hz, 1440000 samples
		stv2ISM48s.wav - 2 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
		stv2OA32c.wav - 9 channels (2nd order Ambisonics ACN/SN3D), 32000 Hz
		stv2OA48c.wav - 9 channels (2nd order Ambisonics ACN/SN3D), 48000 Hz
		stv3ISM48s.wav - 3 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
		stv3OA32c.wav - 16 channels (3rd order Ambisonics ACN/SN3D), 32000 Hz, 288939 samples per channel
		stv3OA48c.wav - 16 channels (3rd order Ambisonics ACN/SN3D), 48000 Hz, 433408 samples per channel
		stv4ISM48s.wav - 4 channels (discrete audio objects), 48000 Hz, 1440000 samples per channel
		stv4ISM48n.wav - 4 channels (discrete audio objects), 48000 Hz, noisy speech
		stv8c.wav - 1 channel, 8000 Hz, clean speech/audio
		stv8n.wav - 1 channel, 8000 Hz, noisy speech
		stv16c.wav - 1 channel, 16000 Hz, 610307 samples, clean speech
		stv16n.wav - 1 channel, 16000 Hz, 257024 samples, noisy speech
		stv32c.wav - 1 channel, 32000 Hz, 1220613 samples, clean speech/audio
		stv32n.wav - 1 channel, 32000 Hz, 514048 samples, noisy speech
		stv48c.wav - 1 channel, 48000 Hz, 960000 samples, clean speech/audio
		stv48n.wav - 1 channel, 48000 Hz, 931200 samples, noisy speech
		stv51MC48c.wav - 6 channels (5.1 1..6 where 4th channel is LFE), 960000 samples per channel, 48000 Hz
		stv512MC48c.wav - 8 channels (5.1+2 1..8 where 4th channel is LFE), 144000 samples per channel, 48000 Hz
		stv514MC48c.wav - 10 channels (5.1+4 1..10 where 4th channel is LFE), 144000 samples per channel, 48000 Hz
		stv71MC48c.wav - 8 channels (7.1 1..8 where 4th channel is LFE), 144000 samples per channel, 48000 Hz
		stv714MC48c.wav - 12 channels (7.1+4 1..12 where 4th channel is LFE), 144000 samples per channel, 48000 Hz
		stvFOA16c.wav - 4 channels (1st order Ambisonics ACN/SN3D), 16000 Hz
		stvFOA32c.wav - 4 channels (1st order Ambisonics ACN/SN3D), 32000 Hz, 288939 samples per channel
		stvFOA48c.wav - 4 channels (1st order Ambisonics ACN/SN3D), 48000 Hz, 433408 samples per channel
		stvST16c.wav - 2 channels, 16000 Hz, 329601 samples per channel, clean speech/audio
		stvST16n.wav - 2 channels, 16000 Hz, 310401 samples per channel, noisy speech
		stvST32c.wav - 2 channels, 32000 Hz, 659200 samples per channel, clean speech/audio
		stvST32n.wav - 2 channels, 32000 Hz, 620800 samples per channel, noisy speech
		stvST48c.wav - 2 channels, 48000 Hz, 988800 samples per channel, clean speech/audio
		stvST48n.wav - 2 channels, 48000 Hz, 931200 samples per channel, noisy speech
		stv1MASA1TC48c.wav - 1 channel (1 MASA transport channel), 48000 Hz, 144000 samples
		stv1MASA1TC48n.wav - 1 channel (1 MASA transport channel), 48000 Hz, 963840 samples
		stv1MASA2TC48c.wav - 2 channels (2 MASA transport channels), 48000 Hz, 288000 samples per channel
		stv1MASA2TC48n.wav - 2 channels (2 MASA transport channels), 48000 Hz, 963840 samples per channel
		stv2MASA1TC48c.wav - 1 channel (1 MASA transport channel), 48000 Hz, 288000 samples
		stv2MASA2TC48c.wav - 2 channels (2 MASA transport channels), 48000 Hz, 144000 samples per channel


		For the MASA operation modes, in addition the following metadata files
		are required:

		stv_IVASMASA_1dir1TC.met
		stv_IVASMASA_1dir1TC_DTX.met
		stv_IVASMASA_1dir2TC.met
		stv_IVASMASA_1dir2TC_DTX.met
		stv_IVASMASA_2dir1TC.met
		stv_IVASMASA_2dir2TC.met
		stv1MASA1TC48c.met
		stv1MASA1TC48n.met
		stv1MASA2TC48c.met
		stv1MASA2TC48n.met
		stv2MASA1TC48c.met
		stv2MASA2TC48c.met

		It is strongly recommended to align these files to the corresponding
		PCM audio files. The MASA metadata files can be generated with the
		@@ -389,6 +395,21 @@ with the following meaning:
		-----------------------------------------------------------------------------------


		For the Head rotation operation modes, external trajectory files are available:

		headrot.csv
		headrot_case00_3000_q.csv
		headrot_case01_3000_q.csv
		headrot_case02_3000_q.csv
		headrot_case03_3000_q.csv


		For the Renderer configuration option operation modes, external configuration files are available:

		rend_config_hospital_patientroom.cfg
		config_recreation.cfg
		config_renderer.cfg


		ADDITIONAL SCRIPTS
		==================

scripts/config/ivas_modes.json

+21 −30

Original line number	Diff line number	Diff line
		@@ -1828,10 +1828,7 @@
		64000,
		80000,
		96000,
		128000,
		160000,
		192000,
		256000
		128000
		],
		"swb": [
		13200,
		@@ -1842,10 +1839,7 @@
		64000,
		80000,
		96000,
		128000,
		160000,
		192000,
		256000
		128000
		],
		"fb": [
		32000,
		@@ -1853,10 +1847,7 @@
		64000,
		80000,
		96000,
		128000,
		160000,
		192000,
		256000
		128000
		]
		}
		},
		@@ -1892,10 +1883,7 @@
		64000,
		80000,
		96000,
		128000,
		160000,
		192000,
		256000
		128000
		],
		"swb": [
		13200,
		@@ -1906,10 +1894,7 @@
		64000,
		80000,
		96000,
		128000,
		160000,
		192000,
		256000
		128000
		],
		"fb": [
		32000,
		@@ -1917,10 +1902,7 @@
		64000,
		80000,
		96000,
		128000,
		160000,
		192000,
		256000
		128000
		]
		}
		}
		@@ -2021,7 +2003,8 @@
		128000,
		160000,
		192000,
		256000
		256000,
		384000
		],
		"swb": [
		24400,
		@@ -2033,7 +2016,8 @@
		128000,
		160000,
		192000,
		256000
		256000,
		384000
		],
		"fb": [
		32000,
		@@ -2044,7 +2028,8 @@
		128000,
		160000,
		192000,
		256000
		256000,
		384000
		]
		}
		},
		@@ -2118,7 +2103,9 @@
		128000,
		160000,
		192000,
		256000
		256000,
		384000,
		512000
		],
		"swb": [
		24400,
		@@ -2130,7 +2117,9 @@
		128000,
		160000,
		192000,
		256000
		256000,
		384000,
		512000
		],
		"fb": [
		32000,
		@@ -2141,7 +2130,9 @@
		128000,
		160000,
		192000,
		256000
		256000,
		384000,
		512000
		]
		}
		},