Bug 1749046 - Resampling input audio to the encoder sample-rate. r=chunmin

Differential Revision: https://phabricator.services.mozilla.com/D199534
This commit is contained in:
Paul Adenot 2024-03-27 14:16:35 +00:00
parent fa10e30c03
commit b1c61fbf38
4 changed files with 133 additions and 30 deletions

View file

@ -23,6 +23,7 @@
# include "mozilla/gfx/Rect.h"
# include "nsString.h"
# include "nsTArray.h"
# include "EncoderConfig.h"
namespace mozilla {
@ -723,6 +724,9 @@ class MediaRawData final : public MediaData {
// Currently this is only used for the media engine DRM playback.
bool mShouldCopyCryptoToRemoteRawData = false;
// Config used to encode this packet.
UniquePtr<const EncoderConfig> mConfig;
// It's only used when the remote decoder reconstructs the media raw data.
CryptoSample& GetWritableCrypto() { return mCryptoInternal; }

View file

@ -9,6 +9,7 @@
#include "FFmpegRuntimeLinker.h"
#include "FFmpegLog.h"
#include "FFmpegUtils.h"
#include "MediaData.h"
#include "AudioSegment.h"
@ -34,6 +35,11 @@ nsCString FFmpegAudioEncoder<LIBAV_VER>::GetDescriptionName() const {
#endif
}
void FFmpegAudioEncoder<LIBAV_VER>::ResamplerDestroy::operator()(
SpeexResamplerState* aResampler) {
speex_resampler_destroy(aResampler);
}
nsresult FFmpegAudioEncoder<LIBAV_VER>::InitSpecific() {
MOZ_ASSERT(mTaskQueue->IsOnCurrentThread());
@ -46,6 +52,54 @@ nsresult FFmpegAudioEncoder<LIBAV_VER>::InitSpecific() {
return NS_ERROR_DOM_MEDIA_NOT_SUPPORTED_ERR;
}
// Find a compatible input rate for the codec, update the encoder config, and
// note the rate at which this instance was configured.
mInputSampleRate = AssertedCast<int>(mConfig.mSampleRate);
if (codec->supported_samplerates) {
// Ensure the sample-rate list is sorted, iterate and either find that the
// sample rate is supported, or pick the same rate just above the audio
// input sample-rate (as to not lose information). If the audio is higher
// than the highest supported sample-rate, down-sample to the highest
// sample-rate supported by the codec. This is the case when encoding high
// samplerate audio to opus.
AutoTArray<int, 16> supportedSampleRates;
IterateZeroTerminated(codec->supported_samplerates,
[&supportedSampleRates](int aRate) mutable {
supportedSampleRates.AppendElement(aRate);
});
supportedSampleRates.Sort();
for (const auto& rate : supportedSampleRates) {
if (mInputSampleRate == rate) {
mConfig.mSampleRate = rate;
break;
}
if (mInputSampleRate < rate) {
// This rate is the smallest supported rate above the content's rate.
mConfig.mSampleRate = rate;
break;
}
if (mInputSampleRate > rate) {
mConfig.mSampleRate = rate;
}
}
}
if (mConfig.mSampleRate != AssertedCast<uint32_t>(mInputSampleRate)) {
// Need to resample to targetRate
int err;
SpeexResamplerState* resampler = speex_resampler_init(
mConfig.mNumberOfChannels, mInputSampleRate, mConfig.mSampleRate,
SPEEX_RESAMPLER_QUALITY_DEFAULT, &err);
if (!err) {
mResampler.reset(resampler);
} else {
FFMPEG_LOG(
"Error creating resampler in FFmpegAudioEncoder %dHz -> %dHz (%dch)",
mInputSampleRate, mConfig.mSampleRate, mConfig.mNumberOfChannels);
}
}
// And now the audio-specific part
mCodecContext->sample_rate = AssertedCast<int>(mConfig.mSampleRate);
mCodecContext->channels = AssertedCast<int>(mConfig.mNumberOfChannels);
@ -75,7 +129,7 @@ nsresult FFmpegAudioEncoder<LIBAV_VER>::InitSpecific() {
mLib->av_opt_set(mCodecContext->priv_data, "vbr", "off", 0);
}
}
// Override the time base: always the sample-rate in audio
// Override the time base: always the sample-rate the encoder is running at
mCodecContext->time_base =
AVRational{.num = 1, .den = mCodecContext->sample_rate};
@ -99,8 +153,6 @@ nsresult FFmpegAudioEncoder<LIBAV_VER>::InitSpecific() {
Result<MediaDataEncoder::EncodedData, nsresult>
FFmpegAudioEncoder<LIBAV_VER>::EncodeOnePacket(Span<float> aSamples,
uint32_t aChannels,
uint32_t aRate,
media::TimeUnit aPts) {
// Allocate AVFrame.
if (!PrepareFrame()) {
@ -108,13 +160,13 @@ FFmpegAudioEncoder<LIBAV_VER>::EncodeOnePacket(Span<float> aSamples,
return Err(NS_ERROR_OUT_OF_MEMORY);
}
uint32_t frameCount = aFrames.Length() / aChannels;
uint32_t frameCount = aSamples.Length() / mConfig.mNumberOfChannels;
// This method assumes that the audio has been packetized appropriately --
// packets smaller than the packet size are allowed when draining.
MOZ_ASSERT(AssertedCast<int>(frameCount) <= mCodecContext->frame_size);
mFrame->channels = AssertedCast<int>(aChannels);
mFrame->channels = AssertedCast<int>(mConfig.mNumberOfChannels);
# if LIBAVCODEC_VERSION_MAJOR >= 60
int rv = mLib->av_channel_layout_copy(&mFrame->ch_layout,
@ -126,16 +178,17 @@ FFmpegAudioEncoder<LIBAV_VER>::EncodeOnePacket(Span<float> aSamples,
}
# endif
mFrame->sample_rate = AssertedCast<int>(aRate);
mFrame->sample_rate = AssertedCast<int>(mConfig.mSampleRate);
// Not a mistake, nb_samples is per channel in ffmpeg
mFrame->nb_samples = AssertedCast<int>(frameCount);
// Audio is converted below if needed
mFrame->format = mCodecContext->sample_fmt;
// Set presentation timestamp and duration of the AVFrame.
# if LIBAVCODEC_VERSION_MAJOR >= 59
mFrame->time_base = AVRational{.num = 1, .den = static_cast<int>(aRate)};
mFrame->time_base =
AVRational{.num = 1, .den = static_cast<int>(mConfig.mSampleRate)};
# endif
mFrame->pts = aPts.ToTicksAtRate(aRate);
mFrame->pts = aPts.ToTicksAtRate(mConfig.mSampleRate);
mFrame->pkt_duration = frameCount;
# if LIBAVCODEC_VERSION_MAJOR >= 60
mFrame->duration = frameCount;
@ -163,9 +216,9 @@ FFmpegAudioEncoder<LIBAV_VER>::EncodeOnePacket(Span<float> aSamples,
aSamples.Length());
} else {
MOZ_ASSERT(mCodecContext->sample_fmt == AV_SAMPLE_FMT_FLTP);
for (uint32_t i = 0; i < aChannels; i++) {
for (uint32_t i = 0; i < mConfig.mNumberOfChannels; i++) {
DeinterleaveAndConvertBuffer(aSamples.data(), mFrame->nb_samples,
aChannels, mFrame->data);
mFrame->channels, mFrame->data);
}
}
@ -184,14 +237,13 @@ Result<MediaDataEncoder::EncodedData, nsresult> FFmpegAudioEncoder<
FFMPEG_LOG("Encoding %" PRIu32 " frames of audio at pts: %s",
sample->Frames(), sample->mTime.ToString().get());
// TODO: Validate input -- error out of sample rate or channel count is not
// what's expected
if (sample->mRate != mConfig.mSampleRate ||
if ((!mResampler && sample->mRate != mConfig.mSampleRate) ||
(mResampler &&
sample->mRate != AssertedCast<uint32_t>(mInputSampleRate)) ||
sample->mChannels != mConfig.mNumberOfChannels) {
FFMPEG_LOG(
"Rate or sample-rate mismatch at the input of the audio encoder, "
"erroring out");
"Rate or sample-rate at the inputof the encoder different from what "
"has been configured initially, erroring out");
return Result<MediaDataEncoder::EncodedData, nsresult>(
NS_ERROR_DOM_ENCODING_NOT_SUPPORTED_ERR);
}
@ -213,10 +265,32 @@ Result<MediaDataEncoder::EncodedData, nsresult> FFmpegAudioEncoder<
mFirstPacketPts = sample->mTime;
}
Span<float> audio = sample->Data();
if (mResampler) {
// Ensure that all input frames are consumed each time by oversizing the
// output buffer.
int bufferLengthGuess = std::ceil(2. * static_cast<float>(audio.size()) *
mConfig.mSampleRate / mInputSampleRate);
mTempBuffer.SetLength(bufferLengthGuess);
uint32_t inputFrames = audio.size() / mConfig.mNumberOfChannels;
uint32_t inputFramesProcessed = inputFrames;
uint32_t outputFrames = bufferLengthGuess / mConfig.mNumberOfChannels;
DebugOnly<int> rv = speex_resampler_process_interleaved_float(
mResampler.get(), audio.data(), &inputFramesProcessed,
mTempBuffer.Elements(), &outputFrames);
audio = Span<float>(mTempBuffer.Elements(),
outputFrames * mConfig.mNumberOfChannels);
MOZ_ASSERT(inputFrames == inputFramesProcessed,
"increate the buffer to consume all input each time");
MOZ_ASSERT(rv == RESAMPLER_ERR_SUCCESS);
}
EncodedData output;
MediaResult rv = NS_OK;
mPacketizer->Input(sample->Data().data(), sample->Frames());
mPacketizer->Input(audio.data(),
audio.Length() / mConfig.mNumberOfChannels);
// Dequeue and encode each packet
while (mPacketizer->PacketsAvailable() && rv.Code() == NS_OK) {
@ -252,8 +326,7 @@ FFmpegAudioEncoder<LIBAV_VER>::DrainWithModernAPIs() {
media::TimeUnit pts = mPacketizer->Drain(mTempBuffer.Elements(), written);
auto audio =
Span(mTempBuffer.Elements(), written * mPacketizer->ChannelCount());
auto encodeResult = EncodeOnePacket(audio, mPacketizer->ChannelCount(),
mConfig.mSampleRate, pts);
auto encodeResult = EncodeOnePacket(audio, pts);
if (encodeResult.isOk()) {
auto array = encodeResult.unwrap();
output.AppendElements(std::move(array));
@ -283,6 +356,7 @@ RefPtr<MediaRawData> FFmpegAudioEncoder<LIBAV_VER>::ToMediaRawData(
data->mTimecode = data->mTime;
data->mDuration =
media::TimeUnit(mCodecContext->frame_size, mConfig.mSampleRate);
// Add here the new rate
// Handle encoder delay
// Tracked in https://github.com/w3c/webcodecs/issues/626 because not quite
@ -299,9 +373,14 @@ RefPtr<MediaRawData> FFmpegAudioEncoder<LIBAV_VER>::ToMediaRawData(
data->mTime = mFirstPacketPts;
}
if (mPacketsDelivered++ == 0) {
// Attach extradata, and the config (including any channel / samplerate
// modification to fit the encoder requirements), if needed.
if (auto r = GetExtraData(aPacket); r.isOk()) {
data->mExtraData = r.unwrap();
}
data->mConfig = MakeUnique<EncoderConfig>(mConfig);
}
if (data->mExtraData) {
FFMPEG_LOG(
@ -318,13 +397,16 @@ RefPtr<MediaRawData> FFmpegAudioEncoder<LIBAV_VER>::ToMediaRawData(
}
Result<already_AddRefed<MediaByteBuffer>, nsresult>
FFmpegAudioEncoder<LIBAV_VER>::GetExtraData(AVPacket* aPacket) {
MOZ_ASSERT(aPacket);
// Create extra data.
FFmpegAudioEncoder<LIBAV_VER>::GetExtraData(AVPacket* /* aPacket */) {
if (!mCodecContext->extradata_size) {
return Err(NS_ERROR_NOT_AVAILABLE);
}
// Create extra data -- they are on the context.
auto extraData = MakeRefPtr<MediaByteBuffer>();
extraData->SetLength(mCodecContext->extradata_size);
MOZ_ASSERT(extraData);
PodCopy(extraData->Elements(), mCodecContext->extradata,
mCodecContext->extradata_size);
return extraData.forget();
}

View file

@ -14,6 +14,7 @@
// This must be the last header included
#include "FFmpegLibs.h"
#include "speex/speex_resampler.h"
namespace mozilla {
@ -34,8 +35,6 @@ class FFmpegAudioEncoder<LIBAV_VER> : public FFmpegDataEncoder<LIBAV_VER> {
virtual nsresult InitSpecific() override;
#if LIBAVCODEC_VERSION_MAJOR >= 58
Result<EncodedData, nsresult> EncodeOnePacket(Span<float> aSamples,
uint32_t aChannels,
uint32_t aRate,
media::TimeUnit aPts);
Result<EncodedData, nsresult> EncodeInputWithModernAPIs(
RefPtr<const MediaData> aSample) override;
@ -47,11 +46,20 @@ class FFmpegAudioEncoder<LIBAV_VER> : public FFmpegDataEncoder<LIBAV_VER> {
AVPacket* aPacket) override;
// Most audio codecs (except PCM) require a very specific frame size.
Maybe<TimedPacketizer<float, float>> mPacketizer;
// A temporary buffer kept around for shuffling audio frames, if needed.
// A temporary buffer kept around for shuffling audio frames, resampling,
// packetization, etc.
nsTArray<float> mTempBuffer;
// The pts of the first packet this encoder has seen, to be able to properly
// mark encoder delay as such.
media::TimeUnit mFirstPacketPts{media::TimeUnit::Invalid()};
struct ResamplerDestroy {
void operator()(SpeexResamplerState* aResampler);
};
// Rate at which this instance has been configured, which might be different
// from the rate the underlying encoder is running at.
int mInputSampleRate = 0;
UniquePtr<SpeexResamplerState, ResamplerDestroy> mResampler;
uint64_t mPacketsDelivered = 0;
};
} // namespace mozilla

View file

@ -464,9 +464,18 @@ AudioDecoderConfigInternal AudioEncoder::EncoderConfigToDecoderConfig(
const AudioEncoderConfigInternal& aOutputConfig) const {
MOZ_ASSERT(aOutputConfig.mSampleRate.isSome());
MOZ_ASSERT(aOutputConfig.mNumberOfChannels.isSome());
uint32_t sampleRate = aOutputConfig.mSampleRate.value();
uint32_t channelCount = aOutputConfig.mNumberOfChannels.value();
// Check if the encoder had to modify the settings because of codec
// constraints. e.g. FFmpegAudioEncoder can encode any sample-rate, but if the
// codec is Opus, then it will resample the audio one of the specific rates
// supported by the encoder.
if (aRawData->mConfig) {
sampleRate = aRawData->mConfig->mSampleRate;
channelCount = aRawData->mConfig->mNumberOfChannels;
}
return AudioDecoderConfigInternal(
aOutputConfig.mCodec, aOutputConfig.mSampleRate.value(),
aOutputConfig.mNumberOfChannels.value(),
aOutputConfig.mCodec, sampleRate, channelCount,
aRawData->mExtraData ? Some(aRawData->mExtraData) : Nothing());
}