fune/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.cpp
Cristian Tuns bbbd3c280f Backed out 5 changesets (bug 1889978) for causing mda failures in mochitest_compat.toml CLOSED TREE
Backed out changeset ce0c34e548fa (bug 1889978)
Backed out changeset 96e08e04fa4d (bug 1889978)
Backed out changeset 2aeb87615ee0 (bug 1889978)
Backed out changeset 2272c3a73ad3 (bug 1889978)
Backed out changeset dc81d0c812b8 (bug 1889978)
2024-04-29 15:02:55 -04:00

499 lines
18 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "FFmpegAudioDecoder.h"
#include "AudioSampleFormat.h"
#include "FFmpegLog.h"
#include "TimeUnits.h"
#include "VideoUtils.h"
#include "BufferReader.h"
#include "libavutil/dict.h"
#include "libavutil/samplefmt.h"
#if defined(FFVPX_VERSION)
# include "libavutil/channel_layout.h"
#endif
#include "mozilla/StaticPrefs_media.h"
#include "mozilla/Telemetry.h"
namespace mozilla {
using TimeUnit = media::TimeUnit;
// Constructs an audio decoder for the codec derived from the audio config's
// MIME type and, where codec-specific configuration is available, stages a
// private copy of it in mExtraData.
FFmpegAudioDecoder<LIBAV_VER>::FFmpegAudioDecoder(
    FFmpegLibWrapper* aLib, const CreateDecoderParams& aDecoderParams)
    : FFmpegDataDecoder(aLib, GetCodecId(aDecoderParams.AudioConfig().mMimeType,
                                         aDecoderParams.AudioConfig())),
      mAudioInfo(aDecoderParams.AudioConfig()) {
  MOZ_COUNT_CTOR(FFmpegAudioDecoder);

  // mExtraData always gets a fresh MediaByteBuffer holding a copy of the
  // blob, because the object is modified during initialization.
  const auto stageExtraData = [this](const MediaByteBuffer& aBlob) {
    mExtraData = new MediaByteBuffer;
    mExtraData->AppendElements(aBlob);
  };

  switch (mCodecID) {
    case AV_CODEC_ID_AAC:
      if (mAudioInfo.mCodecSpecificConfig.is<AacCodecSpecificData>()) {
        // FFmpeg expects the DecoderConfigDescriptor blob.
        stageExtraData(
            *mAudioInfo.mCodecSpecificConfig.as<AacCodecSpecificData>()
                 .mDecoderConfigDescriptorBinaryBlob);
        FFMPEG_LOG("FFmpegAudioDecoder ctor (aac)");
        return;
      }
      // AAC without AAC-specific data falls through to the generic path.
      break;

    case AV_CODEC_ID_MP3:
      // Nothing to stage.
      return;

    case AV_CODEC_ID_FLAC:
      MOZ_DIAGNOSTIC_ASSERT(
          mAudioInfo.mCodecSpecificConfig.is<FlacCodecSpecificData>());
      // Tolerate bad data in builds where the assert above is compiled out.
      // Once that assert has shipped for a while without firing, this check
      // can become unconditional.
      if (mAudioInfo.mCodecSpecificConfig.is<FlacCodecSpecificData>()) {
        const auto& streamInfo =
            mAudioInfo.mCodecSpecificConfig.as<FlacCodecSpecificData>()
                .mStreamInfoBinaryBlob;
        // Header-less FLAC has no stream info; feeding FFmpeg empty extra
        // data would make it fail, so leave mExtraData untouched.
        if (!streamInfo->IsEmpty()) {
          stageExtraData(*streamInfo);
        }
        return;
      }
      break;

    default:
      break;
  }

  // Vorbis and Opus are handled here, as is any codec that did not return
  // above.
  RefPtr<MediaByteBuffer> genericBlob =
      GetAudioCodecSpecificBlob(mAudioInfo.mCodecSpecificConfig);
  if (genericBlob && genericBlob->Length()) {
    stageExtraData(*genericBlob);
  }

  if (mCodecID == AV_CODEC_ID_OPUS) {
    mDefaultPlaybackDeviceMono = aDecoderParams.mOptions.contains(
        CreateDecoderParams::Option::DefaultPlaybackDeviceMono);
  }
}
// Initializes the underlying FFmpeg decoder and reports the outcome through
// an InitPromise: resolved with TrackInfo::kAudioTrack on success, rejected
// with the MediaResult returned by InitDecoder otherwise.
RefPtr<MediaDataDecoder::InitPromise> FFmpegAudioDecoder<LIBAV_VER>::Init() {
  AVDictionary* decoderOptions = nullptr;

  // Opus encodes wide stereo by putting the channels 180 degrees out of
  // phase. That improves quality but has to be switched off when the output
  // is downmixed to mono. AudioSink picks the playback channel count with the
  // same `DecideAudioPlaybackChannels()` call and downmixes when needed.
  const bool isOpus = mCodecID == AV_CODEC_ID_OPUS;
  if (isOpus && (mDefaultPlaybackDeviceMono ||
                 DecideAudioPlaybackChannels(mAudioInfo) == 1)) {
    mLib->av_dict_set(&decoderOptions, "apply_phase_inv", "false", 0);
  }

  MediaResult initResult = InitDecoder(&decoderOptions);
  // The dictionary is released whether or not initialization succeeded.
  mLib->av_dict_free(&decoderOptions);

  if (NS_SUCCEEDED(initResult)) {
    return InitPromise::CreateAndResolve(TrackInfo::kAudioTrack, __func__);
  }
  return InitPromise::CreateAndReject(initResult, __func__);
}
// Configures mCodecContext before the codec is opened: thread count, the
// requested sample format and, for FFVPX builds, channel layout and sample
// rate taken from mAudioInfo.
void FFmpegAudioDecoder<LIBAV_VER>::InitCodecContext() {
  MOZ_ASSERT(mCodecContext);

  // Do not leave this at 0: FFmpeg would then default to the number of
  // cores, and get_cpu_count is not implemented in our mozlibavutil.
  mCodecContext->thread_count = 1;

  // This is only a hint to FFmpeg about the sample format to produce.
  // LibAV 0.8 emits rubbish for interleaved float, so ask it for 16-bit
  // audio instead.
  if (mLib->mVersion == 53) {
    mCodecContext->request_sample_fmt = AV_SAMPLE_FMT_S16;
  } else {
    mCodecContext->request_sample_fmt = AV_SAMPLE_FMT_FLT;
  }

#if defined(FFVPX_VERSION)
  // The first 32 bits of AudioInfo's layout are bit-for-bit compatible with
  // WAVEFORMATEXTENSIBLE and FFmpeg's AVChannel enum, so a cast is enough.
  const int channelCount = AssertedCast<int>(mAudioInfo.mChannels);
  mCodecContext->ch_layout.nb_channels = channelCount;

  const bool channelMapKnown =
      mAudioInfo.mChannelMap != AudioConfig::ChannelLayout::UNKNOWN_MAP;
  if (!channelMapKnown) {
    mLib->av_channel_layout_default(&mCodecContext->ch_layout, channelCount);
  } else {
    mLib->av_channel_layout_from_mask(
        &mCodecContext->ch_layout,
        AssertedCast<uint64_t>(mAudioInfo.mChannelMap));
  }

  mCodecContext->sample_rate = AssertedCast<int>(mAudioInfo.mRate);
#endif
}
static AlignedAudioBuffer CopyAndPackAudio(AVFrame* aFrame,
uint32_t aNumChannels,
uint32_t aNumAFrames) {
AlignedAudioBuffer audio(aNumChannels * aNumAFrames);
if (!audio) {
return audio;
}
if (aFrame->format == AV_SAMPLE_FMT_FLT) {
// Audio data already packed. No need to do anything other than copy it
// into a buffer we own.
memcpy(audio.get(), aFrame->data[0],
aNumChannels * aNumAFrames * sizeof(AudioDataValue));
} else if (aFrame->format == AV_SAMPLE_FMT_FLTP) {
// Planar audio data. Pack it into something we can understand.
AudioDataValue* tmp = audio.get();
AudioDataValue** data = reinterpret_cast<AudioDataValue**>(aFrame->data);
for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
for (uint32_t channel = 0; channel < aNumChannels; channel++) {
*tmp++ = data[channel][frame];
}
}
} else if (aFrame->format == AV_SAMPLE_FMT_S16) {
// Audio data already packed. Need to convert from S16 to 32 bits Float
AudioDataValue* tmp = audio.get();
int16_t* data = reinterpret_cast<int16_t**>(aFrame->data)[0];
for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
for (uint32_t channel = 0; channel < aNumChannels; channel++) {
*tmp++ = ConvertAudioSample<float>(*data++);
}
}
} else if (aFrame->format == AV_SAMPLE_FMT_S16P) {
// Planar audio data. Convert it from S16 to 32 bits float
// and pack it into something we can understand.
AudioDataValue* tmp = audio.get();
int16_t** data = reinterpret_cast<int16_t**>(aFrame->data);
for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
for (uint32_t channel = 0; channel < aNumChannels; channel++) {
*tmp++ = ConvertAudioSample<float>(data[channel][frame]);
}
}
} else if (aFrame->format == AV_SAMPLE_FMT_S32) {
// Audio data already packed. Need to convert from S16 to 32 bits Float
AudioDataValue* tmp = audio.get();
int32_t* data = reinterpret_cast<int32_t**>(aFrame->data)[0];
for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
for (uint32_t channel = 0; channel < aNumChannels; channel++) {
*tmp++ = ConvertAudioSample<float>(*data++);
}
}
} else if (aFrame->format == AV_SAMPLE_FMT_S32P) {
// Planar audio data. Convert it from S32 to 32 bits float
// and pack it into something we can understand.
AudioDataValue* tmp = audio.get();
int32_t** data = reinterpret_cast<int32_t**>(aFrame->data);
for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
for (uint32_t channel = 0; channel < aNumChannels; channel++) {
*tmp++ = ConvertAudioSample<float>(data[channel][frame]);
}
}
} else if (aFrame->format == AV_SAMPLE_FMT_U8) {
// Interleaved audio data. Convert it from u8 to the expected sample-format
AudioDataValue* tmp = audio.get();
uint8_t* data = reinterpret_cast<uint8_t**>(aFrame->data)[0];
for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
for (uint32_t channel = 0; channel < aNumChannels; channel++) {
*tmp++ = ConvertAudioSample<float>(*data++);
}
}
} else if (aFrame->format == AV_SAMPLE_FMT_U8P) {
// Planar audio data. Convert it from u8 to the expected sample-format
// and pack it into something we can understand.
AudioDataValue* tmp = audio.get();
uint8_t** data = reinterpret_cast<uint8_t**>(aFrame->data);
for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
for (uint32_t channel = 0; channel < aNumChannels; channel++) {
*tmp++ = ConvertAudioSample<float>(data[channel][frame]);
}
}
}
return audio;
}
using ChannelLayout = AudioConfig::ChannelLayout;

// Turns the frame currently held in mFrame into an AudioData and appends it
// to aResults.
//
// aDecoded    not used by this function.
// aSample     input sample; its mTime becomes the output timestamp and its
//             mOffset the output offset.
// aResults    receives the new AudioData.
// aGotFrame   optional; set to true once an AudioData has been appended.
// aSubmitted  only affects logging: a negative value logs that this is an
//             additional frame from the same packet.
//
// Returns NS_OK on success, NS_ERROR_DOM_MEDIA_DECODE_ERR for a sample format
// this decoder does not convert, NS_ERROR_OUT_OF_MEMORY if the output buffer
// could not be allocated, and NS_ERROR_DOM_MEDIA_OVERFLOW_ERR if the duration
// or the end time of the frame is not a valid TimeUnit.
//
// NOTE(review): the timestamp is always aSample->mTime, so when several
// frames come out of one packet they all carry the same time.
MediaResult FFmpegAudioDecoder<LIBAV_VER>::PostProcessOutput(
    bool aDecoded, MediaRawData* aSample, DecodedData& aResults,
    bool* aGotFrame, int32_t aSubmitted) {
  const media::TimeUnit pts = aSample->mTime;

  // Only the formats CopyAndPackAudio knows how to convert are accepted.
  switch (mFrame->format) {
    case AV_SAMPLE_FMT_FLT:
    case AV_SAMPLE_FMT_FLTP:
    case AV_SAMPLE_FMT_S16:
    case AV_SAMPLE_FMT_S16P:
    case AV_SAMPLE_FMT_S32:
    case AV_SAMPLE_FMT_S32P:
    case AV_SAMPLE_FMT_U8:
    case AV_SAMPLE_FMT_U8P:
      break;
    default:
      return MediaResult(
          NS_ERROR_DOM_MEDIA_DECODE_ERR,
          RESULT_DETAIL(
              "FFmpeg audio decoder outputs unsupported audio format"));
  }

  if (aSubmitted < 0) {
    FFMPEG_LOG("Got %d more frame from packet", mFrame->nb_samples);
  }

  FFMPEG_LOG("FFmpegAudioDecoder decoded: [%s,%s] (Duration: %s) [%s]",
             aSample->mTime.ToString().get(),
             aSample->GetEndTime().ToString().get(),
             aSample->mDuration.ToString().get(),
             mLib->av_get_sample_fmt_name(mFrame->format));

  // Prefer what the codec context reports; fall back to the container-level
  // audio info when the context leaves a value at zero.
  uint32_t numChannels = mCodecContext->channels;
  uint32_t samplingRate = mCodecContext->sample_rate;
  if (!numChannels) {
    numChannels = mAudioInfo.mChannels;
  }
  if (!samplingRate) {
    samplingRate = mAudioInfo.mRate;
  }

  AlignedAudioBuffer audio =
      CopyAndPackAudio(mFrame, numChannels, mFrame->nb_samples);
  if (!audio) {
    FFMPEG_LOG("CopyAndPackAudio error (OOM)");
    return MediaResult(NS_ERROR_OUT_OF_MEMORY, __func__);
  }

  media::TimeUnit duration = TimeUnit(mFrame->nb_samples, samplingRate);
  if (!duration.IsValid()) {
    // samplingRate is unsigned, so it is logged with an unsigned specifier.
    FFMPEG_LOG("Duration isn't valid (%d + %" PRIu32 ")", mFrame->nb_samples,
               samplingRate);
    return MediaResult(NS_ERROR_DOM_MEDIA_OVERFLOW_ERR,
                       RESULT_DETAIL("Invalid sample duration"));
  }

  // The end time is computed only to verify that it does not overflow.
  const media::TimeUnit newpts = pts + duration;
  if (!newpts.IsValid()) {
    FFMPEG_LOG("New pts isn't valid (%lf + %lf)", pts.ToSeconds(),
               duration.ToSeconds());
    return MediaResult(
        NS_ERROR_DOM_MEDIA_OVERFLOW_ERR,
        RESULT_DETAIL("Invalid count of accumulated audio samples"));
  }

  RefPtr<AudioData> data =
      new AudioData(aSample->mOffset, pts, std::move(audio), numChannels,
                    samplingRate, mCodecContext->channel_layout);
  MOZ_ASSERT(duration == data->mDuration, "must be equal");
  aResults.AppendElement(std::move(data));

  if (aGotFrame) {
    *aGotFrame = true;
  }
  return NS_OK;
}
#if LIBAVCODEC_VERSION_MAJOR < 59
// Decodes aPacket with the legacy avcodec_decode_audio4 API and, if a frame
// was produced, converts it and appends it to aResults.
//
// aPacket    packet handed to the decoder.
// aDecoded   out: true when the decoder reported exactly one frame.
// aSample    input sample, forwarded to PostProcessOutput.
// aResults   receives the decoded audio.
// aGotFrame  optional, forwarded to PostProcessOutput.
//
// Returns NS_ERROR_DOM_MEDIA_DECODE_ERR when the decoder fails, the result of
// PostProcessOutput when a frame was produced, and NS_OK when the call
// succeeded without producing a frame.
MediaResult FFmpegAudioDecoder<LIBAV_VER>::DecodeUsingFFmpeg(
    AVPacket* aPacket, bool& aDecoded, MediaRawData* aSample,
    DecodedData& aResults, bool* aGotFrame) {
  int gotFrame = 0;
  const int rv =
      mLib->avcodec_decode_audio4(mCodecContext, mFrame, &gotFrame, aPacket);
  aDecoded = gotFrame == 1;
  if (rv < 0) {
    NS_WARNING("FFmpeg audio decoder error.");
    return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
                       RESULT_DETAIL("FFmpeg audio error"));
  }

  // A successful call does not imply a frame: the decoder signals that
  // through gotFrame. Without a frame there is nothing valid in mFrame to
  // post-process.
  if (!gotFrame) {
    return NS_OK;
  }

  // Propagate conversion failures (unsupported format, OOM, overflow) rather
  // than reporting success with no output.
  return PostProcessOutput(aDecoded, aSample, aResults, aGotFrame, 0);
}
#else
# define AVRESULT_OK 0
// Decodes aPacket with the avcodec_send_packet / avcodec_receive_frame API,
// converting every frame the decoder hands back and appending it to aResults.
//
// aPacket    packet handed to the decoder.
// aDecoded   out: whether the most recent avcodec_receive_frame call
//            produced a frame.
// aSample    input sample, forwarded to PostProcessOutput.
// aResults   receives the decoded audio.
// aGotFrame  optional, forwarded to PostProcessOutput.
//
// Returns NS_ERROR_DOM_MEDIA_END_OF_STREAM when the packet is refused with
// AVERROR_EOF, NS_ERROR_DOM_MEDIA_DECODE_ERR when sending the packet or
// receiving a frame fails, any failure reported by PostProcessOutput, and
// NS_OK otherwise.
MediaResult FFmpegAudioDecoder<LIBAV_VER>::DecodeUsingFFmpeg(
    AVPacket* aPacket, bool& aDecoded, MediaRawData* aSample,
    DecodedData& aResults, bool* aGotFrame) {
  // Incremented whenever avcodec_send_packet succeeds and decremented
  // whenever avcodec_receive_frame succeeds. A single AVPacket can yield
  // several AVFrames, so this can go negative. PostProcessOutput receives it
  // to tell the first frame of a packet from the following ones.
  int32_t submitted = 0;

  int ret = mLib->avcodec_send_packet(mCodecContext, aPacket);
  switch (ret) {
    case AVRESULT_OK:
      submitted++;
      break;
    case AVERROR(EAGAIN):
      // Not expected: every frame is drained below before the next packet
      // is sent.
      FFMPEG_LOG(" av_codec_send_packet: EAGAIN.");
      MOZ_ASSERT(false, "EAGAIN");
      break;
    case AVERROR_EOF:
      FFMPEG_LOG(" End of stream.");
      return MediaResult(NS_ERROR_DOM_MEDIA_END_OF_STREAM,
                         RESULT_DETAIL("End of stream"));
    default:
      NS_WARNING("FFmpeg audio decoder error (avcodec_send_packet).");
      return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
                         RESULT_DETAIL("FFmpeg audio error"));
  }

  // Pull frames until the decoder stops returning success.
  while (ret == 0) {
    aDecoded = false;
    ret = mLib->avcodec_receive_frame(mCodecContext, mFrame);
    switch (ret) {
      case AVRESULT_OK: {
        aDecoded = true;
        submitted--;
        if (submitted < 0) {
          FFMPEG_LOG("Multiple AVFrame from a single AVPacket");
        }
        // Propagate conversion failures (unsupported format, OOM, overflow)
        // rather than reporting success with missing output.
        MediaResult rv = PostProcessOutput(aDecoded, aSample, aResults,
                                           aGotFrame, submitted);
        if (NS_FAILED(rv)) {
          return rv;
        }
        break;
      }
      case AVERROR(EAGAIN): {
        // Quirk of the vorbis decoder -- the first packet doesn't return
        // audio. Emit an empty AudioData for it.
        if (submitted == 1 && mCodecID == AV_CODEC_ID_VORBIS) {
          AlignedAudioBuffer buf;
          aResults.AppendElement(
              new AudioData(0, TimeUnit::Zero(), std::move(buf),
                            mAudioInfo.mChannels, mAudioInfo.mRate));
        }
        // submitted is signed, so it is logged with a signed specifier.
        FFMPEG_LOG(" EAGAIN (packets submitted: %" PRId32 ").", submitted);
        break;
      }
      case AVERROR_EOF:
        // Treated as a normal end of this call so that frames collected so
        // far are still delivered. Sending a packet after this point hits the
        // AVERROR_EOF case above, which reports end of stream.
        FFMPEG_LOG(" End of stream.");
        break;
      default:
        FFMPEG_LOG(" avcodec_receive_packet error.");
        NS_WARNING("FFmpeg audio decoder error (avcodec_receive_packet).");
        return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
                           RESULT_DETAIL("FFmpeg audio error"));
    }
  }
  return NS_OK;
}
#endif
// Decodes one input buffer. Wraps aData/aSize in an AVPacket, prepares
// mFrame and delegates to DecodeUsingFFmpeg, which appends any decoded audio
// to aResults.
//
// aSample    sample the data belongs to; used for logging and passed on.
// aData      start of the encoded bytes.
// aSize      number of encoded bytes.
// aGotFrame  optional; reset to false here before decoding starts.
// aResults   receives the decoded audio.
//
// Returns NS_ERROR_OUT_OF_MEMORY if the frame cannot be prepared, otherwise
// the failure reported by DecodeUsingFFmpeg or NS_OK. Must run on
// mTaskQueue.
MediaResult FFmpegAudioDecoder<LIBAV_VER>::DoDecode(MediaRawData* aSample,
                                                    uint8_t* aData, int aSize,
                                                    bool* aGotFrame,
                                                    DecodedData& aResults) {
  MOZ_ASSERT(mTaskQueue->IsOnCurrentThread());
  PROCESS_DECODE_LOG(aSample);

  FFMPEG_LOG("FFmpegAudioDecoder::DoDecode: %d bytes, [%s,%s] (Duration: %s)",
             aSize, aSample->mTime.ToString().get(),
             aSample->GetEndTime().ToString().get(),
             aSample->mDuration.ToString().get());

  // The packet only borrows the caller's bytes.
  AVPacket packet;
  mLib->av_init_packet(&packet);
  packet.data = aData;
  packet.size = aSize;

  if (aGotFrame) {
    *aGotFrame = false;
  }

  if (!PrepareFrame()) {
    FFMPEG_LOG("FFmpegAudioDecoder: OOM in PrepareFrame");
    return MediaResult(
        NS_ERROR_OUT_OF_MEMORY,
        RESULT_DETAIL("FFmpeg audio decoder failed to allocate frame"));
  }

  bool decoded = false;
  MediaResult rv =
      DecodeUsingFFmpeg(&packet, decoded, aSample, aResults, aGotFrame);
  NS_ENSURE_SUCCESS(rv, rv);
  return NS_OK;
}
// Maps a MIME type to the FFmpeg codec id used to decode it.
//
// aMimeType  MIME type of the audio track.
// aInfo      audio info; only its mBitDepth is read, to pick the integer PCM
//            variant for wav content.
//
// Returns AV_CODEC_ID_NONE when the type is not handled. Everything except
// AAC is only recognized when FFVPX_VERSION is defined.
AVCodecID FFmpegAudioDecoder<LIBAV_VER>::GetCodecId(const nsACString& aMimeType,
                                                    const AudioInfo& aInfo) {
  if (aMimeType.EqualsLiteral("audio/mp4a-latm")) {
    return AV_CODEC_ID_AAC;
  }

#ifdef FFVPX_VERSION
  if (aMimeType.EqualsLiteral("audio/mpeg")) {
    return AV_CODEC_ID_MP3;
  }
  if (aMimeType.EqualsLiteral("audio/flac")) {
    return AV_CODEC_ID_FLAC;
  }
  if (aMimeType.EqualsLiteral("audio/vorbis")) {
    return AV_CODEC_ID_VORBIS;
  }
  if (aMimeType.EqualsLiteral("audio/opus")) {
    return AV_CODEC_ID_OPUS;
  }

  // Everything below is the wav family.
  if (aMimeType.Find("wav") == kNotFound) {
    return AV_CODEC_ID_NONE;
  }

  const bool isIntegerPcm =
      aMimeType.EqualsLiteral("audio/x-wav") ||
      aMimeType.EqualsLiteral("audio/wave; codecs=1") ||
      aMimeType.EqualsLiteral("audio/wave; codecs=65534");
  if (isIntegerPcm) {
    // Pick the PCM flavour from the bit depth.
    switch (aInfo.mBitDepth) {
      case 8:
        return AV_CODEC_ID_PCM_U8;
      case 0:
        // No bit depth yet: this happens when looking for a decoder for a
        // media type before the bytestream has been inspected. ::Init will
        // find and use the right type; here we only need to return something
        // that says the type can be decoded, and this decoder handles all
        // the usual PCM bytestreams anyway.
      case 16:
        return AV_CODEC_ID_PCM_S16LE;
      case 24:
        return AV_CODEC_ID_PCM_S24LE;
      case 32:
        return AV_CODEC_ID_PCM_S32LE;
      default:
        return AV_CODEC_ID_NONE;
    }
  }

  if (aMimeType.EqualsLiteral("audio/wave; codecs=3")) {
    return AV_CODEC_ID_PCM_F32LE;
  }
  // A-law
  if (aMimeType.EqualsLiteral("audio/wave; codecs=6")) {
    return AV_CODEC_ID_PCM_ALAW;
  }
  // Mu-law
  if (aMimeType.EqualsLiteral("audio/wave; codecs=7")) {
    return AV_CODEC_ID_PCM_MULAW;
  }
#endif

  return AV_CODEC_ID_NONE;
}
// Returns the name FFmpeg's codec descriptor gives to mCodecID, or "unknown"
// when no descriptor is available (always the case for libavcodec 53 and
// older, where the lookup is not used).
nsCString FFmpegAudioDecoder<LIBAV_VER>::GetCodecName() const {
#if LIBAVCODEC_VERSION_MAJOR > 53
  // avcodec_descriptor_get returns null when it has no descriptor for the
  // id, so the result must be checked before it is dereferenced.
  if (const auto* descriptor = mLib->avcodec_descriptor_get(mCodecID)) {
    return nsCString(descriptor->name);
  }
#endif
  return "unknown"_ns;
}
// Destructor. Pairs with the MOZ_COUNT_CTOR in the constructor; it performs
// no other work itself.
FFmpegAudioDecoder<LIBAV_VER>::~FFmpegAudioDecoder() {
MOZ_COUNT_DTOR(FFmpegAudioDecoder);
}
} // namespace mozilla