fune/dom/media/platforms/ffmpeg/FFmpegAudioDecoder.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "FFmpegAudioDecoder.h"
#include "AudioSampleFormat.h"
#include "FFmpegLog.h"
#include "TimeUnits.h"
#include "VideoUtils.h"
#include "BufferReader.h"
#include "libavutil/dict.h"
#include "libavutil/samplefmt.h"
#if defined(FFVPX_VERSION)
#  include "libavutil/channel_layout.h"
#endif
#include "mozilla/StaticPrefs_media.h"
#include "mozilla/Telemetry.h"

namespace mozilla {

using TimeUnit = media::TimeUnit;

// Constructs an audio decoder for the track described by
// aDecoderParams.AudioConfig(). The codec id handed to the base class is
// derived from the track's MIME type via GetCodecId(). The body then copies the
// codec-specific configuration blob, when the track carries one, into a fresh
// mExtraData buffer.
FFmpegAudioDecoder<LIBAV_VER>::FFmpegAudioDecoder(
    FFmpegLibWrapper* aLib, const CreateDecoderParams& aDecoderParams)
    : FFmpegDataDecoder(aLib, GetCodecId(aDecoderParams.AudioConfig().mMimeType,
                                         aDecoderParams.AudioConfig())),
      mAudioInfo(aDecoderParams.AudioConfig()) {
  MOZ_COUNT_CTOR(FFmpegAudioDecoder);

  switch (mCodecID) {
    case AV_CODEC_ID_AAC:
      if (mAudioInfo.mCodecSpecificConfig.is<AacCodecSpecificData>()) {
        const AacCodecSpecificData& aacData =
            mAudioInfo.mCodecSpecificConfig.as<AacCodecSpecificData>();
        // FFmpeg wants the DecoderConfigDescriptor blob as its extra data.
        mExtraData = new MediaByteBuffer;
        mExtraData->AppendElements(
            *aacData.mDecoderConfigDescriptorBinaryBlob);
        FFMPEG_LOG("FFmpegAudioDecoder ctor (aac)");
        return;
      }
      // AAC without AAC-specific data: use the generic blob handling below.
      break;

    case AV_CODEC_ID_MP3:
      // MP3 needs no extra data.
      return;

    case AV_CODEC_ID_FLAC:
      MOZ_DIAGNOSTIC_ASSERT(
          mAudioInfo.mCodecSpecificConfig.is<FlacCodecSpecificData>());
      // Tolerate bad data gracefully. If the assert above is never hit once
      // this has shipped for a while, it can be removed and the code below
      // made unconditional.
      if (mAudioInfo.mCodecSpecificConfig.is<FlacCodecSpecificData>()) {
        const FlacCodecSpecificData& flacData =
            mAudioInfo.mCodecSpecificConfig.as<FlacCodecSpecificData>();
        if (flacData.mStreamInfoBinaryBlob->IsEmpty()) {
          // Header-less FLAC has no stream info. Handing FFmpeg empty extra
          // data makes it fail, so leave mExtraData unset.
          return;
        }
        // Copy into a new MediaByteBuffer because the object is modified
        // during initialization.
        mExtraData = new MediaByteBuffer;
        mExtraData->AppendElements(*flacData.mStreamInfoBinaryBlob);
        return;
      }
      break;

    default:
      break;
  }

  // Generic path; Vorbis and Opus end up here.
  RefPtr<MediaByteBuffer> codecBlob =
      GetAudioCodecSpecificBlob(mAudioInfo.mCodecSpecificConfig);
  if (codecBlob && codecBlob->Length()) {
    // Copy into a new MediaByteBuffer because the object is modified during
    // initialization.
    mExtraData = new MediaByteBuffer;
    mExtraData->AppendElements(*codecBlob);
  }

  if (mCodecID == AV_CODEC_ID_OPUS) {
    // Remembered for Init(), which uses it to decide whether to disable phase
    // inversion.
    mDefaultPlaybackDeviceMono = aDecoderParams.mOptions.contains(
        CreateDecoderParams::Option::DefaultPlaybackDeviceMono);
  }
}

// Initializes the underlying decoder through InitDecoder(), passing codec
// options where needed. Returns a promise resolved with
// TrackInfo::kAudioTrack on success, or rejected with InitDecoder()'s result
// on failure.
RefPtr<MediaDataDecoder::InitPromise> FFmpegAudioDecoder<LIBAV_VER>::Init() {
  AVDictionary* decoderOptions = nullptr;

  // Opus encodes wide stereo by putting the channels 180 degrees out of phase.
  // That improves quality but has to be turned off when the output gets
  // downmixed to mono. AudioSink picks the playback channel count with the
  // same `DecideAudioPlaybackChannels()` helper and triggers the downmix when
  // required.
  const bool disablePhaseInversion =
      mCodecID == AV_CODEC_ID_OPUS &&
      (mDefaultPlaybackDeviceMono ||
       DecideAudioPlaybackChannels(mAudioInfo) == 1);
  if (disablePhaseInversion) {
    mLib->av_dict_set(&decoderOptions, "apply_phase_inv", "false", 0);
  }

  MediaResult rv = InitDecoder(&decoderOptions);

  // Release the dictionary whether or not initialization succeeded.
  mLib->av_dict_free(&decoderOptions);

  if (NS_SUCCEEDED(rv)) {
    return InitPromise::CreateAndResolve(TrackInfo::kAudioTrack, __func__);
  }
  return InitPromise::CreateAndReject(rv, __func__);
}

// Fills in the audio-specific fields of mCodecContext: thread count, the
// requested sample format and, for FFVPX builds, the channel layout and sample
// rate taken from mAudioInfo.
void FFmpegAudioDecoder<LIBAV_VER>::InitCodecContext() {
  MOZ_ASSERT(mCodecContext);

  // Never leave this at 0: FFmpeg would then default to the number of cores,
  // and get_cpu_count isn't implemented in our mozlibavutil.
  mCodecContext->thread_count = 1;

  // This is only a hint to FFmpeg about the sample format to produce.
  if (mLib->mVersion == 53) {
    // LibAV 0.8 produces rubbish float interleaved samples, so ask for 16-bit
    // audio instead.
    mCodecContext->request_sample_fmt = AV_SAMPLE_FMT_S16;
  } else {
    mCodecContext->request_sample_fmt = AV_SAMPLE_FMT_FLT;
  }

#ifdef FFVPX_VERSION
  const int channelCount = AssertedCast<int>(mAudioInfo.mChannels);
  mCodecContext->ch_layout.nb_channels = channelCount;
  if (mAudioInfo.mChannelMap == AudioConfig::ChannelLayout::UNKNOWN_MAP) {
    // No explicit map: let FFmpeg choose its default layout for this channel
    // count.
    mLib->av_channel_layout_default(&mCodecContext->ch_layout, channelCount);
  } else {
    // The first 32 bits of AudioInfo's layout are bit-for-bit compatible with
    // WAVEFORMATEXTENSIBLE and FFmpeg's AVChannel enum, so a cast is enough.
    mLib->av_channel_layout_from_mask(
        &mCodecContext->ch_layout,
        AssertedCast<uint64_t>(mAudioInfo.mChannelMap));
  }
  mCodecContext->sample_rate = AssertedCast<int>(mAudioInfo.mRate);
#endif
}

static AlignedAudioBuffer CopyAndPackAudio(AVFrame* aFrame,
                                           uint32_t aNumChannels,
                                           uint32_t aNumAFrames) {
  AlignedAudioBuffer audio(aNumChannels * aNumAFrames);
  if (!audio) {
    return audio;
  }

  if (aFrame->format == AV_SAMPLE_FMT_FLT) {
    // Audio data already packed. No need to do anything other than copy it
    // into a buffer we own.
    memcpy(audio.get(), aFrame->data[0],
           aNumChannels * aNumAFrames * sizeof(AudioDataValue));
  } else if (aFrame->format == AV_SAMPLE_FMT_FLTP) {
    // Planar audio data. Pack it into something we can understand.
    AudioDataValue* tmp = audio.get();
    AudioDataValue** data = reinterpret_cast<AudioDataValue**>(aFrame->data);
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = data[channel][frame];
      }
    }
  } else if (aFrame->format == AV_SAMPLE_FMT_S16) {
    // Audio data already packed. Need to convert from S16 to 32 bits Float
    AudioDataValue* tmp = audio.get();
    int16_t* data = reinterpret_cast<int16_t**>(aFrame->data)[0];
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = ConvertAudioSample<float>(*data++);
      }
    }
  } else if (aFrame->format == AV_SAMPLE_FMT_S16P) {
    // Planar audio data. Convert it from S16 to 32 bits float
    // and pack it into something we can understand.
    AudioDataValue* tmp = audio.get();
    int16_t** data = reinterpret_cast<int16_t**>(aFrame->data);
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = ConvertAudioSample<float>(data[channel][frame]);
      }
    }
  } else if (aFrame->format == AV_SAMPLE_FMT_S32) {
    // Audio data already packed. Need to convert from S16 to 32 bits Float
    AudioDataValue* tmp = audio.get();
    int32_t* data = reinterpret_cast<int32_t**>(aFrame->data)[0];
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = ConvertAudioSample<float>(*data++);
      }
    }
  } else if (aFrame->format == AV_SAMPLE_FMT_S32P) {
    // Planar audio data. Convert it from S32 to 32 bits float
    // and pack it into something we can understand.
    AudioDataValue* tmp = audio.get();
    int32_t** data = reinterpret_cast<int32_t**>(aFrame->data);
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = ConvertAudioSample<float>(data[channel][frame]);
      }
    }
  } else if (aFrame->format == AV_SAMPLE_FMT_U8) {
    // Interleaved audio data. Convert it from u8 to the expected sample-format
    AudioDataValue* tmp = audio.get();
    uint8_t* data = reinterpret_cast<uint8_t**>(aFrame->data)[0];
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = ConvertAudioSample<float>(*data++);
      }
    }
  } else if (aFrame->format == AV_SAMPLE_FMT_U8P) {
    // Planar audio data. Convert it from u8 to the expected sample-format
    // and pack it into something we can understand.
    AudioDataValue* tmp = audio.get();
    uint8_t** data = reinterpret_cast<uint8_t**>(aFrame->data);
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = ConvertAudioSample<float>(data[channel][frame]);
      }
    }
  }

  return audio;
}

// Shorthand for AudioConfig::ChannelLayout.
using ChannelLayout = AudioConfig::ChannelLayout;

// Turns the frame currently held in mFrame into an AudioData and appends it to
// aResults.
//
// aDecoded   - not used by this function.
// aSample    - the input sample; supplies the offset and start time of the
//              output and is used for logging.
// aResults   - receives the new AudioData on success.
// aGotFrame  - when non-null, set to true on success.
// aSubmitted - when negative, an extra log line notes that this frame is an
//              additional one from the same packet.
//
// Returns NS_OK on success, or:
//  - NS_ERROR_DOM_MEDIA_DECODE_ERR if the frame's sample format is not one
//    CopyAndPackAudio handles,
//  - NS_ERROR_OUT_OF_MEMORY if the output buffer could not be allocated,
//  - NS_ERROR_DOM_MEDIA_OVERFLOW_ERR if the duration or end time is invalid.
MediaResult FFmpegAudioDecoder<LIBAV_VER>::PostProcessOutput(
    bool aDecoded, MediaRawData* aSample, DecodedData& aResults,
    bool* aGotFrame, int32_t aSubmitted) {
  media::TimeUnit pts = aSample->mTime;

  switch (mFrame->format) {
    case AV_SAMPLE_FMT_FLT:
    case AV_SAMPLE_FMT_FLTP:
    case AV_SAMPLE_FMT_S16:
    case AV_SAMPLE_FMT_S16P:
    case AV_SAMPLE_FMT_S32:
    case AV_SAMPLE_FMT_S32P:
    case AV_SAMPLE_FMT_U8:
    case AV_SAMPLE_FMT_U8P:
      break;
    default:
      return MediaResult(
          NS_ERROR_DOM_MEDIA_DECODE_ERR,
          RESULT_DETAIL(
              "FFmpeg audio decoder outputs unsupported audio format"));
  }

  if (aSubmitted < 0) {
    FFMPEG_LOG("Got %d more frame from packet", mFrame->nb_samples);
  }

  FFMPEG_LOG("FFmpegAudioDecoder decoded: [%s,%s] (Duration: %s) [%s]",
             aSample->mTime.ToString().get(),
             aSample->GetEndTime().ToString().get(),
             aSample->mDuration.ToString().get(),
             mLib->av_get_sample_fmt_name(mFrame->format));

  // Prefer what the codec context reports, and fall back to the container
  // metadata when the context has no value.
  uint32_t numChannels = mCodecContext->channels;
  uint32_t samplingRate = mCodecContext->sample_rate;
  if (!numChannels) {
    numChannels = mAudioInfo.mChannels;
  }
  if (!samplingRate) {
    samplingRate = mAudioInfo.mRate;
  }
  AlignedAudioBuffer audio =
      CopyAndPackAudio(mFrame, numChannels, mFrame->nb_samples);
  if (!audio) {
    FFMPEG_LOG("CopyAndPackAudio error (OOM)");
    return MediaResult(NS_ERROR_OUT_OF_MEMORY, __func__);
  }

  media::TimeUnit duration = TimeUnit(mFrame->nb_samples, samplingRate);
  if (!duration.IsValid()) {
    // samplingRate is unsigned, so it needs an unsigned conversion specifier.
    FFMPEG_LOG("Duration isn't valid (%d + %" PRIu32 ")", mFrame->nb_samples,
               samplingRate);
    return MediaResult(NS_ERROR_DOM_MEDIA_OVERFLOW_ERR,
                       RESULT_DETAIL("Invalid sample duration"));
  }

  // The end time is only computed to check that it is representable.
  media::TimeUnit newpts = pts + duration;
  if (!newpts.IsValid()) {
    FFMPEG_LOG("New pts isn't valid (%lf + %lf)", pts.ToSeconds(),
               duration.ToSeconds());
    return MediaResult(
        NS_ERROR_DOM_MEDIA_OVERFLOW_ERR,
        RESULT_DETAIL("Invalid count of accumulated audio samples"));
  }

  RefPtr<AudioData> data =
      new AudioData(aSample->mOffset, pts, std::move(audio), numChannels,
                    samplingRate, mCodecContext->channel_layout);
  MOZ_ASSERT(duration == data->mDuration, "must be equal");
  aResults.AppendElement(std::move(data));

  if (aGotFrame) {
    *aGotFrame = true;
  }
  return NS_OK;
}

#if LIBAVCODEC_VERSION_MAJOR < 59
// Decodes aPacket with the legacy avcodec_decode_audio4() API (libavcodec
// < 59).
//
// aDecoded is set to whether the decoder reported exactly one frame. When a
// frame was produced it is handed to PostProcessOutput(), which appends the
// resulting audio to aResults and sets *aGotFrame.
//
// Returns NS_ERROR_DOM_MEDIA_DECODE_ERR if the decoder reports an error, the
// result of PostProcessOutput() when a frame was produced, and NS_OK when the
// packet was consumed without yielding a frame.
MediaResult FFmpegAudioDecoder<LIBAV_VER>::DecodeUsingFFmpeg(
    AVPacket* aPacket, bool& aDecoded, MediaRawData* aSample,
    DecodedData& aResults, bool* aGotFrame) {
  int decoded = 0;
  int rv =
      mLib->avcodec_decode_audio4(mCodecContext, mFrame, &decoded, aPacket);
  aDecoded = decoded == 1;
  if (rv < 0) {
    NS_WARNING("FFmpeg audio decoder error.");
    return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
                       RESULT_DETAIL("FFmpeg audio error"));
  }
  if (!decoded) {
    // The decoder produced no frame, so there is nothing in mFrame to
    // post-process.
    return NS_OK;
  }
  // Propagate post-processing failures (unsupported format, OOM, overflow)
  // instead of reporting success with missing output.
  return PostProcessOutput(decoded, aSample, aResults, aGotFrame, 0);
}
#else
#  define AVRESULT_OK 0

// Decodes aPacket with the avcodec_send_packet() / avcodec_receive_frame() API
// (libavcodec >= 59).
//
// The packet is submitted once, then frames are pulled until the decoder stops
// returning them. Each frame is handed to PostProcessOutput(), which appends
// the resulting audio to aResults and sets *aGotFrame. aDecoded tracks whether
// the most recent receive call produced a frame.
//
// Returns:
//  - NS_OK once the decoder asks for more input (EAGAIN),
//  - NS_ERROR_DOM_MEDIA_END_OF_STREAM when the decoder signals end of stream,
//    on either the send or the receive side,
//  - NS_ERROR_DOM_MEDIA_DECODE_ERR on any other decoder error,
//  - the failing result of PostProcessOutput() if a frame could not be
//    converted.
// Frames appended to aResults before a failure are left in place.
MediaResult FFmpegAudioDecoder<LIBAV_VER>::DecodeUsingFFmpeg(
    AVPacket* aPacket, bool& aDecoded, MediaRawData* aSample,
    DecodedData& aResults, bool* aGotFrame) {
  // This is incremented whenever avcodec_send_packet succeeds, and decremented
  // whenever avcodec_receive_frame succeeds. Because it is possible to have
  // multiple AVFrames from a single AVPacket, this number can be negative.
  // This is used to ensure that pts and duration are correctly set on the
  // resulting audio buffers.
  int32_t submitted = 0;
  int ret = mLib->avcodec_send_packet(mCodecContext, aPacket);
  switch (ret) {
    case AVRESULT_OK:
      submitted++;
      break;
    case AVERROR(EAGAIN):
      // Not expected. In builds without assertions the receive loop below is
      // skipped because ret is non-zero.
      FFMPEG_LOG("  av_codec_send_packet: EAGAIN.");
      MOZ_ASSERT(false, "EAGAIN");
      break;
    case AVERROR_EOF:
      FFMPEG_LOG("  End of stream.");
      return MediaResult(NS_ERROR_DOM_MEDIA_END_OF_STREAM,
                         RESULT_DETAIL("End of stream"));
    default:
      NS_WARNING("FFmpeg audio decoder error (avcodec_send_packet).");
      return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
                         RESULT_DETAIL("FFmpeg audio error"));
  }

  // Outcome of the receive loop; stays NS_OK unless the decoder reports end of
  // stream or an error.
  MediaResult rv(NS_OK);

  while (ret == AVRESULT_OK) {
    aDecoded = false;
    ret = mLib->avcodec_receive_frame(mCodecContext, mFrame);
    switch (ret) {
      case AVRESULT_OK:
        aDecoded = true;
        submitted--;
        if (submitted < 0) {
          FFMPEG_LOG("Multiple AVFrame from a single AVPacket");
        }
        break;
      case AVERROR(EAGAIN): {
        // Quirk of the vorbis decoder -- the first packet doesn't return audio.
        if (submitted == 1 && mCodecID == AV_CODEC_ID_VORBIS) {
          AlignedAudioBuffer buf;
          aResults.AppendElement(
              new AudioData(0, TimeUnit::Zero(), std::move(buf),
                            mAudioInfo.mChannels, mAudioInfo.mRate));
        }
        // submitted is signed and may be negative, hence PRId32.
        FFMPEG_LOG("  EAGAIN (packets submitted: %" PRId32 ").", submitted);
        rv = NS_OK;
        break;
      }
      case AVERROR_EOF: {
        FFMPEG_LOG("  End of stream.");
        rv = MediaResult(NS_ERROR_DOM_MEDIA_END_OF_STREAM,
                         RESULT_DETAIL("End of stream"));
        break;
      }
      default:
        FFMPEG_LOG("  avcodec_receive_packet error.");
        NS_WARNING("FFmpeg audio decoder error (avcodec_receive_packet).");
        rv = MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
                         RESULT_DETAIL("FFmpeg audio error"));
        break;
    }
    if (aDecoded) {
      // Stop at the first frame that cannot be converted rather than silently
      // dropping it.
      MediaResult postProcessResult =
          PostProcessOutput(aDecoded, aSample, aResults, aGotFrame, submitted);
      if (NS_FAILED(postProcessResult)) {
        return postProcessResult;
      }
    }
  }

  // Report what ended the loop instead of unconditionally claiming success.
  return rv;
}
#endif

// Decodes one chunk of compressed audio.
//
// aSample   - the sample the data belongs to; used for logging and timing.
// aData     - pointer to the compressed bytes.
// aSize     - number of bytes at aData.
// aGotFrame - when non-null, cleared on entry; set by the decode path when
//             output is produced.
// aResults  - receives the decoded audio.
//
// Returns NS_ERROR_OUT_OF_MEMORY if the frame cannot be prepared, otherwise
// the failure reported by DecodeUsingFFmpeg(), or NS_OK.
// Asserts that it runs on mTaskQueue.
MediaResult FFmpegAudioDecoder<LIBAV_VER>::DoDecode(MediaRawData* aSample,
                                                    uint8_t* aData, int aSize,
                                                    bool* aGotFrame,
                                                    DecodedData& aResults) {
  MOZ_ASSERT(mTaskQueue->IsOnCurrentThread());
  PROCESS_DECODE_LOG(aSample);

  FFMPEG_LOG("FFmpegAudioDecoder::DoDecode: %d bytes, [%s,%s] (Duration: %s)",
             aSize, aSample->mTime.ToString().get(),
             aSample->GetEndTime().ToString().get(),
             aSample->mDuration.ToString().get());

  // Report "no frame" unless the decode path below says otherwise.
  if (aGotFrame) {
    *aGotFrame = false;
  }

  if (!PrepareFrame()) {
    FFMPEG_LOG("FFmpegAudioDecoder: OOM in PrepareFrame");
    return MediaResult(
        NS_ERROR_OUT_OF_MEMORY,
        RESULT_DETAIL("FFmpeg audio decoder failed to allocate frame"));
  }

  // Wrap the caller's bytes in a packet; the packet does not own them.
  AVPacket inputPacket;
  mLib->av_init_packet(&inputPacket);
  inputPacket.data = aData;
  inputPacket.size = aSize;

  bool frameDecoded = false;
  MediaResult rv = DecodeUsingFFmpeg(&inputPacket, frameDecoded, aSample,
                                     aResults, aGotFrame);
  NS_ENSURE_SUCCESS(rv, rv);
  return NS_OK;
}

// Maps a MIME type to the FFmpeg codec id used to decode it.
//
// "audio/mp4a-latm" is always recognized. The remaining types (MP3, FLAC,
// Vorbis, Opus and the WAV/PCM variants) are only recognized when
// FFVPX_VERSION is defined. For integer PCM, aInfo.mBitDepth selects the
// concrete codec.
//
// Returns AV_CODEC_ID_NONE for anything not recognized.
AVCodecID FFmpegAudioDecoder<LIBAV_VER>::GetCodecId(const nsACString& aMimeType,
                                                    const AudioInfo& aInfo) {
  if (aMimeType.EqualsLiteral("audio/mp4a-latm")) {
    return AV_CODEC_ID_AAC;
  }
#ifdef FFVPX_VERSION
  if (aMimeType.EqualsLiteral("audio/mpeg")) {
    return AV_CODEC_ID_MP3;
  }
  if (aMimeType.EqualsLiteral("audio/flac")) {
    return AV_CODEC_ID_FLAC;
  }
  if (aMimeType.EqualsLiteral("audio/vorbis")) {
    return AV_CODEC_ID_VORBIS;
  }
  if (aMimeType.EqualsLiteral("audio/opus")) {
    return AV_CODEC_ID_OPUS;
  }

  // Everything below belongs to the WAV family.
  if (aMimeType.Find("wav") == kNotFound) {
    return AV_CODEC_ID_NONE;
  }

  const bool isIntegerPcm =
      aMimeType.EqualsLiteral("audio/x-wav") ||
      aMimeType.EqualsLiteral("audio/wave; codecs=1") ||
      aMimeType.EqualsLiteral("audio/wave; codecs=65534");
  if (isIntegerPcm) {
    // The bit depth determines the PCM format.
    switch (aInfo.mBitDepth) {
      case 8:
        return AV_CODEC_ID_PCM_U8;
      case 0:
        // A depth of 0 occurs when a decoder is being chosen for a media type
        // before the actual bytestream has been inspected. ::Init will find
        // and use the right type; here we only need to return something that
        // signals the type is decodable. This decoder handles all the usual
        // PCM bytestreams anyway.
      case 16:
        return AV_CODEC_ID_PCM_S16LE;
      case 24:
        return AV_CODEC_ID_PCM_S24LE;
      case 32:
        return AV_CODEC_ID_PCM_S32LE;
      default:
        return AV_CODEC_ID_NONE;
    }
  }
  // IEEE float
  if (aMimeType.EqualsLiteral("audio/wave; codecs=3")) {
    return AV_CODEC_ID_PCM_F32LE;
  }
  // A-law
  if (aMimeType.EqualsLiteral("audio/wave; codecs=6")) {
    return AV_CODEC_ID_PCM_ALAW;
  }
  // Mu-law
  if (aMimeType.EqualsLiteral("audio/wave; codecs=7")) {
    return AV_CODEC_ID_PCM_MULAW;
  }
#endif

  return AV_CODEC_ID_NONE;
}

// Returns the name libavcodec associates with mCodecID.
//
// Returns "unknown" when built against libavcodec 53 or older, and also when
// the library has no descriptor (or no name) for the codec id.
nsCString FFmpegAudioDecoder<LIBAV_VER>::GetCodecName() const {
#if LIBAVCODEC_VERSION_MAJOR > 53
  // avcodec_descriptor_get() returns null for ids it has no descriptor for, so
  // check before dereferencing.
  const AVCodecDescriptor* descriptor = mLib->avcodec_descriptor_get(mCodecID);
  if (descriptor && descriptor->name) {
    return nsCString(descriptor->name);
  }
#endif
  return "unknown"_ns;
}

// Destructor. The body only invokes MOZ_COUNT_DTOR, the counterpart of the
// MOZ_COUNT_CTOR call in the constructor; it performs no explicit cleanup.
FFmpegAudioDecoder<LIBAV_VER>::~FFmpegAudioDecoder() {
  MOZ_COUNT_DTOR(FFmpegAudioDecoder);
}

}  // namespace mozilla