/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "FFmpegAudioEncoder.h"

#include "FFmpegRuntimeLinker.h"
#include "FFmpegLog.h"
#include "FFmpegUtils.h"
#include "MediaData.h"

#include "AudioSegment.h"

namespace mozilla {

FFmpegAudioEncoder<LIBAV_VER>::FFmpegAudioEncoder(
    const FFmpegLibWrapper* aLib, AVCodecID aCodecID,
    const RefPtr<TaskQueue>& aTaskQueue, const EncoderConfig& aConfig)
    : FFmpegDataEncoder(aLib, aCodecID, aTaskQueue, aConfig) {}

nsCString FFmpegAudioEncoder<LIBAV_VER>::GetDescriptionName() const {
#ifdef USING_MOZFFVPX
  return "ffvpx audio encoder"_ns;
#else
  const char* lib =
#  if defined(MOZ_FFMPEG)
      FFmpegRuntimeLinker::LinkStatusLibraryName();
#  else
      "no library: ffmpeg disabled during build";
#  endif
  return nsPrintfCString("ffmpeg audio encoder (%s)", lib);
#endif
}

void FFmpegAudioEncoder<LIBAV_VER>::ResamplerDestroy::operator()(
    SpeexResamplerState* aResampler) {
  speex_resampler_destroy(aResampler);
}

nsresult FFmpegAudioEncoder<LIBAV_VER>::InitSpecific() {
  MOZ_ASSERT(mTaskQueue->IsOnCurrentThread());

  FFMPEG_LOG("FFmpegAudioEncoder::InitSpecific");

  // Initialize the common members of the encoder instance
  AVCodec* codec = FFmpegDataEncoder<LIBAV_VER>::InitCommon();
  if (!codec) {
    FFMPEG_LOG("FFmpegDataEncoder::InitCommon failed");
    return NS_ERROR_DOM_MEDIA_NOT_SUPPORTED_ERR;
  }

  // Find a compatible input rate for the codec, update the encoder config,
  // and note the rate at which this instance was configured.
  mInputSampleRate = AssertedCast<int>(mConfig.mSampleRate);
  if (codec->supported_samplerates) {
    // Ensure the sample-rate list is sorted, then iterate: either find that
    // the input sample rate is supported, or pick the smallest supported rate
    // just above the audio input sample-rate (so as not to lose information).
    // If the audio is higher than the highest supported sample-rate,
    // down-sample to the highest sample-rate supported by the codec. This is
    // the case when encoding high-sample-rate audio to Opus.
    AutoTArray<int, 16> supportedSampleRates;
    IterateZeroTerminated(codec->supported_samplerates,
                          [&supportedSampleRates](int aRate) mutable {
                            supportedSampleRates.AppendElement(aRate);
                          });
    supportedSampleRates.Sort();

    for (const auto& rate : supportedSampleRates) {
      if (mInputSampleRate == rate) {
        mConfig.mSampleRate = rate;
        break;
      }
      if (mInputSampleRate < rate) {
        // This rate is the smallest supported rate above the content's rate.
        mConfig.mSampleRate = rate;
        break;
      }
      if (mInputSampleRate > rate) {
        mConfig.mSampleRate = rate;
      }
    }
  }
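  // For example, ffmpeg's libopus encoder advertises 48000, 24000, 16000,
  // 12000 and 8000 Hz: 44100 Hz input is encoded at 48000 Hz (the first
  // supported rate above it), and 96000 Hz input is down-sampled to 48000 Hz
  // (the highest supported rate).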

  if (mConfig.mSampleRate != AssertedCast<uint32_t>(mInputSampleRate)) {
    // Need to resample the input to the rate the encoder was configured with.
    int err;
    SpeexResamplerState* resampler = speex_resampler_init(
        mConfig.mNumberOfChannels, mInputSampleRate, mConfig.mSampleRate,
        SPEEX_RESAMPLER_QUALITY_DEFAULT, &err);
    if (!err) {
      mResampler.reset(resampler);
    } else {
      FFMPEG_LOG(
          "Error creating resampler in FFmpegAudioEncoder %dHz -> %dHz (%dch)",
          mInputSampleRate, mConfig.mSampleRate, mConfig.mNumberOfChannels);
    }
  }
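
  // From here on, mConfig.mSampleRate is the rate the encoder runs at; input
  // arriving at mInputSampleRate is converted by this resampler in
  // EncodeInputWithModernAPIs before being handed to ffmpeg.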

  // And now the audio-specific part
  mCodecContext->sample_rate = AssertedCast<int>(mConfig.mSampleRate);
  mCodecContext->channels = AssertedCast<int>(mConfig.mNumberOfChannels);

#if LIBAVCODEC_VERSION_MAJOR >= 60
  // Gecko's ordering intentionally matches ffmpeg's ordering
  mLib->av_channel_layout_default(&mCodecContext->ch_layout,
                                  AssertedCast<int>(mCodecContext->channels));
#endif

  switch (mConfig.mCodec) {
    case CodecType::Opus:
      // When using libopus, ffmpeg supports interleaved float and s16 input.
      mCodecContext->sample_fmt = AV_SAMPLE_FMT_FLT;
      break;
    case CodecType::Vorbis:
      // When using libvorbis, ffmpeg only supports planar f32 input.
      mCodecContext->sample_fmt = AV_SAMPLE_FMT_FLTP;
      break;
    default:
      MOZ_ASSERT_UNREACHABLE("Not supported");
  }

  if (mConfig.mCodec == CodecType::Opus) {
    // Default is VBR
    if (mConfig.mBitrateMode == BitrateMode::Constant) {
      mLib->av_opt_set(mCodecContext->priv_data, "vbr", "off", 0);
    }
    if (mConfig.mCodecSpecific.isSome()) {
      MOZ_ASSERT(mConfig.mCodecSpecific->is<OpusSpecific>());
      const OpusSpecific& specific = mConfig.mCodecSpecific->as<OpusSpecific>();
      // This attribute maps directly to complexity
      mCodecContext->compression_level = specific.mComplexity;
      FFMPEG_LOG("Opus complexity set to %d", specific.mComplexity);
      float frameDurationMs =
          AssertedCast<float>(specific.mFrameDuration) / 1000.f;
      if (mLib->av_opt_set_double(mCodecContext->priv_data, "frame_duration",
                                  frameDurationMs, 0)) {
        FFMPEG_LOG("Error setting the frame duration on Opus encoder");
        return NS_ERROR_FAILURE;
      }
      FFMPEG_LOG("Opus frame duration set to %0.2f", frameDurationMs);
      if (specific.mPacketLossPerc) {
        if (mLib->av_opt_set_int(
                mCodecContext->priv_data, "packet_loss",
                AssertedCast<int64_t>(specific.mPacketLossPerc), 0)) {
          FFMPEG_LOG("Error setting the packet loss percentage to %" PRIu64
                     " on Opus encoder",
                     specific.mPacketLossPerc);
          return NS_ERROR_FAILURE;
        }
        FFMPEG_LOGV("Packet loss set to %d%% in Opus encoder",
                    AssertedCast<int>(specific.mPacketLossPerc));
      }
      if (specific.mUseInBandFEC) {
        if (mLib->av_opt_set(mCodecContext->priv_data, "fec", "on", 0)) {
          FFMPEG_LOG("Error %s FEC on Opus encoder",
                     specific.mUseInBandFEC ? "enabling" : "disabling");
          return NS_ERROR_FAILURE;
        }
        FFMPEG_LOGV("In-band FEC enabled for Opus encoder.");
      }
      if (specific.mUseDTX) {
        if (mLib->av_opt_set(mCodecContext->priv_data, "dtx", "on", 0)) {
          FFMPEG_LOG("Error %s DTX on Opus encoder",
                     specific.mUseDTX ? "enabling" : "disabling");
          return NS_ERROR_FAILURE;
        }
        // DTX packets are just a TOC byte and possibly one byte of length;
        // packets of 3 bytes or more are to be returned.
        mDtxThreshold = 3;
      }
      // TODO: format
      // https://bugzilla.mozilla.org/show_bug.cgi?id=1876066
    }
  }
  // Override the time base: always the sample-rate the encoder is running at
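  // With a 1/sample_rate time base, pts and durations are counted in audio
  // frames: e.g. at 48 kHz, a 20 ms Opus packet spans 960 ticks.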
  mCodecContext->time_base =
      AVRational{.num = 1, .den = mCodecContext->sample_rate};

  MediaResult rv = FinishInitCommon(codec);
  if (NS_FAILED(rv)) {
    FFMPEG_LOG("FFmpeg encode initialization failure.");
    return rv.Code();
  }

  return NS_OK;
}

// avcodec_send_frame and avcodec_receive_packet were introduced in version 58.
#if LIBAVCODEC_VERSION_MAJOR >= 58

Result<MediaDataEncoder::EncodedData, nsresult>
FFmpegAudioEncoder<LIBAV_VER>::EncodeOnePacket(Span<float> aSamples,
                                               media::TimeUnit aPts) {
  // Allocate AVFrame.
  if (!PrepareFrame()) {
    FFMPEG_LOG("failed to allocate frame");
    return Err(NS_ERROR_OUT_OF_MEMORY);
  }

  uint32_t frameCount = aSamples.Length() / mConfig.mNumberOfChannels;

  // This method assumes that the audio has been packetized appropriately --
  // packets smaller than the packet size are allowed when draining.
  MOZ_ASSERT(AssertedCast<int>(frameCount) <= mCodecContext->frame_size);

  mFrame->channels = AssertedCast<int>(mConfig.mNumberOfChannels);

#  if LIBAVCODEC_VERSION_MAJOR >= 60
  int rv = mLib->av_channel_layout_copy(&mFrame->ch_layout,
                                        &mCodecContext->ch_layout);
  if (rv < 0) {
    FFMPEG_LOG("channel layout copy error: %s",
               MakeErrorString(mLib, rv).get());
    return Err(NS_ERROR_DOM_MEDIA_FATAL_ERR);
  }
#  endif

  mFrame->sample_rate = AssertedCast<int>(mConfig.mSampleRate);
  // Not a mistake, nb_samples is per channel in ffmpeg
  mFrame->nb_samples = AssertedCast<int>(frameCount);
  // Audio is converted below if needed
  mFrame->format = mCodecContext->sample_fmt;
  // Set presentation timestamp and duration of the AVFrame.
#  if LIBAVCODEC_VERSION_MAJOR >= 59
  mFrame->time_base =
      AVRational{.num = 1, .den = static_cast<int>(mConfig.mSampleRate)};
#  endif
  mFrame->pts = aPts.ToTicksAtRate(mConfig.mSampleRate);
  mFrame->pkt_duration = frameCount;
#  if LIBAVCODEC_VERSION_MAJOR >= 60
  mFrame->duration = frameCount;
#  else
  // Save duration in the time_base unit.
  mDurationMap.Insert(mFrame->pts, mFrame->pkt_duration);
#  endif
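
  // Allocate the frame's data buffers; the second argument of
  // av_frame_get_buffer is the buffer alignment in bytes (16 here).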
  if (int ret = mLib->av_frame_get_buffer(mFrame, 16); ret < 0) {
    FFMPEG_LOG("failed to allocate frame data: %s",
               MakeErrorString(mLib, ret).get());
    return Err(NS_ERROR_OUT_OF_MEMORY);
  }

  // Make sure AVFrame is writable.
  if (int ret = mLib->av_frame_make_writable(mFrame); ret < 0) {
    FFMPEG_LOG("failed to make frame writable: %s",
               MakeErrorString(mLib, ret).get());
    return Err(NS_ERROR_DOM_MEDIA_FATAL_ERR);
  }

  // The input is always in f32 interleaved for now
  if (mCodecContext->sample_fmt == AV_SAMPLE_FMT_FLT) {
    PodCopy(reinterpret_cast<float*>(mFrame->data[0]), aSamples.data(),
            aSamples.Length());
  } else {
    MOZ_ASSERT(mCodecContext->sample_fmt == AV_SAMPLE_FMT_FLTP);
    // DeinterleaveAndConvertBuffer writes each channel to its own plane in
    // mFrame->data, so a single call handles all channels.
    DeinterleaveAndConvertBuffer(aSamples.data(), mFrame->nb_samples,
                                 mFrame->channels, mFrame->data);
  }

  // Now send the AVFrame to ffmpeg for encoding; this code is shared between
  // audio and video.
  return FFmpegDataEncoder<LIBAV_VER>::EncodeWithModernAPIs();
}

Result<MediaDataEncoder::EncodedData, nsresult> FFmpegAudioEncoder<
    LIBAV_VER>::EncodeInputWithModernAPIs(RefPtr<const MediaData> aSample) {
  MOZ_ASSERT(mTaskQueue->IsOnCurrentThread());
  MOZ_ASSERT(mCodecContext);
  MOZ_ASSERT(aSample);

  RefPtr<const AudioData> sample(aSample->As<AudioData>());

  FFMPEG_LOG("Encoding %" PRIu32 " frames of audio at pts: %s",
             sample->Frames(), sample->mTime.ToString().get());

  if ((!mResampler && sample->mRate != mConfig.mSampleRate) ||
      (mResampler &&
       sample->mRate != AssertedCast<uint32_t>(mInputSampleRate)) ||
      sample->mChannels != mConfig.mNumberOfChannels) {
    FFMPEG_LOG(
        "Sample rate or channel count at the input of the encoder differs "
        "from what was configured initially, erroring out");
    return Result<MediaDataEncoder::EncodedData, nsresult>(
        NS_ERROR_DOM_ENCODING_NOT_SUPPORTED_ERR);
  }

  // ffmpeg expects exactly sized input audio packets most of the time.
  // Packetization is performed if needed, and audio packets of the correct
  // size are fed to ffmpeg, with timestamps extrapolated from the timestamp
  // found on the input MediaData.

  if (!mPacketizer) {
    media::TimeUnit basePts = media::TimeUnit::Zero(mConfig.mSampleRate);
    basePts += sample->mTime;
    mPacketizer.emplace(mCodecContext->frame_size, sample->mChannels,
                        basePts.ToTicksAtRate(mConfig.mSampleRate),
                        mConfig.mSampleRate);
  }
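  // mCodecContext->frame_size is the packet size the codec operates on, e.g.
  // 960 frames (20 ms at 48 kHz) for Opus with the default frame duration.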

  if (!mFirstPacketPts.IsValid()) {
    mFirstPacketPts = sample->mTime;
  }
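  // mFirstPacketPts is compared to the first output packet's pts in
  // ToMediaRawData to detect and flag the encoder delay (pre-roll).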

  Span<float> audio = sample->Data();

  if (mResampler) {
    // Ensure that all input frames are consumed each time by oversizing the
    // output buffer.
    int bufferLengthGuess = std::ceil(2. * static_cast<float>(audio.size()) *
                                      mConfig.mSampleRate / mInputSampleRate);
    mTempBuffer.SetLength(bufferLengthGuess);
    uint32_t inputFrames = audio.size() / mConfig.mNumberOfChannels;
    uint32_t inputFramesProcessed = inputFrames;
    uint32_t outputFrames = bufferLengthGuess / mConfig.mNumberOfChannels;
    DebugOnly<int> rv = speex_resampler_process_interleaved_float(
        mResampler.get(), audio.data(), &inputFramesProcessed,
        mTempBuffer.Elements(), &outputFrames);
    audio = Span<float>(mTempBuffer.Elements(),
                        outputFrames * mConfig.mNumberOfChannels);
    MOZ_ASSERT(inputFrames == inputFramesProcessed,
               "increase the buffer to consume all input each time");
    MOZ_ASSERT(rv == RESAMPLER_ERR_SUCCESS);
  }
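  // Worked example of the size guess above: 441 stereo frames at 44.1 kHz
  // resampled to 48 kHz yields ceil(2 * 882 * 48000 / 44100) = 1920 samples,
  // i.e. room for 960 frames, ample for the ~480 frames actually produced.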

  EncodedData output;
  MediaResult rv = NS_OK;

  mPacketizer->Input(audio.data(), audio.Length() / mConfig.mNumberOfChannels);

  // Dequeue and encode each packet
  while (mPacketizer->PacketsAvailable() && rv.Code() == NS_OK) {
    mTempBuffer.SetLength(mCodecContext->frame_size *
                          mConfig.mNumberOfChannels);
    media::TimeUnit pts = mPacketizer->Output(mTempBuffer.Elements());
    auto audio = Span(mTempBuffer.Elements(), mTempBuffer.Length());
    FFMPEG_LOG("Encoding %" PRIu32 " frames, pts: %s",
               mPacketizer->PacketSize(), pts.ToString().get());
    auto encodeResult = EncodeOnePacket(audio, pts);
    if (encodeResult.isOk()) {
      output.AppendElements(std::move(encodeResult.unwrap()));
    } else {
      return encodeResult;
    }
    pts += media::TimeUnit(mPacketizer->PacketSize(), mConfig.mSampleRate);
  }
  return Result<MediaDataEncoder::EncodedData, nsresult>(std::move(output));
}

Result<MediaDataEncoder::EncodedData, nsresult>
FFmpegAudioEncoder<LIBAV_VER>::DrainWithModernAPIs() {
  // If there's no packetizer, or it's empty, we can proceed immediately.
  if (!mPacketizer || mPacketizer->FramesAvailable() == 0) {
    return FFmpegDataEncoder<LIBAV_VER>::DrainWithModernAPIs();
  }
  EncodedData output;
  MediaResult rv = NS_OK;
  // Encode the (possibly partial) packet remaining in the packetizer.
  mTempBuffer.SetLength(mCodecContext->frame_size *
                        mPacketizer->ChannelCount());
  uint32_t written;
  media::TimeUnit pts = mPacketizer->Drain(mTempBuffer.Elements(), written);
  auto audio =
      Span(mTempBuffer.Elements(), written * mPacketizer->ChannelCount());
  auto encodeResult = EncodeOnePacket(audio, pts);
  if (encodeResult.isOk()) {
    auto array = encodeResult.unwrap();
    output.AppendElements(std::move(array));
  } else {
    return encodeResult;
  }
  // Now, drain the encoder itself.
  auto drainResult = FFmpegDataEncoder<LIBAV_VER>::DrainWithModernAPIs();
  if (drainResult.isOk()) {
    auto array = drainResult.unwrap();
    output.AppendElements(std::move(array));
  } else {
    return drainResult;
  }
  return Result<MediaDataEncoder::EncodedData, nsresult>(std::move(output));
}
#endif  // if LIBAVCODEC_VERSION_MAJOR >= 58

RefPtr<MediaRawData> FFmpegAudioEncoder<LIBAV_VER>::ToMediaRawData(
    AVPacket* aPacket) {
  MOZ_ASSERT(mTaskQueue->IsOnCurrentThread());
  MOZ_ASSERT(aPacket);

  if (aPacket->size < mDtxThreshold) {
    FFMPEG_LOG(
        "DTX enabled and packet is %d bytes (threshold %d), not returning.",
        aPacket->size, mDtxThreshold);
    return nullptr;
  }

  RefPtr<MediaRawData> data = ToMediaRawDataCommon(aPacket);

  data->mTime = media::TimeUnit(aPacket->pts, mConfig.mSampleRate);
  data->mTimecode = data->mTime;
  data->mDuration =
      media::TimeUnit(mCodecContext->frame_size, mConfig.mSampleRate);

  // Handle encoder delay.
  // Tracked in https://github.com/w3c/webcodecs/issues/626 because this is
  // not quite specced yet.
  if (mFirstPacketPts > data->mTime) {
    data->mOriginalPresentationWindow =
        Some(media::TimeInterval{data->mTime, data->GetEndTime()});
    // Duration is likely to be adjusted when the above spec issue is fixed.
    // For now, leave it as-is.
    // data->mDuration -= (mFirstPacketPts - data->mTime);
    // if (data->mDuration.IsNegative()) {
    //   data->mDuration = media::TimeUnit::Zero();
    // }
    data->mTime = mFirstPacketPts;
  }

  if (mPacketsDelivered++ == 0) {
    // Attach extradata, and the config (including any channel / sample-rate
    // modification to fit the encoder requirements), if needed.
    if (auto r = GetExtraData(aPacket); r.isOk()) {
      data->mExtraData = r.unwrap();
    }
    data->mConfig = MakeUnique<EncoderConfig>(mConfig);
  }

  if (data->mExtraData) {
    FFMPEG_LOG(
        "FFmpegAudioEncoder out: [%s,%s] (%zu bytes, extradata %zu bytes)",
        data->mTime.ToString().get(), data->mDuration.ToString().get(),
        data->Size(), data->mExtraData->Length());
  } else {
    FFMPEG_LOG("FFmpegAudioEncoder out: [%s,%s] (%zu bytes)",
               data->mTime.ToString().get(), data->mDuration.ToString().get(),
               data->Size());
  }

  return data;
}

Result<already_AddRefed<MediaByteBuffer>, nsresult>
FFmpegAudioEncoder<LIBAV_VER>::GetExtraData(AVPacket* /* aPacket */) {
  if (!mCodecContext->extradata_size) {
    return Err(NS_ERROR_NOT_AVAILABLE);
  }
  // Create the extra data -- it lives on the codec context.
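  // What ffmpeg exposes here depends on the codec; for Opus, for instance,
  // this is the identification header the encoder generated.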
  auto extraData = MakeRefPtr<MediaByteBuffer>();
  extraData->SetLength(mCodecContext->extradata_size);
  MOZ_ASSERT(extraData);
  PodCopy(extraData->Elements(), mCodecContext->extradata,
          mCodecContext->extradata_size);
  return extraData.forget();
}

}  // namespace mozilla