I have made a small application that extracts the audio from an mp4 file, or simply converts an existing audio file to AAC (either raw AAC or AAC inside an mp4 container). When I run it with existing mp4 files as input, it extracts the audio properly and encodes it to mp4 (audio only: AAC), or directly to raw AAC (i.e. test.aac also works). But when I run it on mp3 files, the output clip plays faster than it should (a clip that is 1:12 long finishes playing after only 1:05).
Edit: I have made improvements to the code. The output no longer plays back too fast, but it still stops at 1:05 and the rest of the clip is missing (about 89% of the input is converted; the remaining 11% is lost).
Here is the code I have written to achieve this:
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <deque>
#include <queue>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
extern "C"
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libavdevice/avdevice.h"
#include "libswscale/swscale.h"
#include "libavutil/dict.h"
#include "libavutil/error.h"
#include "libavutil/opt.h"
#include <libavutil/fifo.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
/* ---- Input side: demuxer, audio stream selection and decoder ---- */
/* Input container context; opened by avformat_open_input() in open_audio_input(). */
AVFormatContext* fmt_ctx= NULL;
/* Index of the audio stream inside fmt_ctx; -1 until one has been chosen. */
int audio_stream_index = -1;
/* Decoder context / decoder that open_audio_input() passes to avcodec_open2(). */
AVCodecContext * codec_ctx_audio = NULL;
AVCodec* codec_audio = NULL;
/* Frame that avcodec_decode_audio4() fills in decode_packet(). */
AVFrame* decoded_frame = NULL;
/* Plane-pointer array that receives the output of swr_convert(). */
uint8_t** audio_dst_data = NULL;
/* Set by avcodec_decode_audio4(); reset to 0 at the start of decode_frame(). */
int got_frame = 0;
/* Reset in decode_frame(); not read anywhere in the visible code. */
int audiobufsize = 0;
/* Packet most recently returned by av_read_frame(). */
AVPacket input_packet;
/* Line size reported by av_samples_alloc() and byte size computed in decode_packet(). */
int audio_dst_linesize = 0;
int audio_dst_bufsize = 0;
/* Sample-format converter configured in open_audio_input(). */
SwrContext * swr = NULL;
/* ---- Output side: muxer, audio stream and encoder ---- */
AVOutputFormat * output_format = NULL ;
AVFormatContext * output_fmt_ctx= NULL;
AVStream * audio_st = NULL;
AVCodec * audio_codec = NULL;
/* Progress value computed in main() from audio_st->pts and printed there. */
double audio_pts = 0.0;
/* Frame handed to the encoder. NOTE(review): a file-scope initialiser that calls
 * a function is only valid in C++, so this file must be built as C++. */
AVFrame * out_frame = avcodec_alloc_frame();
/* Samples per encoder frame; assigned in open_audio(). */
int audio_input_frame_size = 0;
/* Not referenced anywhere in the visible code. */
uint8_t * audio_data_buf = NULL;
uint8_t * audio_out = NULL;
/* Encoder parameters; main() copies them from codec_ctx_audio and
 * add_audio_stream() applies them to the output stream. */
int audio_bit_rate;
int audio_sample_rate;
int audio_channels;
/* ---- Forward declarations ---- */
int decode_packet();
int open_audio_input(char* src_filename);
int decode_frame();
int open_encoder(char* output_filename);
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id);
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st);
void close_audio(AVFormatContext *oc, AVStream *st);
void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize);
/*
 * Opens src_filename and prepares the global audio decoding state.
 *
 * Visible steps: open the container into fmt_ctx, probe the stream info, dump
 * the format description, open codec_ctx_audio with codec_audio, configure the
 * swr converter, allocate decoded_frame, and allocate the audio_dst_data
 * plane-pointer array with every entry set to NULL.
 *
 * Returns -1 on the failures marked below.
 *
 * NOTE(review): this listing has lost its braces and several statements (the
 * body of the stream-search loop, the assignments of codec_ctx_audio and
 * codec_audio, and the final return). Restore them from the real source before
 * relying on the control flow shown here.
 */
int open_audio_input(char* src_filename)
int i =0;
/* open input file, and allocate format context */
if (avformat_open_input(&fmt_ctx, src_filename, NULL, NULL) < 0)
fprintf(stderr, "Could not open source file %s\n", src_filename);
/* NOTE(review): no return is visible after the message above; confirm that a
 * failed open stops here, because the next call uses fmt_ctx. */
// Retrieve stream information
if(avformat_find_stream_info(fmt_ctx, NULL)<0)
return -1; // Couldn't find stream information
// Dump information about file onto standard error
av_dump_format(fmt_ctx, 0, src_filename, 0);
// Walk the streams looking for the one to decode (the loop body is not visible
// in this listing; presumably it sets audio_stream_index for the audio stream)
for(i=0; i<fmt_ctx->nb_streams; i++)
if ( audio_stream_index != -1 )
// Get a pointer to the codec context for the audio stream
// Find the decoder for the audio stream
if(codec_audio==NULL) {
fprintf(stderr, "Unsupported audio codec!\n");
return -1; // Codec not found
// Open codec
AVDictionary *codecDictOptions = NULL;
if(avcodec_open2(codec_ctx_audio, codec_audio, &codecDictOptions)<0)
return -1; // Could not open codec
// Set up SWR context once you've got codec information.
// Channel layout and sample rate are the same on both sides; only the sample
// format changes (decoder format in, packed signed 16-bit out).
// NOTE(review): no swr_init() call is visible after these options are set;
// confirm the context is initialised before swr_convert() is used.
swr = swr_alloc();
av_opt_set_int(swr, "in_channel_layout", codec_ctx_audio->channel_layout, 0);
av_opt_set_int(swr, "out_channel_layout", codec_ctx_audio->channel_layout, 0);
av_opt_set_int(swr, "in_sample_rate", codec_ctx_audio->sample_rate, 0);
av_opt_set_int(swr, "out_sample_rate", codec_ctx_audio->sample_rate, 0);
av_opt_set_sample_fmt(swr, "in_sample_fmt", codec_ctx_audio->sample_fmt, 0);
av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
// Allocate audio frame
if ( decoded_frame == NULL ) decoded_frame = avcodec_alloc_frame();
int nb_planes = 0;
// audio_stream is not used in the visible remainder of this function.
AVStream* audio_stream = fmt_ctx->streams[audio_stream_index];
// One pointer per channel for planar decoder formats, a single pointer otherwise.
nb_planes = av_sample_fmt_is_planar(codec_ctx_audio->sample_fmt) ? codec_ctx_audio->channels : 1;
int tempSize = sizeof(uint8_t *) * nb_planes;
audio_dst_data = (uint8_t**)av_mallocz(tempSize);
if (!audio_dst_data)
fprintf(stderr, "Could not allocate audio data buffers\n");
/* NOTE(review): no return is visible after the message above, yet the loop
 * below writes through audio_dst_data. */
// decode_packet() tests audio_dst_data[0] == NULL to decide when to allocate.
for ( int i = 0 ; i < nb_planes ; i ++ )
audio_dst_data[i] = NULL;
int decode_frame()
int rv = 0;
got_frame = 0;
if ( fmt_ctx == NULL )
return rv;
int ret = 0;
audiobufsize = 0;
rv = av_read_frame(fmt_ctx, &input_packet);
if ( rv < 0 )
return rv;
rv = decode_packet();
// Free the input_packet that was allocated by av_read_frame
return rv;
/*
 * Decodes input_packet when it belongs to the selected audio stream and, if a
 * frame was produced, converts it through swr into audio_dst_data.
 *
 * Side effects: sets got_frame, may allocate audio_dst_data[0], and updates
 * audio_dst_linesize and audio_dst_bufsize.
 *
 * Returns the value of avcodec_decode_audio4(), or 0 for packets of other
 * streams.
 *
 * NOTE(review): braces are missing from this listing; the nesting described in
 * the comments below is inferred from the statement order - confirm against
 * the real source.
 */
int decode_packet()
int rv = 0;
int ret = 0;
//audio stream?
if(input_packet.stream_index == audio_stream_index)
/* decode audio frame */
rv = avcodec_decode_audio4(codec_ctx_audio, decoded_frame, &got_frame, &input_packet);
if (rv < 0)
fprintf(stderr, "Error decoding audio frame\n");
/* NOTE(review): the early return below is commented out, so a decode error is
 * only logged and execution carries on to the got_frame check. */
//return ret;
if (got_frame)
/* Allocate the conversion buffer on first use, or after the caller has reset
 * audio_dst_data[0] to NULL. */
if ( audio_dst_data[0] == NULL )
/* NOTE(review): the buffer is sized with the DECODER's sample format, while
 * swr is configured to write AV_SAMPLE_FMT_S16 into it. The two sizes match
 * only when both formats use the same bytes per sample - confirm. */
ret = av_samples_alloc(audio_dst_data, &audio_dst_linesize, decoded_frame->channels,
decoded_frame->nb_samples, (AVSampleFormat)decoded_frame->format, 1);
if (ret < 0)
fprintf(stderr, "Could not allocate audio buffer\n");
/* TODO: extend return code of the av_samples_* functions so that this call is not needed */
/* NOTE(review): this mixes the ENCODER's channel count with the decoder's
 * sample count and sample format. */
audio_dst_bufsize = av_samples_get_buffer_size(NULL, audio_st->codec->channels,
decoded_frame->nb_samples, (AVSampleFormat)decoded_frame->format, 1);
//int16_t* outputBuffer = ...;
/* The output count is out_frame->nb_samples, which write_audio_frame() sets to
 * audio_input_frame_size, so at most one encoder frame of samples is pulled
 * per decoded frame. The return value (samples actually written) is discarded.
 * NOTE(review): when a decoded frame holds more samples than one encoder frame
 * (for example 1152 against 1024), the surplus presumably stays buffered
 * inside swr until something pulls it out; this looks like the reason the
 * output ends early - confirm. */
swr_convert( swr, audio_dst_data, out_frame->nb_samples, (const uint8_t**) decoded_frame->extended_data, decoded_frame->nb_samples );
/* copy audio data to destination buffer:
 * this is required since rawaudio expects non aligned data */
//av_samples_copy(audio_dst_data, decoded_frame->data, 0, 0,
// decoded_frame->nb_samples, decoded_frame->channels, (AVSampleFormat)decoded_frame->format);
return rv;
/*
 * Creates the output container for output_filename, adds and opens the audio
 * encoder stream, opens the output file when the format needs one, and writes
 * the container header.
 *
 * Side effects: sets output_fmt_ctx, output_format, audio_st and audio_codec.
 *
 * Returns 0 on success and a negative value on failure.
 *
 * NOTE(review): braces are missing from this listing, so the exact extent of
 * each error branch has to be confirmed against the real source.
 */
int open_encoder(char* output_filename )
int rv = 0;
/* allocate the output media context */
AVOutputFormat *opfmt = NULL;
/* With no explicit format, the container is chosen from the file name. */
avformat_alloc_output_context2(&output_fmt_ctx, opfmt, NULL, output_filename);
if (!output_fmt_ctx) {
printf("Could not deduce output format from file extension: using MPEG.\n");
avformat_alloc_output_context2(&output_fmt_ctx, NULL, "mpeg", output_filename);
if (!output_fmt_ctx) {
rv = -1;
/* NOTE(review): the next line dereferences output_fmt_ctx; confirm it is only
 * reached when the allocation above succeeded. */
output_format = output_fmt_ctx->oformat;
/* Add the audio stream using the default format codecs
 * and initialize the codecs. */
audio_st = NULL;
if ( output_fmt_ctx )
if (output_format->audio_codec != AV_CODEC_ID_NONE)
audio_st = add_audio_stream(output_fmt_ctx, &audio_codec, output_format->audio_codec);
/* Now that all the parameters are set, we can open the audio and
 * video codecs and allocate the necessary encode buffers. */
if (audio_st)
rv = open_audio(output_fmt_ctx, audio_codec, audio_st);
if ( rv < 0 ) return rv;
av_dump_format(output_fmt_ctx, 0, output_filename, 1);
/* open the output file, if needed */
if (!(output_format->flags & AVFMT_NOFILE))
if (avio_open(&output_fmt_ctx->pb, output_filename, AVIO_FLAG_WRITE) < 0) {
fprintf(stderr, "Could not open '%s'\n", output_filename);
rv = -1;
/* Write the stream header, if any. */
if (avformat_write_header(output_fmt_ctx, NULL) < 0)
fprintf(stderr, "Error occurred when opening output file\n");
/* NOTE(review): as listed, this assignment is not inside the error branch
 * above and would make the function always return -1; presumably the braces
 * were lost - confirm. */
rv = -1;
return rv;
/*
 * Looks up the encoder for codec_id, adds a new stream for it to oc and fills
 * in the encoder parameters from the audio_* globals.
 *
 * oc        output container that receives the new stream
 * codec     out-parameter; receives the encoder found for codec_id
 * codec_id  identifier of the encoder to use
 *
 * Returns the new stream.
 *
 * NOTE(review): the listing shows no exit after either error message, so as
 * shown a missing encoder or a failed stream allocation falls through to code
 * that dereferences st - confirm against the real source.
 */
AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
enum AVCodecID codec_id)
AVCodecContext *c;
AVStream *st;
/* find the audio encoder */
*codec = avcodec_find_encoder(codec_id);
if (!(*codec)) {
fprintf(stderr, "Could not find codec\n");
st = avformat_new_stream(oc, *codec);
if (!st) {
fprintf(stderr, "Could not allocate stream\n");
st->id = 1;
c = st->codec;
/* put sample parameters */
/* The sample format is fixed to packed signed 16-bit, matching the output
 * format configured on swr. NOTE(review): presumably this must be a format the
 * selected encoder accepts - verify against the encoder's supported formats. */
c->sample_fmt = AV_SAMPLE_FMT_S16;
c->bit_rate = audio_bit_rate;
c->sample_rate = audio_sample_rate;
c->channels = audio_channels;
// some formats want stream headers to be separate
/* NOTE(review): the statement guarded by this test is not visible in the
 * listing; as shown, the return below would become its body. */
if (oc->oformat->flags & AVFMT_GLOBALHEADER)
return st;
/*
 * Opens the encoder attached to st and records how many samples the encoder
 * expects per frame in audio_input_frame_size.
 *
 * oc     output container (not used in the visible code)
 * codec  encoder to open
 * st     output stream whose codec context is opened
 *
 * Returns the avcodec_open2() result, or -1 on the error path shown below.
 */
int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st)
int ret=0;
AVCodecContext *c;
/* NOTE(review): this copies the input container's duration straight into the
 * output stream; the two fields presumably use different time units - confirm
 * whether a rescale is needed. */
st->duration = fmt_ctx->duration;
c = st->codec;
/* open it */
ret = avcodec_open2(c, codec, NULL) ;
if ( ret < 0)
fprintf(stderr, "could not open codec\n");
/* NOTE(review): as listed, this return is unconditional; presumably it belongs
 * to the error branch above and the braces were lost. */
return -1;
/* Encoders that accept any frame length get a fixed chunk of 10000 samples. */
if (c->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE)
audio_input_frame_size = 10000;
/* NOTE(review): no else is visible, so as listed this assignment always
 * overrides the value set above - confirm. */
audio_input_frame_size = c->frame_size;
/* NOTE(review): the expression below is cut off in this listing (its last
 * operand and terminator are missing). */
int tempSize = audio_input_frame_size *
av_get_bytes_per_sample(c->sample_fmt) *
return ret;
/*
 * Called from main() after the delayed frames have been written.
 * NOTE(review): the body of this function is not visible in this listing, so
 * what it releases cannot be determined from here.
 */
void close_audio(AVFormatContext *oc, AVStream *st)
void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize)
AVFormatContext *oc = output_fmt_ctx;
AVStream *st = audio_st;
if ( oc == NULL || st == NULL ) return;
AVCodecContext *c;
AVPacket pkt = { 0 }; // data and size must be 0;
int got_packet;
c = st->codec;
out_frame->nb_samples = audio_input_frame_size;
int buf_size = audio_src_bufsize *
av_get_bytes_per_sample(c->sample_fmt) *
avcodec_fill_audio_frame(out_frame, c->channels, c->sample_fmt,
(uint8_t *) *audio_src_data,
buf_size, 1);
avcodec_encode_audio2(c, &pkt, out_frame, &got_packet);
if (!got_packet)
if (pkt.pts != AV_NOPTS_VALUE)
pkt.pts = av_rescale_q(pkt.pts, st->codec->time_base, st->time_base);
if (pkt.dts != AV_NOPTS_VALUE)
pkt.dts = av_rescale_q(pkt.dts, st->codec->time_base, st->time_base);
if ( c && c->coded_frame && c->coded_frame->key_frame)
pkt.flags |= AV_PKT_FLAG_KEY;
pkt.stream_index = st->index;
pkt.flags |= AV_PKT_FLAG_KEY;
/* Write the compressed frame to the media file. */
if (av_interleaved_write_frame(oc, &pkt) != 0)
fprintf(stderr, "Error while writing audio frame\n");
/*
 * Flushes the encoder: keeps asking it for packets without supplying new input
 * until it has nothing left, and writes each packet to the output.
 *
 * oc  output container the packets are written to
 * st  output stream whose encoder is flushed
 *
 * Stops at the first encoder or muxer error after reporting it.
 */
void write_delayed_frames(AVFormatContext *oc, AVStream *st)
{
    AVCodecContext *c;
    int got_output = 1;

    if (oc == NULL || st == NULL)
        return;
    c = st->codec;

    while (got_output) {
        AVPacket pkt;
        int ret;

        /* Start every round from a clean packet: optional fields unset and no
         * payload, so the encoder allocates the buffer itself instead of
         * treating the previous packet's data as a caller-supplied buffer. */
        av_init_packet(&pkt);
        pkt.data = NULL;
        pkt.size = 0;
        got_output = 0;

        /* A NULL frame asks the encoder for the data it is still holding. */
        ret = avcodec_encode_audio2(c, &pkt, NULL, &got_output);
        if (ret < 0) {
            fprintf(stderr, "error encoding frame\n");
            break;
        }
        if (!got_output)
            break;

        /* Convert timestamps from the encoder time base to the stream time base. */
        if (pkt.pts != AV_NOPTS_VALUE)
            pkt.pts = av_rescale_q(pkt.pts, c->time_base, st->time_base);
        if (pkt.dts != AV_NOPTS_VALUE)
            pkt.dts = av_rescale_q(pkt.dts, c->time_base, st->time_base);
        if (c->coded_frame && c->coded_frame->key_frame)
            pkt.flags |= AV_PKT_FLAG_KEY;
        pkt.stream_index = st->index;

        /* Write the compressed frame to the media file. */
        ret = av_interleaved_write_frame(oc, &pkt);
        if (ret < 0) {
            fprintf(stderr, "Error while writing audio frame\n");
            break;
        }
    }
}
/*
 * Program entry point: converts the hard-coded input file to the hard-coded
 * output file by decoding frames with decode_frame() and encoding them with
 * write_audio_frame(), then flushes the encoder and closes the audio stream.
 *
 * NOTE(review): this listing has lost several statements - the call that
 * registers formats and codecs, the call that opens the input, the loop
 * construct around the decode/encode steps, and parts of two expressions.
 * The comments below describe only what is visible.
 */
int main(int argc, char **argv)
/* register all formats and codecs */
int i =0;
char src_filename[90] = "mp3.mp3";
char dst_filename[90] = "test.mp4";
/* The encoder is set up with the decoder's bit rate, sample rate and channel
 * count. NOTE(review): codec_ctx_audio is dereferenced here, so the input must
 * already have been opened; that call is not visible in this listing. */
audio_bit_rate = codec_ctx_audio->bit_rate;
audio_sample_rate = codec_ctx_audio->sample_rate;
audio_channels = codec_ctx_audio->channels;
open_encoder( dst_filename );
/* Read and decode the next packet; a negative result means no packet could be
 * read (the statement guarded by the test below is not visible). */
int rv = decode_frame();
if ( rv < 0 )
if (audio_st)
/* NOTE(review): the right-hand side of this expression is cut off in the
 * listing. */
audio_pts = (double)audio_st->pts.val * audio_st->time_base.num /
audio_pts = 0.0;
if ( codec_ctx_audio )
if ( got_frame)
write_audio_frame( audio_dst_data, audio_dst_bufsize );
/* Resetting plane 0 makes decode_packet() allocate a fresh buffer next time.
 * NOTE(review): no free of the old buffer is visible before the pointer is
 * cleared - confirm it is released in the real source. */
if ( audio_dst_data[0] )
audio_dst_data[0] = NULL;
printf("\naudio_pts: %.3f", audio_pts);
/* Drain the packets the encoder is still holding, then release the stream. */
write_delayed_frames( output_fmt_ctx, audio_st );
close_audio( output_fmt_ctx, audio_st);
return 0;
I have been looking at this problem from many angles for about two days now, but I can't seem to figure out what I'm doing wrong.
Note also: the printf() statement I've inserted shows audio_pts going up to 64.551 (about 1:05), which also shows that the encoder does not cover the full duration of the input file (1:12).
Can anyone please point out what I may be doing wrong?
Thanks in advance for any guidance!
P.S. When the conversion is run from the command line, e.g. ffmpeg -i test.mp3 test.mp4, ffmpeg converts the file just fine.