diff --git a/README.md b/README.md
index b6f48e3..64e2eb9 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ The package can be installed by adding `membrane_mp4_plugin` to your list of dep
 ```elixir
 defp deps do
   [
-    {:membrane_mp4_plugin, "~> 0.31.0"}
+    {:membrane_mp4_plugin, "~> 0.32.0"}
   ]
 end
 ```
diff --git a/lib/membrane_mp4/muxer/cmaf.ex b/lib/membrane_mp4/muxer/cmaf.ex
index fc2816b..52a7dd6 100644
--- a/lib/membrane_mp4/muxer/cmaf.ex
+++ b/lib/membrane_mp4/muxer/cmaf.ex
@@ -3,23 +3,45 @@ defmodule Membrane.MP4.Muxer.CMAF do
   Puts a payloaded stream into [Common Media Application Format](https://www.wowza.com/blog/what-is-cmaf),
   an MP4-based container commonly used in adaptive streaming over HTTP.
 
-  The element supports up to 2 input tracks that can result in one output:
-    - audio input -> audio output
-    - video input -> video output
-    - video input + audio input -> muxed audio and video output
+
+  ## Input/Output tracks matrix
+  The muxer's basic functionality is to take a single media stream and put it into a CMAF-formatted track.
+
+  Sometimes one may need to mux several tracks together or make sure that output tracks are
+  synchronized with each other. Such behavior is also supported by the muxer's implementation.
+
+  Each output pad can specify which input pads need to be muxed together by specifying the `:tracks` option.
+
+  One may also want to have separate output pads that are internally synchronized with each other (then 
+  the `:tracks` should contain only a single id). By synchronization we mean that the muxer will try its best
+  to produce equal length segments for output pads. The synchronization relies on the video track (the video 
+  track can only be cut at keyframe boundaries, audio track can be cut at any point).
+
+  This approach enforces that there is no more than a single video track. A video track is always used as a synchronization point 
+  therefore having more than one would make the synchronization decisions ambiguous. The number of audio tracks on the other
+  hand is not limited.
+
+  As a rule of thumb, if there is no need to synchronize tracks just use separate muxer instances.
+
+  The example matrix of possible input/output tracks is as follows:
+  - audio input -> audio output 
+  - video input -> video output
+  - audio input + video input  -> muxed audio/video output
+  - audio-1 input + ... + audio-n input + video input  -> audio-1 output + ... + audio-n output  + video output
 
   ## Media objects
   Accordingly to the spec, the `#{inspect(__MODULE__)}` is able to assemble the following media entities:
-    * `headers` - media initialization object. Contains information necessary to play the media segments (when binary
-    concatenated with a media segment it creates a valid MP4 file). The media header is sent as an output pad's stream format.
+    * `header` - media initialization object. Contains information necessary to play the media segments.
+    The media header content is sent inside of a stream format on the target output pad.
 
-  * `segments` - media data that when combined with media headers can be played independently to other segments.
+  * `segment` - a sequence of one or more consecutive fragments belonging to a particular track that are playable on their own
+    when combined with a media header.
     Segments due to their nature (video decoding) must start with a key frame (doesn't apply to audio-only tracks) which
     is a main driver when collecting video samples and deciding when a segment should be created
 
-  * `chunks` - fragmented media data that when binary concatenated should make up a regular segment. Chunks
-    no longer have the requirement to start with a key frame (except for the first chunk that starts a new segment)
-    and their main goal is to reduce the latency of creating the media segments (chunks can be delivered to a client faster so it can
+  * `chunk` - a fragment consisting of a subset of media samples, not necessarily playable on its own. A chunk
+    no longer has the requirement to start with a key frame (except for the first chunk that starts a new segment)
+    and its main goal is to reduce the latency of creating the media segments (chunks can be delivered to a client faster so it can
     start playing them before a full segment gets assembled)
 
   ### Segment/Chunk metadata
@@ -36,8 +58,8 @@ defmodule Membrane.MP4.Muxer.CMAF do
     `:segment_min_duration` options passed when initializing `#{inspect(__MODULE__)}`.
 
   It is expected that the segment will not be shorter than the specified minimum duration value
-  and the aim is to end the segment as soon as the next key frames arrives that will become
-  a part of a new segment.
+  and the aim is to end the segment as soon as the next key frame arrives (for audio-only tracks the segment can be ended after each sample)
+  that will become a part of a new segment.
 
   If a user prefers to have segments of unified durations then he needs to take into consideration
   the incoming keyframes interval. For instance, if a keyframe interval is 2 seconds and the goal is to have
@@ -52,7 +74,7 @@ defmodule Membrane.MP4.Muxer.CMAF do
   It may happen that one may need to create a segment before it reaches the minimum duration (for purposes such as fast AD insertion).
 
   To instruct the muxer to finalize the current segment as soon as possible one can send `Membrane.MP4.Muxer.CMAF.RequestMediaFinalization`
-  event on the `:output` pad. The event will enforce the muxer to end the current segment as soon as possible (usually on the nearest key frame).
+  event on any `:output` pad. The event will force the muxer to end the current segment as soon as possible (usually on the nearest key frame).
   After the segment gets generated, the muxer will go back to its normal behaviour of creating segments.
 
   ## Chunk creation
@@ -92,11 +114,12 @@ defmodule Membrane.MP4.Muxer.CMAF do
   use Membrane.Filter
 
   require Membrane.Logger
-
   require Membrane.H264
   require Membrane.H265
+
   alias __MODULE__.{Header, Segment, DurationRange, SegmentHelper}
-  alias Membrane.{AAC, Buffer, H264, H265, Opus}
+  alias Membrane.{AAC, H264, H265, Opus}
+  alias Membrane.Buffer
   alias Membrane.MP4.{Helper, Track}
   alias Membrane.MP4.Muxer.CMAF.TrackSamplesQueue, as: SamplesQueue
 
@@ -112,7 +135,21 @@ defmodule Membrane.MP4.Muxer.CMAF do
         %H265{stream_structure: structure, alignment: :au} when H265.is_hvc(structure)
       )
 
-  def_output_pad :output, accepted_format: Membrane.CMAF.Track, flow_control: :manual
+  def_output_pad :output,
+    availability: :on_request,
+    options: [
+      tracks: [
+        spec: [Membrane.Pad.dynamic_id()] | :all,
+        default: :all,
+        description: """
+        A list of the input pad ids that should be muxed together into a single output track.
+
+        If not specified the pad will include all unreferenced input pads.
+        """
+      ]
+    ],
+    accepted_format: Membrane.CMAF.Track,
+    flow_control: :manual
 
   def_options segment_min_duration: [
                 spec: Membrane.Time.t(),
@@ -143,15 +180,14 @@ defmodule Membrane.MP4.Muxer.CMAF do
       options
       |> Map.from_struct()
       |> Map.merge(%{
-        seq_num: 0,
-        # stream format waiting to be sent after receiving the next buffer.
+        # stream formats waiting to be sent after receiving the next buffer.
         # Holds the structure {stream_format_timestamp, stream_format}
-        awaiting_stream_format: nil,
+        awaiting_stream_formats: %{},
         pad_to_track_data: %{},
-        # ID for the next input track
-        next_track_id: 1,
+        pads_registration_order: [],
         sample_queues: %{},
-        finish_current_segment?: false
+        finish_current_segment?: false,
+        video_pad: nil
       })
       |> set_chunk_duration_range()
 
@@ -162,46 +198,93 @@ defmodule Membrane.MP4.Muxer.CMAF do
   def handle_pad_added(_pad, ctx, _state) when ctx.playback == :playing,
     do:
       raise(
-        "New tracks can be added to #{inspect(__MODULE__)} only before playback transition to :playing"
+        "New pads can be added to #{inspect(__MODULE__)} only before playback transition to :playing"
       )
 
   @impl true
   def handle_pad_added(Pad.ref(:input, _id) = pad, _ctx, state) do
-    {track_id, state} = Map.get_and_update!(state, :next_track_id, &{&1, &1 + 1})
-
-    track_data = %{
-      id: track_id,
-      track: nil,
-      # base timestamp of the current segment, initialized with DTS of the first buffer
-      # and then incremented by duration of every produced segment
-      segment_base_timestamp: nil,
-      end_timestamp: 0,
-      buffer_awaiting_duration: nil,
-      chunks_duration: Membrane.Time.seconds(0)
-    }
+    {[], Map.update!(state, :pads_registration_order, &[pad | &1])}
+  end
 
-    state
-    |> put_in([:pad_to_track_data, pad], track_data)
-    |> put_in([:sample_queues, pad], %SamplesQueue{
-      duration_range: state.chunk_duration_range || DurationRange.new(state.segment_min_duration)
-    })
-    |> then(&{[], &1})
+  @impl true
+  def handle_pad_added(Pad.ref(:output, _id), _ctx, state) do
+    {[], state}
   end
 
   @impl true
-  def handle_demand(:output, _size, _unit, _ctx, state) do
-    {pad, _elapsed_time} =
-      state.pad_to_track_data
-      |> Enum.map(fn {pad, track_data} -> {pad, track_data.end_timestamp} end)
-      |> Enum.reject(fn {_key, timestamp} -> is_nil(timestamp) end)
-      |> Enum.min_by(fn {_key, timestamp} -> Ratio.to_float(timestamp) end)
+  def handle_playing(ctx, state) do
+    {registration_order, state} = Map.pop!(state, :pads_registration_order)
+
+    registration_order = Enum.reverse(registration_order)
+
+    pads = Map.keys(ctx.pads)
+
+    %{
+      input: input_pads,
+      output: output_pads
+    } = Enum.group_by(pads, fn Pad.ref(type, _id) -> type end)
+
+    if Enum.empty?(output_pads) do
+      raise "Expected at least a single output pad"
+    end
+
+    input_groups =
+      input_pads
+      |> prepare_input_groups(output_pads, ctx)
+      |> tap(&validate_input_groups!/1)
+
+    input_to_output_pad =
+      input_groups
+      |> Enum.flat_map(fn {output_pad, input_pads} ->
+        Enum.map(input_pads, &{&1, output_pad})
+      end)
+      |> Map.new()
+
+    state =
+      Map.merge(state, %{
+        input_groups: input_groups,
+        input_to_output_pad: input_to_output_pad,
+        seq_nums: Map.new(output_pads, &{&1, 0})
+      })
+
+    input_pad_track_ids =
+      input_groups
+      |> Map.values()
+      |> Enum.flat_map(fn pads ->
+        pads
+        |> Enum.sort_by(fn pad -> Enum.find_index(registration_order, &(&1 == pad)) end)
+        |> Enum.with_index(1)
+      end)
 
-    {[demand: {pad, 1}], state}
+    state =
+      Enum.reduce(input_pad_track_ids, state, &initialize_pad_track_data/2)
+
+    demands = Enum.map(input_pads, &{:demand, &1})
+
+    {demands, state}
+  end
+
+  @impl true
+  def handle_demand(Pad.ref(:output, _id) = pad, _size, _unit, _ctx, state) do
+    case state.input_groups[pad] do
+      [input_pad] ->
+        {[demand: {input_pad, 1}], state}
+
+      input_pads ->
+        state.pad_to_track_data
+        |> Map.take(input_pads)
+        |> Enum.map(fn {pad, track_data} -> {pad, track_data.end_timestamp} end)
+        |> Enum.reject(fn {_key, timestamp} -> is_nil(timestamp) end)
+        |> Enum.min_by(fn {_key, timestamp} -> Ratio.to_float(timestamp) end)
+        |> then(fn {pad, _time} -> {[demand: {pad, 1}], state} end)
+    end
   end
 
   @impl true
   def handle_stream_format(pad, stream_format, ctx, state) do
-    ensure_max_one_video_pad!(pad, stream_format, ctx)
+    ensure_max_one_video_pad!(pad, stream_format, state)
+
+    output_pad = state.input_to_output_pad[pad]
 
     is_video_pad = is_video(stream_format)
 
@@ -209,27 +292,28 @@ defmodule Membrane.MP4.Muxer.CMAF do
       state
       |> update_in(
         [:pad_to_track_data, pad],
-        &%{&1 | track: stream_format_to_track(stream_format, &1.id)}
+        &%{&1 | track: Track.new(&1.id, stream_format)}
       )
       |> update_in(
         [:sample_queues, pad],
         &%SamplesQueue{&1 | track_with_keyframes?: is_video_pad}
       )
+      |> then(fn state ->
+        if is_video_pad do
+          %{state | video_pad: pad}
+        else
+          state
+        end
+      end)
 
-    has_all_input_stream_formats? =
-      ctx.pads
-      |> Map.drop([:output, pad])
-      |> Map.values()
-      |> Enum.all?(&(&1.stream_format != nil))
-
-    if has_all_input_stream_formats? do
-      stream_format = generate_output_stream_format(state)
+    if are_all_group_pads_ready?(pad, ctx, state) do
+      stream_format = generate_output_stream_format(output_pad, state)
 
       cond do
-        is_nil(ctx.pads.output.stream_format) ->
-          {[stream_format: {:output, stream_format}], state}
+        is_nil(ctx.pads[output_pad].stream_format) ->
+          {[stream_format: {output_pad, stream_format}], state}
 
-        stream_format != ctx.pads.output.stream_format ->
+        stream_format != ctx.pads[output_pad].stream_format ->
           {[], SegmentHelper.put_awaiting_stream_format(pad, stream_format, state)}
 
         true ->
@@ -240,42 +324,6 @@ defmodule Membrane.MP4.Muxer.CMAF do
     end
   end
 
-  defp is_video(%Track{stream_format: stream_format}),
-    do: is_struct(stream_format, H264) or is_struct(stream_format, H265)
-
-  defp is_video(stream_format),
-    do: is_struct(stream_format, H264) or is_struct(stream_format, H265)
-
-  defp find_video_pads(ctx) do
-    ctx.pads
-    |> Enum.filter(fn
-      {Pad.ref(:input, _id), data} ->
-        data.stream_format != nil and is_video(data.stream_format)
-
-      _other ->
-        false
-    end)
-    |> Enum.map(fn {pad, _data} -> pad end)
-  end
-
-  defp ensure_max_one_video_pad!(pad, stream_format, ctx) do
-    is_video_pad = is_video(stream_format)
-
-    if is_video_pad do
-      video_pads = find_video_pads(ctx)
-
-      has_other_video_pad? = video_pads != [] and video_pads != [pad]
-
-      if has_other_video_pad? do
-        raise "CMAF muxer can only handle at most one video pad"
-      end
-    end
-  end
-
-  defp stream_format_to_track(stream_format, track_id) do
-    Track.new(track_id, stream_format)
-  end
-
   @impl true
   def handle_buffer(Pad.ref(:input, _id) = pad, sample, ctx, state) do
     use Numbers, overload_operators: true, comparison: true
@@ -296,76 +344,159 @@ defmodule Membrane.MP4.Muxer.CMAF do
 
     case segment do
       {:segment, segment, state} ->
-        {buffer, state} = generate_segment(segment, ctx, state)
+        {buffers, state} = generate_segment_actions(segment, ctx, state)
 
-        actions = [buffer: {:output, buffer}] ++ stream_format_action ++ [redemand: :output]
+        actions =
+          buffers ++
+            stream_format_action ++
+            Enum.map(buffers, fn {:buffer, {pad, _buffer}} -> {:redemand, pad} end)
 
         {actions, state}
 
       {:no_segment, state} ->
-        {[redemand: :output], state}
+        output_pad = state.input_to_output_pad[pad]
+        {[redemand: output_pad], state}
     end
   end
 
   @impl true
-  def handle_event(:output, %__MODULE__.RequestMediaFinalization{}, _ctx, state) do
+  def handle_event(_pad, %__MODULE__.RequestMediaFinalization{}, _ctx, state) do
     {[], %{state | finish_current_segment?: true}}
   end
 
   @impl true
-  def handle_event(Pad.ref(:input, _ref), event, _ctx, state) do
-    {[forward: event], state}
+  def handle_event(Pad.ref(:input, _ref) = pad, event, _ctx, state) do
+    output_pad = state.input_to_output_pad[pad]
+
+    {[event: {output_pad, event}], state}
   end
 
   @impl true
   def handle_end_of_stream(Pad.ref(:input, _track_id) = pad, ctx, state) do
     cache = Map.fetch!(state.sample_queues, pad)
+    output_pad = state.input_to_output_pad[pad]
+
+    input_pads = Map.keys(state.input_to_output_pad) -- [pad]
 
     processing_finished? =
       ctx.pads
-      |> Map.drop([:output, pad])
+      |> Map.take(input_pads)
       |> Map.values()
       |> Enum.all?(& &1.end_of_stream?)
 
     if SamplesQueue.empty?(cache) do
       if processing_finished? do
-        {[end_of_stream: :output], state}
+        end_of_streams = generate_output_end_of_streams(ctx)
+
+        {end_of_streams, state}
       else
-        {[redemand: :output], state}
+        {[redemand: output_pad], state}
       end
     else
-      generate_end_of_stream_segment(processing_finished?, cache, pad, ctx, state)
+      generate_end_of_stream_segment(processing_finished?, pad, ctx, state)
     end
   end
 
-  defp generate_end_of_stream_segment(processing_finished?, cache, pad, ctx, state) do
-    sample = state.pad_to_track_data[pad].buffer_awaiting_duration
+  defp prepare_input_groups(input_pads, output_pads, ctx) do
+    available_tracks = Enum.map(input_pads, fn Pad.ref(:input, id) -> id end)
 
-    sample_metadata =
-      Map.put(sample.metadata, :duration, SamplesQueue.last_sample(cache).metadata.duration)
+    Map.new(output_pads, fn output_pad ->
+      tracks =
+        case ctx.pads[output_pad].options.tracks do
+          :all -> available_tracks
+          tracks when is_list(tracks) -> tracks
+        end
 
-    sample = %Buffer{sample | metadata: sample_metadata}
+      unless Enum.all?(tracks, &(&1 in available_tracks)) do
+        raise "Encountered unknown pad in specified tracks: #{inspect(tracks)}, available tracks: #{inspect(available_tracks)}"
+      end
 
-    cache = SamplesQueue.force_push(cache, sample)
-    state = put_in(state, [:sample_queues, pad], cache)
+      input_pads =
+        tracks
+        |> Enum.uniq()
+        |> Enum.map(&Pad.ref(:input, &1))
 
-    if processing_finished? do
-      with {:segment, segment, state} when map_size(segment) > 0 <-
-             SegmentHelper.take_all_samples(state) do
-        {buffer, state} = generate_segment(segment, ctx, state)
-        {[buffer: {:output, buffer}, end_of_stream: :output], state}
-      else
-        {:segment, _segment, state} -> {[end_of_stream: :output], state}
+      {output_pad, input_pads}
+    end)
+  end
+
+  defp validate_input_groups!(input_groups) do
+    input_groups
+    |> Map.values()
+    |> List.flatten()
+    |> Bunch.Enum.duplicates()
+    |> case do
+      [] ->
+        :ok
+
+      pads ->
+        raise "Input pads #{inspect(pads)} are used in more than one input group."
+    end
+  end
+
+  defp initialize_pad_track_data({pad, track_id}, state) do
+    track_data = %{
+      id: track_id,
+      track: nil,
+      # base timestamp of the current segment, initialized with DTS of the first buffer
+      # and then incremented by duration of every produced segment
+      segment_base_timestamp: nil,
+      end_timestamp: 0,
+      buffer_awaiting_duration: nil,
+      chunks_duration: Membrane.Time.seconds(0)
+    }
+
+    state
+    |> put_in([:pad_to_track_data, pad], track_data)
+    |> put_in([:sample_queues, pad], %SamplesQueue{
+      duration_range: state.chunk_duration_range || DurationRange.new(state.segment_min_duration)
+    })
+  end
+
+  defp generate_end_of_stream_segment(false, pad, _ctx, state) do
+    output_pad = state.input_to_output_pad[pad]
+
+    state = put_in(state, [:pad_to_track_data, pad, :end_timestamp], nil)
+
+    {[redemand: output_pad], state}
+  end
+
+  defp generate_end_of_stream_segment(true, _pad, ctx, state) do
+    state =
+      for {pad, track_data} <- state.pad_to_track_data, reduce: state do
+        state ->
+          queue = Map.fetch!(state.sample_queues, pad)
+          sample = track_data.buffer_awaiting_duration
+
+          sample_metadata =
+            Map.put(sample.metadata, :duration, SamplesQueue.last_sample(queue).metadata.duration)
+
+          sample = %Buffer{sample | metadata: sample_metadata}
+
+          queue = SamplesQueue.force_push(queue, sample)
+          put_in(state, [:sample_queues, pad], queue)
       end
-    else
-      state = put_in(state, [:pad_to_track_data, pad, :end_timestamp], nil)
 
-      {[redemand: :output], state}
+    end_of_streams = generate_output_end_of_streams(ctx)
+
+    case SegmentHelper.take_all_samples(state) do
+      {:segment, segment, state} when map_size(segment) > 0 ->
+        {buffers, state} = generate_segment_actions(segment, ctx, state)
+
+        {buffers ++ end_of_streams, state}
+
+      {:segment, _segment, state} ->
+        {end_of_streams, state}
     end
   end
 
-  defp generate_output_stream_format(state) do
-    tracks = Enum.map(state.pad_to_track_data, fn {_pad, track_data} -> track_data.track end)
+  defp generate_output_stream_format(output_pad, state) do
+    input_pads = state.input_groups[output_pad]
+
+    tracks =
+      state.pad_to_track_data
+      |> Map.take(input_pads)
+      |> Enum.map(fn {_pad, track_data} -> track_data.track end)
 
     resolution =
       tracks
@@ -374,13 +505,13 @@ defmodule Membrane.MP4.Muxer.CMAF do
         _audio_track -> nil
       end)
 
-    codecs = Map.new(tracks, fn track -> Track.get_encoding_info(track) end)
+    codecs = Map.new(tracks, &Track.get_encoding_info/1)
 
     header = Header.serialize(tracks)
 
     content_type =
       tracks
-      |> Enum.map(&if is_video(&1), do: :video, else: :audio)
+      |> Enum.map(&if is_video(&1.stream_format), do: :video, else: :audio)
       |> then(fn
         [item] -> item
         list -> list
@@ -394,54 +525,95 @@ defmodule Membrane.MP4.Muxer.CMAF do
     }
   end
 
-  defp generate_segment(acc, ctx, state) do
-    use Numbers, overload_operators: true, comparison: true
+  defp generate_output_end_of_streams(ctx) do
+    ctx.pads
+    |> Enum.filter(fn
+      {Pad.ref(:output, _id), data} -> not data.end_of_stream?
+      _other -> false
+    end)
+    |> Enum.map(fn {pad, _data} ->
+      {:end_of_stream, pad}
+    end)
+  end
+
+  defp generate_samples_table(samples, timescale) do
+    Enum.map(samples, fn sample ->
+      %{
+        sample_size: byte_size(sample.payload),
+        sample_flags: generate_sample_flags(sample.metadata),
+        sample_duration:
+          sample.metadata.duration
+          |> Helper.timescalify(timescale)
+          |> Ratio.trunc(),
+        sample_offset: Helper.timescalify(sample.pts - sample.dts, timescale)
+      }
+    end)
+  end
+
+  defp generate_samples_data(samples) do
+    samples
+    |> Enum.map(& &1.payload)
+    |> IO.iodata_to_binary()
+  end
+
+  defp calculate_segment_duration(samples) do
+    first_sample = hd(samples)
+    last_sample = List.last(samples)
+
+    last_sample.dts - first_sample.dts + last_sample.metadata.duration
+  end
+
+  defp generate_input_group_tracks_data(
+         input_group,
+         acc,
+         state
+       ) do
+    {output_pad, input_pads} = input_group
 
     tracks_data =
       acc
+      |> Map.take(input_pads)
       |> Enum.filter(fn {_pad, samples} -> not Enum.empty?(samples) end)
       |> Enum.map(fn {pad, samples} ->
         track_data = state.pad_to_track_data[pad]
 
         %{timescale: timescale} = track_data.track
-        first_sample = hd(samples)
-        last_sample = List.last(samples)
-
-        samples_table =
-          samples
-          |> Enum.map(fn sample ->
-            %{
-              sample_size: byte_size(sample.payload),
-              sample_flags: generate_sample_flags(sample.metadata),
-              sample_duration:
-                Helper.timescalify(
-                  sample.metadata.duration,
-                  timescale
-                )
-                |> Ratio.trunc(),
-              sample_offset: Helper.timescalify(sample.pts - sample.dts, timescale)
-            }
-          end)
-
-        samples_data = Enum.map_join(samples, & &1.payload)
-
-        duration = last_sample.dts - first_sample.dts + last_sample.metadata.duration
+
+        duration = calculate_segment_duration(samples)
 
         %{
           pad: pad,
-          id: state.pad_to_track_data[pad].id,
-          sequence_number: state.seq_num,
+          id: track_data.id,
+          sequence_number: state.seq_nums[output_pad],
+          timescale: timescale,
           base_timestamp:
-            Helper.timescalify(track_data.segment_base_timestamp, timescale)
+            track_data.segment_base_timestamp
+            |> Helper.timescalify(timescale)
             |> Ratio.trunc(),
           unscaled_duration: duration,
           duration: Helper.timescalify(duration, timescale),
-          timescale: timescale,
-          samples_table: samples_table,
-          samples_data: samples_data
+          samples_table: generate_samples_table(samples, timescale),
+          samples_data: generate_samples_data(samples)
         }
       end)
 
+    if Enum.empty?(tracks_data) do
+      {[], state}
+    else
+      state =
+        tracks_data
+        |> Enum.reduce(state, fn %{unscaled_duration: duration, pad: pad}, state ->
+          update_in(state, [:pad_to_track_data, pad, :segment_base_timestamp], &(&1 + duration))
+        end)
+        |> update_in([:seq_nums, output_pad], &(&1 + 1))
+
+      {[{input_group, tracks_data}], state}
+    end
+  end
+
+  defp generate_input_group_action({input_group, tracks_data}, acc, state) do
+    {output_pad, _input_pads} = input_group
+
     payload = Segment.serialize(tracks_data)
 
     # Duration of the tracks will never be exactly the same. To minimize the error and avoid its magnification over time,
@@ -454,37 +626,44 @@ defmodule Membrane.MP4.Muxer.CMAF do
 
     metadata = %{
       duration: duration,
-      independent?: is_segment_independent(acc, ctx),
+      independent?: is_segment_independent(acc, state),
       last_chunk?: is_segment_finished(state)
     }
 
-    buffer = %Buffer{payload: payload, metadata: metadata}
+    {:buffer, {output_pad, %Buffer{payload: payload, metadata: metadata}}}
+  end
 
-    # Update segment base timestamps for each track
-    state =
-      Enum.reduce(tracks_data, state, fn %{unscaled_duration: duration, pad: pad}, state ->
-        update_in(state, [:pad_to_track_data, pad, :segment_base_timestamp], &(&1 + duration))
-      end)
-      |> Map.update!(:seq_num, &(&1 + 1))
-      |> Map.update!(:finish_current_segment?, fn finish_current_segment? ->
-        non_ending_chunk? = metadata.last_chunk? == false
+  defp update_finish_current_segment_state(actions, state) do
+    last_chunk? =
+      Enum.any?(actions, fn {:buffer, {_pad, buffer}} -> buffer.metadata.last_chunk? end)
 
-        finish_current_segment? and non_ending_chunk?
-      end)
+    Map.update!(state, :finish_current_segment?, fn finish_current_segment? ->
+      non_ending_chunk? = last_chunk? == false
 
-    {buffer, state}
+      finish_current_segment? and non_ending_chunk?
+    end)
   end
 
-  defp is_segment_independent(segment, ctx) do
-    case find_video_pads(ctx) do
-      [] ->
-        true
+  defp generate_segment_actions(acc, _ctx, state) do
+    use Numbers, overload_operators: true, comparison: true
 
-      [video_pad] ->
-        case segment do
-          %{^video_pad => samples} -> Helper.key_frame?(hd(samples).metadata)
-          _other -> true
-        end
+    state.input_groups
+    |> Enum.flat_map_reduce(state, &generate_input_group_tracks_data(&1, acc, &2))
+    |> then(fn {data, state} ->
+      actions = Enum.map(data, &generate_input_group_action(&1, acc, state))
+
+      state = update_finish_current_segment_state(actions, state)
+
+      {actions, state}
+    end)
+  end
+
+  defp is_segment_independent(segment, state) do
+    video_pad = state.video_pad
+
+    case segment do
+      %{^video_pad => samples} -> Helper.key_frame?(hd(samples).metadata)
+      _other -> true
     end
   end
 
@@ -571,4 +750,23 @@ defmodule Membrane.MP4.Muxer.CMAF do
     |> Map.delete(:chunk_target_duration)
     |> Map.put(:chunk_duration_range, nil)
   end
+
+  defp are_all_group_pads_ready?(pad, ctx, state) do
+    output_pad = state.input_to_output_pad[pad]
+
+    other_input_pads = state.input_groups[output_pad] -- [pad]
+
+    ctx.pads
+    |> Map.take(other_input_pads)
+    |> Enum.all?(fn {_pad, data} -> data.stream_format != nil end)
+  end
+
+  defp is_video(stream_format),
+    do: is_struct(stream_format, H264) or is_struct(stream_format, H265)
+
+  defp ensure_max_one_video_pad!(pad, stream_format, state) do
+    if is_video(stream_format) and state.video_pad != nil and state.video_pad != pad do
+      raise "CMAF muxer can only handle at most one video pad"
+    end
+  end
 end
diff --git a/lib/membrane_mp4/muxer/cmaf/segment_helper.ex b/lib/membrane_mp4/muxer/cmaf/segment_helper.ex
index 236bac8..7edb16e 100644
--- a/lib/membrane_mp4/muxer/cmaf/segment_helper.ex
+++ b/lib/membrane_mp4/muxer/cmaf/segment_helper.ex
@@ -26,37 +26,40 @@ defmodule Membrane.MP4.Muxer.CMAF.SegmentHelper do
   @spec collect_segment_samples(state :: term(), Pad.ref(), Membrane.Buffer.t() | nil) ::
           {actions :: [term()],
            {:segment, segment :: term(), state :: term()} | {:no_segment, state :: term()}}
-  def collect_segment_samples(%{awaiting_stream_format: nil} = state, _pad, nil),
-    do: {[], {:no_segment, state}}
+  def collect_segment_samples(state, pad, sample)
 
-  def collect_segment_samples(%{awaiting_stream_format: nil} = state, pad, sample),
-    do: do_collect_segment_samples(state, pad, sample)
+  def collect_segment_samples(state, pad, sample)
+      when map_size(state.awaiting_stream_formats) > 0 do
+    output_pad = state.input_to_output_pad[pad]
 
-  def collect_segment_samples(
-        %{awaiting_stream_format: {{:update_with_next, _pad}, _stream_format}} = state,
-        pad,
-        sample
-      ),
-      do: do_collect_segment_samples(state, pad, sample)
-
-  def collect_segment_samples(
-        %{awaiting_stream_format: {:stream_format, stream_format}} = state,
-        pad,
-        sample
-      ) do
-    state = %{state | awaiting_stream_format: nil}
-
-    unless key_frame?(state.pad_to_track_data[pad].buffer_awaiting_duration.metadata) do
-      raise "Video sample received after new stream format must be a key frame"
-    end
+    case state.awaiting_stream_formats do
+      %{^output_pad => {:stream_format, stream_format}} ->
+        unless key_frame?(state.pad_to_track_data[pad].buffer_awaiting_duration.metadata) do
+          raise "Video sample received after new stream format must be a key frame"
+        end
+
+        {:no_segment, state} = force_push_segment(state, pad, sample)
 
-    {:no_segment, state} = force_push_segment(state, pad, sample)
+        {:segment, segment, state} = take_all_samples_until(state, sample)
 
-    {:segment, _segment, _state} = result = take_all_samples_until(state, sample)
+        state = Map.update!(state, :awaiting_stream_formats, &Map.delete(&1, output_pad))
 
-    {[stream_format: {:output, stream_format}], result}
+        {[stream_format: {output_pad, stream_format}], {:segment, segment, state}}
+
+      _other ->
+        if sample do
+          do_collect_segment_samples(state, pad, sample)
+        else
+          {[], {:no_segment, state}}
+        end
+    end
   end
 
+  def collect_segment_samples(state, _pad, nil), do: {[], {:no_segment, state}}
+
+  def collect_segment_samples(state, pad, sample),
+    do: do_collect_segment_samples(state, pad, sample)
+
   defp do_collect_segment_samples(state, pad, sample) do
     supports_partial_segments? = state.chunk_duration_range != nil
 
@@ -73,29 +76,43 @@ defmodule Membrane.MP4.Muxer.CMAF.SegmentHelper do
   """
   @spec put_awaiting_stream_format(Pad.ref(), term(), term()) :: term()
   def put_awaiting_stream_format(pad, stream_format, state) do
-    %{state | awaiting_stream_format: {{:update_with_next, pad}, stream_format}}
+    output_pad = state.input_to_output_pad[pad]
+
+    put_in(
+      state,
+      [:awaiting_stream_formats, output_pad],
+      {{:update_with_next, pad}, stream_format}
+    )
   end
 
   @doc """
   Updates the awaiting stream format to a ready state where it can be finally handled.
   """
   @spec update_awaiting_stream_format(state :: term(), Pad.ref()) :: state :: term()
-  def update_awaiting_stream_format(
-        %{awaiting_stream_format: {{:update_with_next, pad}, stream_format}} = state,
-        pad
-      ) do
-    %{state | awaiting_stream_format: {:stream_format, stream_format}}
-  end
+  def update_awaiting_stream_format(state, pad)
 
-  def update_awaiting_stream_format(state, _pad), do: state
+  def update_awaiting_stream_format(state, _pad)
+      when map_size(state.awaiting_stream_formats) == 0,
+      do: state
+
+  def update_awaiting_stream_format(state, pad) do
+    output_pad = state.input_to_output_pad[pad]
+
+    case state.awaiting_stream_formats do
+      %{^output_pad => {{:update_with_next, ^pad}, stream_format}} ->
+        put_in(state, [:awaiting_stream_formats, output_pad], {:stream_format, stream_format})
+
+      _other ->
+        state
+    end
+  end
 
   @spec push_segment(state_t(), Membrane.Pad.ref(), Membrane.Buffer.t()) ::
           {:no_segment, state_t()} | {:segment, segment_t(), state_t()}
   def push_segment(state, pad, sample) do
     queue = Map.fetch!(state.sample_queues, pad)
-    is_video = queue.track_with_keyframes?
 
-    if is_video do
+    if queue.track_with_keyframes? do
       push_video_segment(state, queue, pad, sample)
     else
       push_audio_segment(state, queue, pad, sample)
@@ -129,20 +146,29 @@ defmodule Membrane.MP4.Muxer.CMAF.SegmentHelper do
   defp push_audio_segment(state, queue, pad, sample) do
     base_timestamp = max_segment_base_timestamp(state)
 
-    any_video_tracks? =
-      Enum.any?(state.sample_queues, fn {_pad, queue} -> queue.track_with_keyframes? end)
+    {video_pad, video_queue} =
+      Enum.find(state.sample_queues, {nil, nil}, fn {_pad, queue} ->
+        queue.track_with_keyframes?
+      end)
 
     queue =
-      if any_video_tracks? do
+      if video_queue do
         SamplesQueue.force_push(queue, sample)
       else
         SamplesQueue.plain_push_until_target(queue, sample, base_timestamp)
       end
 
-    if queue.collectable? do
-      collect_samples_for_audio_track(pad, queue, state)
-    else
-      {:no_segment, update_queue_for(pad, queue, state)}
+    cond do
+      queue.collectable? ->
+        collect_samples_for_audio_track(pad, queue, state)
+
+      video_queue && video_queue.collectable? ->
+        state = update_queue_for(pad, queue, state)
+
+        collect_samples_for_video_track(video_pad, video_queue, state)
+
+      true ->
+        {:no_segment, update_queue_for(pad, queue, state)}
     end
   end
 
@@ -227,26 +253,35 @@ defmodule Membrane.MP4.Muxer.CMAF.SegmentHelper do
   end
 
   defp push_audio_chunk(state, queue, pad, sample) do
-    any_video_tracks? =
-      Enum.any?(state.sample_queues, fn {_pad, queue} -> queue.track_with_keyframes? end)
-
-    # if we have any video track then let the video track decide when to collect audio tracks
-    if any_video_tracks? do
-      queue = SamplesQueue.force_push(queue, sample)
+    {video_pad, video_queue} =
+      Enum.find(state.sample_queues, {nil, nil}, fn {_pad, queue} ->
+        queue.track_with_keyframes?
+      end)
 
-      {:no_segment, update_queue_for(pad, queue, state)}
-    else
-      base_timestamp = max_segment_base_timestamp(state)
+    queue =
+      if video_queue do
+        SamplesQueue.force_push(queue, sample)
+      else
+        base_timestamp = max_segment_base_timestamp(state)
 
-      queue = SamplesQueue.plain_push_until_target(queue, sample, base_timestamp)
+        SamplesQueue.plain_push_until_target(queue, sample, base_timestamp)
+      end
 
-      if queue.collectable? do
+    cond do
+      queue.collectable? ->
         pad
         |> collect_samples_for_audio_track(queue, state)
         |> maybe_reset_chunk_durations(sample)
-      else
+
+      video_queue && video_queue.collectable? ->
+        state = update_queue_for(pad, queue, state)
+
+        video_pad
+        |> collect_samples_for_video_track(video_queue, state)
+        |> maybe_reset_chunk_durations(sample)
+
+      true ->
         {:no_segment, update_queue_for(pad, queue, state)}
-      end
     end
   end
 
diff --git a/mix.exs b/mix.exs
index cc36ec5..5b21f69 100644
--- a/mix.exs
+++ b/mix.exs
@@ -1,7 +1,7 @@
 defmodule Membrane.MP4.Plugin.MixProject do
   use Mix.Project
 
-  @version "0.31.0"
+  @version "0.32.0"
   @github_url "https://github.com/membraneframework/membrane_mp4_plugin"
 
   def project do
diff --git a/test/membrane_mp4/muxer/cmaf/integration_test.exs b/test/membrane_mp4/muxer/cmaf/integration_test.exs
index e5adb95..5049d8c 100644
--- a/test/membrane_mp4/muxer/cmaf/integration_test.exs
+++ b/test/membrane_mp4/muxer/cmaf/integration_test.exs
@@ -4,10 +4,12 @@ defmodule Membrane.MP4.Muxer.CMAF.IntegrationTest do
   import Membrane.ChildrenSpec
   import Membrane.Testing.Assertions
 
+  require Membrane.Pad
+
   alias Membrane.MP4.BufferLimiter
   alias Membrane.MP4.Container
   alias Membrane.MP4.Muxer.CMAF.RequestMediaFinalizeSender
-  alias Membrane.{Testing, Time}
+  alias Membrane.{Pad, Testing, Time}
 
   # Fixtures used in CMAF tests below were generated using `membrane_http_adaptive_stream_plugin`
   # with `muxer_segment_duration` option set to `Membrane.Time.seconds(2)`.
@@ -63,7 +65,7 @@ defmodule Membrane.MP4.Muxer.CMAF.IntegrationTest do
       |> child(:audio_parser, %Membrane.AAC.Parser{out_encapsulation: :none, output_config: :esds}),
       child(:video_source, %Membrane.File.Source{location: "test/fixtures/in_video.h264"})
       |> child(:video_parser, %Membrane.H264.Parser{
-        generate_best_effort_timestamps: %{framerate: {30, 1}},
+        generate_best_effort_timestamps: %{framerate: {30, 1}, add_dts_offset: false},
         output_stream_structure: :avc1
       }),
       child(:cmaf, %Membrane.MP4.Muxer.CMAF{
@@ -88,6 +90,7 @@ defmodule Membrane.MP4.Muxer.CMAF.IntegrationTest do
     1..2
     |> Enum.map(fn i ->
       assert_sink_buffer(pipeline, :sink, buffer)
+
       assert_mp4_equal(buffer.payload, "muxed_audio_video/segment_#{i}.m4s")
     end)
 
@@ -97,6 +100,73 @@ defmodule Membrane.MP4.Muxer.CMAF.IntegrationTest do
     :ok = Testing.Pipeline.terminate(pipeline)
   end
 
+  test "synchronized audio and video" do
+    structure = [
+      child(:cmaf, %Membrane.MP4.Muxer.CMAF{
+        segment_min_duration: Time.seconds(2)
+      }),
+      child(:audio_source, %Membrane.File.Source{location: "test/fixtures/in_audio.aac"})
+      |> child(:audio_parser, %Membrane.AAC.Parser{out_encapsulation: :none, output_config: :esds}),
+      child(:video_source, %Membrane.File.Source{location: "test/fixtures/in_video.h264"})
+      |> child(:video_parser, %Membrane.H264.Parser{
+        generate_best_effort_timestamps: %{framerate: {30, 1}, add_dts_offset: true},
+        output_stream_structure: :avc1
+      }),
+      # inputs: each parser is linked to its own muxer input pad, identified by :video / :audio
+      get_child(:video_parser)
+      |> via_in(Pad.ref(:input, :video))
+      |> get_child(:cmaf),
+      get_child(:audio_parser)
+      |> via_in(Pad.ref(:input, :audio))
+      |> get_child(:cmaf),
+      # outputs: one output pad per track; `tracks` lists the input pad ids carried by that output
+      get_child(:cmaf)
+      |> via_out(Pad.ref(:output, :video), options: [tracks: [:video]])
+      |> child(:video_sink, Membrane.Testing.Sink),
+      get_child(:cmaf)
+      |> via_out(Pad.ref(:output, :audio), options: [tracks: [:audio]])
+      |> child(:audio_sink, Membrane.Testing.Sink)
+    ]
+
+    pipeline = Testing.Pipeline.start_link_supervised!(spec: structure)
+
+    assert_sink_stream_format(pipeline, :audio_sink, %Membrane.CMAF.Track{
+      header: header,
+      content_type: :audio
+    })
+
+    assert_mp4_equal(header, "ref_audio_header.mp4")
+
+    assert_sink_stream_format(pipeline, :video_sink, %Membrane.CMAF.Track{
+      header: header,
+      content_type: :video
+    })
+
+    assert_mp4_equal(header, "ref_video_header.mp4")
+
+    1..2
+    |> Enum.map(fn i ->
+      assert_sink_buffer(pipeline, :audio_sink, audio_buffer)
+
+      assert_sink_buffer(pipeline, :video_sink, video_buffer)
+
+      # NOTE: due to 'add_dts_offset' the video is moved back by 500ms, hence the 600ms tolerance
+      assert_in_delta audio_buffer.metadata.duration,
+                      video_buffer.metadata.duration,
+                      Membrane.Time.milliseconds(600)
+
+      assert_mp4_equal(video_buffer.payload, "ref_video_segment#{i}.m4s")
+    end)
+
+    assert_end_of_stream(pipeline, :audio_sink)
+    assert_end_of_stream(pipeline, :video_sink)
+
+    refute_sink_buffer(pipeline, :audio_sink, _buffer, 0)
+    refute_sink_buffer(pipeline, :video_sink, _buffer, 0)
+
+    :ok = Testing.Pipeline.terminate(pipeline)
+  end
+
   test "video partial segments" do
     pipeline =
       prepare_pipeline(:video,
diff --git a/test/membrane_mp4/muxer/cmaf/segment_helper_test.exs b/test/membrane_mp4/muxer/cmaf/segment_helper_test.exs
index d75a63c..230a30a 100644
--- a/test/membrane_mp4/muxer/cmaf/segment_helper_test.exs
+++ b/test/membrane_mp4/muxer/cmaf/segment_helper_test.exs
@@ -9,10 +9,12 @@ defmodule Membrane.MP4.Muxer.CMAF.SegmentHelperTest do
     chunk_duration_range = DurationRange.new(1, 50)
 
     state = %{
-      awaiting_stream_format: nil,
+      awaiting_stream_formats: %{},
       segment_min_duration: 100,
       chunk_duration_range: chunk_duration_range,
       finish_current_segment?: false,
+      input_to_output_pad: %{audio: :output, video: :output},
+      input_groups: %{output: [:audio, :video]},
       pad_to_track_data: %{
         audio: %{segment_base_timestamp: 0, chunks_duration: 0, buffer_awaiting_duration: nil},
         video: %{segment_base_timestamp: 0, chunks_duration: 0, buffer_awaiting_duration: nil}
@@ -36,6 +38,15 @@ defmodule Membrane.MP4.Muxer.CMAF.SegmentHelperTest do
     end
   end
 
+  defp push_n_buffers(type, range, state) do
+    # each push must return :no_segment (asserted by the match); the updated state is threaded on
+    Enum.reduce(range, state, fn index, acc ->
+      {:no_segment, acc} = push_buffer(type, buffer_with_timestamp(type, index), acc)
+
+      acc
+    end)
+  end
+
   defp get_next_buffer(pad, buffer, state) do
     awaiting = state.pad_to_track_data[pad].buffer_awaiting_duration
 
@@ -76,34 +87,17 @@ defmodule Membrane.MP4.Muxer.CMAF.SegmentHelperTest do
 
   @stream_format :stream_format
   test "new stream format forces segment collection", %{state: state} do
-    # push first couple of video samples
-    state =
-      for i <- 1..10, reduce: state do
-        state ->
-          {:no_segment, state} = push_buffer(:video, buffer_with_timestamp(:video, i), state)
-
-          state
-      end
-
-    # push first couple of audio samples
+    # push first couple of video and audio samples
     state =
-      for i <- 1..20, reduce: state do
+      for {type, amount} <- [video: 10, audio: 20], reduce: state do
         state ->
-          {:no_segment, state} = push_buffer(:audio, buffer_with_timestamp(:audio, i), state)
-
-          state
+          push_n_buffers(type, 1..amount, state)
       end
 
     state = SegmentHelper.put_awaiting_stream_format(:video, @stream_format, state)
 
     # push couple of audio samples after new video stream format
-    state =
-      for i <- 21..30, reduce: state do
-        state ->
-          {:no_segment, state} = push_buffer(:audio, buffer_with_timestamp(:audio, i), state)
-
-          state
-      end
+    state = push_n_buffers(:audio, 21..30, state)
 
     state = SegmentHelper.update_awaiting_stream_format(state, :video)