feat(audio): add Mp3FormatDecoder streaming strategy

Implements IFormatDecoder for MP3: ID3v2 skip, MPEG Layer III frame-sync + header decode, Xing/Info/VBRI detection, CBR frame alignment, and VBR TOC seek interpolation. Wiring lands in Wave 3.
This commit is contained in:
daniel-c-harvey
2026-06-11 08:40:52 -04:00
parent c835a54652
commit b04081b960
@@ -0,0 +1,322 @@
/**
* Mp3FormatDecoder - MP3 (MPEG-1/2/2.5 Layer III) implementation of IFormatDecoder.
*
* All MP3-specific stream logic lives here: ID3v2 skipping, MPEG frame-sync scanning,
* frame-header decode, Xing/Info/VBRI VBR-header detection, segment sizing, and seek
* byte-offset math (CBR estimate or VBR TOC interpolation). StreamDecoder delegates to
* this via IFormatDecoder and holds no MP3 knowledge of its own.
*
* MP3 frames are self-contained, so wrapSegment is a zero-copy passthrough — the browser's
* decodeAudioData accepts raw frame bytes directly and tolerates a partial leading frame.
*/
import { FormatInfo, IFormatDecoder, Mp3VbrSeekData } from './IFormatDecoder.js';
/** Fields decoded from a 4-byte MPEG audio frame header. */
interface Mp3FrameHeader {
  /** Raw 2-bit version field: 3 = MPEG-1, 2 = MPEG-2, 0 = MPEG-2.5. */
  version: number;
  /** Sample rate in Hz. */
  sampleRate: number;
  /** 1 for mono, 2 for every stereo variant. */
  channels: number;
  /** Raw 2-bit channel-mode field: 0-2 = stereo variants, 3 = mono. */
  channelMode: number;
  /** Bitrate of this frame in kbps. */
  bitrateKbps: number;
  /** PCM samples per Layer III frame: 1152 (MPEG-1) or 576 (MPEG-2/2.5). */
  samplesPerFrame: number;
  /** Length of this frame in bytes, including its padding byte when present. */
  frameSize: number;
}

/** Result of probing the first frame for a Xing/Info/VBRI header. */
interface Mp3VbrHeader {
  /** True for a "Xing" or "VBRI" tag (variable bitrate); false for an "Info" tag (CBR). */
  isVbr: boolean;
  /** Total frame count, or 0 when the tag does not carry one. */
  totalFrames: number;
  /** Total byte count, or 0 when the tag does not carry one. */
  totalBytes: number;
  /** 100-entry Xing seek table, or null when absent. */
  toc: Uint8Array | null;
}

/**
 * MP3 (MPEG-1/2/2.5 Layer III) implementation of IFormatDecoder.
 *
 * Handles ID3v2 skipping, frame-sync scanning, frame-header decoding, Xing/Info/VBRI
 * detection, segment sizing and seek byte-offset math. Segments are passed through
 * unmodified by wrapSegment.
 */
export class Mp3FormatDecoder implements IFormatDecoder {
  // MPEG Layer III bitrate tables (kbps), indexed by the 4-bit bitrate index.
  // Index 0 (free) and 15 (reserved) are invalid and rejected during frame validation.
  private static readonly BITRATES_MPEG1 = [0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320];
  private static readonly BITRATES_MPEG2 = [0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160];

  // Sample-rate tables (Hz), indexed by the 2-bit sample-rate index (3 = reserved).
  private static readonly SAMPLE_RATES_MPEG1 = [44100, 48000, 32000];
  private static readonly SAMPLE_RATES_MPEG2 = [22050, 24000, 16000];
  private static readonly SAMPLE_RATES_MPEG25 = [11025, 12000, 8000];

  /** Minimum bytes buffered before a segment is released while the stream is still open. */
  private static readonly MIN_SEGMENT_BYTES = 4096;

  /**
   * Try to derive stream parameters from the bytes received so far.
   *
   * @param chunks Chunks accumulated from the start of the file, in arrival order.
   * @param totalSize Combined byte length of `chunks`.
   * @returns The parsed format info, or null when no valid frame has been found yet or
   *          the first frame is not completely buffered (call again with more data).
   */
  tryParseHeader(chunks: Uint8Array[], totalSize: number): FormatInfo | null {
    const buffer = Mp3FormatDecoder.concat(chunks, totalSize);

    // Need at least the 10-byte ID3v2 header before the tag probe is meaningful.
    if (buffer.length < 10) return null;

    // Scan for the first valid MPEG Layer III frame after any ID3v2 tag.
    const searchStart = Mp3FormatDecoder.id3v2Skip(buffer);
    const frameStart = Mp3FormatDecoder.findFrameSync(buffer, searchStart);
    if (frameStart < 0) return null;

    const h = Mp3FormatDecoder.decodeFrameHeader(buffer, frameStart);
    if (!h) return null;

    // The Xing/Info/VBRI tag and its TOC live inside the first frame. Wait until the
    // whole frame is buffered so the CBR/VBR decision and the seek table do not depend
    // on how the transport happened to split the first chunks.
    if (frameStart + h.frameSize > buffer.length) return null;

    const vbr = Mp3FormatDecoder.parseVbrHeader(buffer, frameStart, h);

    // "Xing"/"VBRI" → true VBR (size-based segments, no fixed blockAlign).
    // "Info" tag or no tag at all → CBR (frame-aligned blockAlign).
    const isVbr = vbr?.isVbr === true;

    let totalDuration: number | null = null;
    if (vbr && vbr.totalFrames > 0) {
      totalDuration = vbr.totalFrames * h.samplesPerFrame / h.sampleRate;
    }

    let seekData: Mp3VbrSeekData | null = null;
    if (vbr?.toc) {
      seekData = { kind: 'mp3-vbr', toc: vbr.toc, totalBytes: vbr.totalBytes };
    }

    return {
      sampleRate: h.sampleRate,
      channels: h.channels,
      bitsPerSample: 16, // conventional for MP3 (decoder handles the real format internally)
      byteRate: h.bitrateKbps * 125, // kbps → bytes/sec, taken from the first frame
      blockAlign: isVbr ? 0 : h.frameSize,
      totalDuration,
      audioDataOffset: frameStart, // file-absolute byte where audio frames begin
      seekData
    };
  }

  /**
   * Decide how many of the buffered bytes to hand to the decoder next.
   *
   * @param info Format info produced by tryParseHeader.
   * @param availableBytes Bytes currently buffered and not yet consumed.
   * @param requestedSize Preferred segment size in bytes.
   * @param streamComplete True when no further bytes will arrive.
   * @returns The segment size in bytes (never more than `availableBytes`), or 0 to wait
   *          for more data.
   */
  getAlignedSegmentSize(
    info: FormatInfo,
    availableBytes: number,
    requestedSize: number,
    streamComplete: boolean
  ): number {
    if (availableBytes <= 0) return 0;

    const minSize = Mp3FormatDecoder.MIN_SEGMENT_BYTES;

    if (info.blockAlign > 0) {
      // CBR: cut on multiples of the frame size.
      const frameBytes = info.blockAlign;
      const minFrames = Math.ceil(minSize / frameBytes);
      const availableFrames = Math.floor(availableBytes / frameBytes);
      if (!streamComplete && availableFrames < minFrames) return 0;

      const requestedFrames = Math.floor(Math.min(requestedSize, availableBytes) / frameBytes);
      const frames = Math.max(streamComplete ? 1 : minFrames, requestedFrames);

      // At end of stream fewer bytes than one frame may remain (truncated frame or
      // trailing tag); flush exactly what is left instead of reporting a size larger
      // than the buffer holds.
      return Math.min(frames * frameBytes, availableBytes);
    }

    // VBR: frame sizes vary, so segments are sized by byte count only.
    if (!streamComplete && availableBytes < minSize) return 0;
    return Math.min(requestedSize, availableBytes);
  }

  /** Pass the raw frame bytes through unchanged; no container wrapping is applied. */
  wrapSegment(_info: FormatInfo, rawBytes: Uint8Array): Uint8Array {
    return rawBytes;
  }

  /**
   * Map a playback position to a file-absolute byte offset.
   *
   * Uses the Xing TOC when duration, TOC and total byte count are all known; otherwise
   * falls back to the byte-rate estimate. Positions outside [0, duration] are clamped.
   *
   * @param info Format info produced by tryParseHeader.
   * @param positionSeconds Target position in seconds.
   * @returns Byte offset at or after `info.audioDataOffset`.
   */
  calculateByteOffset(info: FormatInfo, positionSeconds: number): number {
    const duration = info.totalDuration;
    const mp3Vbr = info.seekData?.kind === 'mp3-vbr' ? info.seekData as Mp3VbrSeekData : null;

    if (duration == null || duration <= 0 || !mp3Vbr?.toc || mp3Vbr.totalBytes <= 0) {
      // No duration, no TOC, or no byte count — byteRate estimate.
      return Mp3FormatDecoder.byteRateOffset(info, positionSeconds);
    }

    // Negative or NaN positions seek to the start.
    const seconds = positionSeconds > 0 ? positionSeconds : 0;

    // Clamp the percentage to [0, 100] and the table index to the last entry (99), so
    // the final bucket still interpolates towards 256 (= end of stream).
    const percent = Math.min(100, seconds / duration * 100);
    const tocIdx = Math.min(99, Math.floor(percent));
    const tocFrac = percent - tocIdx;

    const t0 = mp3Vbr.toc[tocIdx];
    const t1 = tocIdx < 99 ? mp3Vbr.toc[tocIdx + 1] : 256;
    const byteFraction = (t0 + (t1 - t0) * tocFrac) / 256;

    return info.audioDataOffset + Math.floor(byteFraction * mp3Vbr.totalBytes);
  }

  /**
   * Seek estimate from the byte rate. Frame-aligns the result when blockAlign is known
   * (CBR); otherwise returns the raw byte position. Negative or NaN positions map to
   * the start of the audio data.
   */
  private static byteRateOffset(info: FormatInfo, positionSeconds: number): number {
    if (info.byteRate <= 0) return info.audioDataOffset;

    const seconds = positionSeconds > 0 ? positionSeconds : 0;
    const raw = Math.floor(seconds * info.byteRate);

    if (info.blockAlign > 0) {
      return info.audioDataOffset + Math.floor(raw / info.blockAlign) * info.blockAlign;
    }
    return info.audioDataOffset + raw;
  }

  /** Concatenate the accumulated chunks into one contiguous buffer (no copy for one chunk). */
  private static concat(chunks: Uint8Array[], totalSize: number): Uint8Array {
    if (chunks.length === 1) return chunks[0];

    const buffer = new Uint8Array(totalSize);
    let offset = 0;
    for (const chunk of chunks) {
      buffer.set(chunk, offset);
      offset += chunk.length;
    }
    return buffer;
  }

  /**
   * Return the byte offset past an ID3v2 tag if one is present at the buffer start,
   * else 0. The size field is a syncsafe big-endian uint28 (each byte's bit 7 is 0).
   */
  private static id3v2Skip(buffer: Uint8Array): number {
    if (buffer.length < 10) return 0;
    if (buffer[0] !== 0x49 || buffer[1] !== 0x44 || buffer[2] !== 0x33) return 0; // 'I' 'D' '3'

    const size = (buffer[6] << 21) | (buffer[7] << 14) | (buffer[8] << 7) | buffer[9];
    const hasFooter = (buffer[5] & 0x10) !== 0; // bit 4 of the flags byte
    return 10 + size + (hasFooter ? 10 : 0);
  }

  /**
   * Scan one byte at a time from `start` for the first position that begins a valid
   * MPEG Layer III frame header. Returns the offset, or -1 if none is found in the
   * available bytes (caller should wait for more data).
   */
  private static findFrameSync(buffer: Uint8Array, start: number): number {
    // A full frame header is 4 bytes.
    for (let i = Math.max(0, start); i + 4 <= buffer.length; i++) {
      if (buffer[i] !== 0xff) continue;
      if ((buffer[i + 1] & 0xe0) !== 0xe0) continue; // top 3 bits of byte 1 must be set

      const version = (buffer[i + 1] >> 3) & 3;
      const layer = (buffer[i + 1] >> 1) & 3;
      const bitrateIndex = buffer[i + 2] >> 4;
      const sampleRateIndex = (buffer[i + 2] >> 2) & 3;

      if (version === 1) continue; // 01 = reserved
      if (layer !== 1) continue; // 01 = Layer III
      if (bitrateIndex === 0 || bitrateIndex === 15) continue; // free / reserved
      if (sampleRateIndex === 3) continue; // reserved
      return i;
    }
    return -1;
  }

  /**
   * Decode the 4-byte frame header at `frameStart`. Returns null if the resolved
   * bitrate or sample rate is invalid (defensive — findFrameSync already validated
   * the indices).
   */
  private static decodeFrameHeader(buffer: Uint8Array, frameStart: number): Mp3FrameHeader | null {
    const b1 = buffer[frameStart + 1];
    const b2 = buffer[frameStart + 2];
    const b3 = buffer[frameStart + 3];

    const version = (b1 >> 3) & 3; // 3 = MPEG1, 2 = MPEG2, 0 = MPEG2.5
    const bitrateIndex = b2 >> 4;
    const sampleRateIndex = (b2 >> 2) & 3;
    const paddingBit = (b2 >> 1) & 1;
    const channelMode = b3 >> 6; // 0-2 = stereo variants, 3 = mono
    const isMpeg1 = version === 3;

    const bitrateKbps = isMpeg1
      ? Mp3FormatDecoder.BITRATES_MPEG1[bitrateIndex]
      : Mp3FormatDecoder.BITRATES_MPEG2[bitrateIndex];
    const sampleRate = version === 3
      ? Mp3FormatDecoder.SAMPLE_RATES_MPEG1[sampleRateIndex]
      : version === 2
        ? Mp3FormatDecoder.SAMPLE_RATES_MPEG2[sampleRateIndex]
        : Mp3FormatDecoder.SAMPLE_RATES_MPEG25[sampleRateIndex];
    if (!bitrateKbps || !sampleRate) return null;

    const channels = channelMode === 3 ? 1 : 2;
    const samplesPerFrame = isMpeg1 ? 1152 : 576;

    // Layer III frame length = samplesPerFrame / 8 * bitrate / sampleRate (+ padding):
    // coefficient 144 for MPEG-1 and 72 for MPEG-2/2.5, whose frames hold half the samples.
    const frameSize = Math.floor((samplesPerFrame / 8) * bitrateKbps * 1000 / sampleRate) + paddingBit;

    return { version, sampleRate, channels, channelMode, bitrateKbps, samplesPerFrame, frameSize };
  }

  /**
   * Detect a Xing/Info (VBR or CBR-with-info) or VBRI header inside the first frame.
   * Returns null when neither is present (pure CBR).
   */
  private static parseVbrHeader(
    buffer: Uint8Array,
    frameStart: number,
    h: { version: number; channelMode: number }
  ): Mp3VbrHeader | null {
    const isMpeg1 = h.version === 3;
    const isMono = h.channelMode === 3;

    // Side-info region size depends on version and channel count.
    const sideInfoOffset = isMpeg1
      ? (isMono ? 17 : 32)
      : (isMono ? 9 : 17);

    return Mp3FormatDecoder.parseXing(buffer, frameStart, sideInfoOffset)
      ?? Mp3FormatDecoder.parseVbri(buffer, frameStart);
  }

  /** Parse a Xing/Info tag located right after the side-info region. Returns null if absent. */
  private static parseXing(
    buffer: Uint8Array,
    frameStart: number,
    sideInfoOffset: number
  ): Mp3VbrHeader | null {
    const tagPos = frameStart + 4 + sideInfoOffset;
    if (tagPos + 8 > buffer.length) return null;

    const isVbr = Mp3FormatDecoder.matchAscii(buffer, tagPos, 'Xing');
    const isInfo = Mp3FormatDecoder.matchAscii(buffer, tagPos, 'Info');
    if (!isVbr && !isInfo) return null;

    const flags = Mp3FormatDecoder.readUint32BE(buffer, tagPos + 4);
    const result: Mp3VbrHeader = { isVbr, totalFrames: 0, totalBytes: 0, toc: null };

    // Fields are packed in flag order: frames, bytes, TOC, quality (quality is not read).
    let pos = tagPos + 8;

    if (flags & 0x1) { // frame count present
      if (pos + 4 > buffer.length) return result;
      result.totalFrames = Mp3FormatDecoder.readUint32BE(buffer, pos);
      pos += 4;
    }
    if (flags & 0x2) { // byte count present
      if (pos + 4 > buffer.length) return result;
      result.totalBytes = Mp3FormatDecoder.readUint32BE(buffer, pos);
      pos += 4;
    }
    // TOC present — only meaningful alongside a frame count. Copied so the table does
    // not alias the caller's buffer.
    if ((flags & 0x4) && (flags & 0x1) && pos + 100 <= buffer.length) {
      result.toc = buffer.slice(pos, pos + 100);
    }
    return result;
  }

  /** Parse a VBRI tag at its fixed position 32 bytes after the frame header. Returns null if absent. */
  private static parseVbri(buffer: Uint8Array, frameStart: number): Mp3VbrHeader | null {
    const pos = frameStart + 4 + 32;
    if (pos + 18 > buffer.length) return null;
    if (!Mp3FormatDecoder.matchAscii(buffer, pos, 'VBRI')) return null;

    const totalFrames = Mp3FormatDecoder.readUint32BE(buffer, pos + 14);
    // VBRI is always VBR but its TOC layout differs from Xing's percentage table;
    // we surface duration only and fall back to byteRate seek estimation.
    return { isVbr: true, totalFrames, totalBytes: 0, toc: null };
  }

  /** True when the bytes at `pos` equal the ASCII characters of `tag`. */
  private static matchAscii(buffer: Uint8Array, pos: number, tag: string): boolean {
    if (pos + tag.length > buffer.length) return false;
    for (let i = 0; i < tag.length; i++) {
      if (buffer[pos + i] !== tag.charCodeAt(i)) return false;
    }
    return true;
  }

  /** Read a big-endian unsigned 32-bit integer at `pos`. */
  private static readUint32BE(buffer: Uint8Array, pos: number): number {
    // `>>> 0` turns the signed 32-bit result of the bitwise ops back into an unsigned value.
    return ((buffer[pos] << 24) | (buffer[pos + 1] << 16) | (buffer[pos + 2] << 8) | buffer[pos + 3]) >>> 0;
  }
}