Skip to content

Commit

Permalink
fix(TTML): Correctly handle multiple samples in a segment (#8088)
Browse files Browse the repository at this point in the history
Fixes #8087

Implements handling of multiple samples in a MP4/ISOBMFF/DASH TTML
segment/fragment. Such segments are allowed by ISO14496-12 and
ISO23000-19. gpac creates such segments. The prior code just treated the
full MDAT as one TTML XML document and tried to parse it as a whole
without accounting for individual samples. A test case is included which was
created by taking the testdata from ttml-segment.mp4 and splitting the
subtitles into two independent TTML-XML documents, which then were put
as individual samples.

The test data for the pre-existing multiple-MDAT test case was invalid.
It was created by taking the same ttml-segment.mp4 as a source and just
duplicating the MDAT box, but without then also fixing the TRUN box. The
duplicated data was thus not referenced. The test case still worked,
because the prior code did not look at the TRUN box and the sample
specification at all and simply treated each full MDAT box as one sample. The
testdata was replaced with a new file, which is basically the same as
for the multiple samples case, but with the two samples split into two
MDAT boxes.
  • Loading branch information
julijane authored and avelad committed Feb 17, 2025
1 parent 33037d1 commit 4446d4d
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 34 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Jesper Haug Karsrud <[email protected]>
Johan Sundström <[email protected]>
Jonas Birmé <[email protected]>
Jozef Chúťka <[email protected]>
Juliane Holzt <[email protected]>
Jun Hong Chong <[email protected]>
Jürgen Kartnaller <[email protected]>
Justin Swaney <[email protected]>
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTORS
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ Jono Ward <[email protected]>
Jozef Chúťka <[email protected]>
Juan Manuel Tomás <[email protected]>
Julian Domingo <[email protected]>
Juliane Holzt <[email protected]>
Jun Hong Chong <[email protected]>
Jürgen Kartnaller <[email protected]>
Justin Swaney <[email protected]>
Expand Down
110 changes: 81 additions & 29 deletions lib/text/mp4_ttml_parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@

goog.provide('shaka.text.Mp4TtmlParser');

goog.require('goog.asserts');
goog.require('shaka.text.TextEngine');
goog.require('shaka.text.TtmlTextParser');
goog.require('shaka.util.BufferUtils');
goog.require('shaka.util.Error');
goog.require('shaka.util.Mp4BoxParsers');
goog.require('shaka.util.Mp4Parser');
goog.require('shaka.util.Uint8ArrayUtils');

Expand Down Expand Up @@ -80,67 +82,117 @@ shaka.text.Mp4TtmlParser = class {
parseMedia(data, time, uri) {
const Mp4Parser = shaka.util.Mp4Parser;

let sawMDAT = false;
let payload = [];
let defaultSampleSize = null;

/** @type {!Array<Uint8Array>} */
const mdats = [];

/* @type {!Map<number,!Array<number>>} */
const subSampleSizesPerSample = new Map();

/** @type {!Array<number>} */
let subSizes = [];
const sampleSizes = [];

const parser = new Mp4Parser()
.box('moof', Mp4Parser.children)
.box('traf', Mp4Parser.children)
.fullBox('tfhd', (box) => {
goog.asserts.assert(
box.flags != null,
'A TFHD box should have a valid flags value');
const parsedTFHDBox = shaka.util.Mp4BoxParsers.parseTFHD(
box.reader, box.flags);
defaultSampleSize = parsedTFHDBox.defaultSampleSize;
})
.fullBox('trun', (box) => {
goog.asserts.assert(
box.version != null,
'A TRUN box should have a valid version value');
goog.asserts.assert(
box.flags != null,
'A TRUN box should have a valid flags value');

const parsedTRUNBox = shaka.util.Mp4BoxParsers.parseTRUN(
box.reader, box.version, box.flags);

for (const sample of parsedTRUNBox.sampleData) {
const sampleSize =
sample.sampleSize || defaultSampleSize || 0;
sampleSizes.push(sampleSize);
}
})
.fullBox('subs', (box) => {
subSizes = [];
const reader = box.reader;
const entryCount = reader.readUint32();
let currentSampleNum = -1;
for (let i = 0; i < entryCount; i++) {
reader.readUint32(); // sample_delta
const sampleDelta = reader.readUint32();
currentSampleNum += sampleDelta;
const subsampleCount = reader.readUint16();
const subsampleSizes = [];
for (let j = 0; j < subsampleCount; j++) {
if (box.version == 1) {
subSizes.push(reader.readUint32());
subsampleSizes.push(reader.readUint32());
} else {
subSizes.push(reader.readUint16());
subsampleSizes.push(reader.readUint16());
}
reader.readUint8(); // priority
reader.readUint8(); // discardable
reader.readUint32(); // codec_specific_parameters
}
subSampleSizesPerSample.set(currentSampleNum, subsampleSizes);
}
})
.box('mdat', Mp4Parser.allData((data) => {
sawMDAT = true;
// Join this to any previous payload, in case the mp4 has multiple
// mdats.
if (subSizes.length) {
const contentData =
shaka.util.BufferUtils.toUint8(data, 0, subSizes[0]);
const images = [];
let offset = subSizes[0];
for (let i = 1; i < subSizes.length; i++) {
const imageData =
shaka.util.BufferUtils.toUint8(data, offset, subSizes[i]);
const raw =
shaka.util.Uint8ArrayUtils.toStandardBase64(imageData);
images.push('data:image/png;base64,' + raw);
offset += subSizes[i];
}
payload = payload.concat(
this.parser_.parseMedia(contentData, time, uri, images));
} else {
payload = payload.concat(
this.parser_.parseMedia(data, time, uri, /* images= */ []));
}
// We collect all of the mdats first, before parsing any of them.
// This is necessary in case the mp4 has multiple mdats.
mdats.push(data);
}));
parser.parse(data, /* partialOkay= */ false);

if (!sawMDAT) {
if (mdats.length == 0) {
throw new shaka.util.Error(
shaka.util.Error.Severity.CRITICAL,
shaka.util.Error.Category.TEXT,
shaka.util.Error.Code.INVALID_MP4_TTML);
}

const fullData =
shaka.util.Uint8ArrayUtils.concat(...mdats);

let sampleOffset = 0;
for (let sampleNum = 0; sampleNum < sampleSizes.length; sampleNum++) {
const sampleData =
shaka.util.BufferUtils.toUint8(fullData, sampleOffset,
sampleSizes[sampleNum]);
sampleOffset += sampleSizes[sampleNum];

const subSampleSizes = subSampleSizesPerSample.get(sampleNum);

if (subSampleSizes && subSampleSizes.length) {
const contentData =
shaka.util.BufferUtils.toUint8(sampleData, 0, subSampleSizes[0]);
const images = [];
let subOffset = subSampleSizes[0];
for (let i = 1; i < subSampleSizes.length; i++) {
const imageData =
shaka.util.BufferUtils.toUint8(data, subOffset,
subSampleSizes[i]);
const raw =
shaka.util.Uint8ArrayUtils.toStandardBase64(imageData);
images.push('data:image/png;base64,' + raw);
subOffset += subSampleSizes[i];
}
payload = payload.concat(
this.parser_.parseMedia(contentData, time, uri, images));
} else {
payload = payload.concat(
this.parser_.parseMedia(sampleData, time, uri,
/* images= */ []));
}
}

return payload;
}
};
Expand Down
Binary file modified test/test/assets/ttml-segment-multiple-mdat.mp4
Binary file not shown.
Binary file added test/test/assets/ttml-segment-multiple-sample.mp4
Binary file not shown.
32 changes: 27 additions & 5 deletions test/text/mp4_ttml_parser_unit.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ describe('Mp4TtmlParser', () => {
const ttmlSegmentUri = '/base/test/test/assets/ttml-segment.mp4';
const ttmlSegmentMultipleMDATUri =
'/base/test/test/assets/ttml-segment-multiple-mdat.mp4';
const ttmlSegmentMultipleSampleUri =
'/base/test/test/assets/ttml-segment-multiple-sample.mp4';
const imscImageInitSegmentUri =
'/base/test/test/assets/imsc-image-init.cmft';
const imscImageSegmentUri =
Expand All @@ -22,6 +24,8 @@ describe('Mp4TtmlParser', () => {
/** @type {!Uint8Array} */
let ttmlSegmentMultipleMDAT;
/** @type {!Uint8Array} */
let ttmlSegmentMultipleSample;
/** @type {!Uint8Array} */
let imscImageInitSegment;
/** @type {!Uint8Array} */
let imscImageSegment;
Expand All @@ -33,16 +37,18 @@ describe('Mp4TtmlParser', () => {
shaka.test.Util.fetch(ttmlInitSegmentUri),
shaka.test.Util.fetch(ttmlSegmentUri),
shaka.test.Util.fetch(ttmlSegmentMultipleMDATUri),
shaka.test.Util.fetch(ttmlSegmentMultipleSampleUri),
shaka.test.Util.fetch(imscImageInitSegmentUri),
shaka.test.Util.fetch(imscImageSegmentUri),
shaka.test.Util.fetch(audioInitSegmentUri),
]);
ttmlInitSegment = shaka.util.BufferUtils.toUint8(responses[0]);
ttmlSegment = shaka.util.BufferUtils.toUint8(responses[1]);
ttmlSegmentMultipleMDAT = shaka.util.BufferUtils.toUint8(responses[2]);
imscImageInitSegment = shaka.util.BufferUtils.toUint8(responses[3]);
imscImageSegment = shaka.util.BufferUtils.toUint8(responses[4]);
audioInitSegment = shaka.util.BufferUtils.toUint8(responses[5]);
ttmlSegmentMultipleSample = shaka.util.BufferUtils.toUint8(responses[3]);
imscImageInitSegment = shaka.util.BufferUtils.toUint8(responses[4]);
imscImageSegment = shaka.util.BufferUtils.toUint8(responses[5]);
audioInitSegment = shaka.util.BufferUtils.toUint8(responses[6]);
});

it('parses init segment', () => {
Expand All @@ -62,8 +68,24 @@ describe('Mp4TtmlParser', () => {
expect(ret[0].nestedCues.length).toBe(1);
expect(ret[1].nestedCues.length).toBe(1);
// Cues.
expect(ret[0].nestedCues[0].nestedCues.length).toBe(10);
expect(ret[1].nestedCues[0].nestedCues.length).toBe(10);
expect(ret[0].nestedCues[0].nestedCues.length).toBe(5);
expect(ret[1].nestedCues[0].nestedCues.length).toBe(5);
});

// Verifies parsing of a segment whose single MDAT holds two TTML samples
// (fixture presumably derived from ttml-segment.mp4 split into two
// documents — see commit description; confirm against the asset).
it('handles media segments with multiple sample', () => {
const parser = new shaka.text.Mp4TtmlParser();
parser.parseInit(ttmlInitSegment);
const time =
{periodStart: 0, segmentStart: 0, segmentEnd: 60, vttOffset: 0};
const ret = parser.parseMedia(ttmlSegmentMultipleSample, time, null);
// Bodies.
expect(ret.length).toBe(2);
// Divs.
expect(ret[0].nestedCues.length).toBe(1);
expect(ret[1].nestedCues.length).toBe(1);
// Cues.
expect(ret[0].nestedCues[0].nestedCues.length).toBe(5);
expect(ret[1].nestedCues[0].nestedCues.length).toBe(5);
});

it('accounts for offset', () => {
Expand Down

0 comments on commit 4446d4d

Please sign in to comment.