From: Norm Raden <norm.raden+github@gmail.com>
Date: Mon, 12 Sep 2022 17:30:30 +0000 (-0400)
Subject: Added more 'text modes' to the ATSC Multiple String Structure decoder and convert... 
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d25c19d673136fbf8572e901ed3c3e871e8b6dd4;p=thirdparty%2Ftvheadend.git

Added more 'text modes' to the ATSC Multiple String Structure decoder and convert text to UTF-8. (Fixes #5162)

 - Added support for decoding ATSC's "Multiple String Structure" text modes 0x1-0x6, 0x9-0x10, 0x20-0x27, 0x30-0x33.
 - Convert decoded text to UTF-8 instead of ISO-8859-1.
 - For unsupported 'compression types' or 'text modes' return a text string "[comptype=0x??,mode=0x??]" indicating
   the attempted compression type and text mode instead of the text segment.

Text output from ATSC's "Multiple String Structure" decoder should properly render in web browsers, specifically Unicode characters >= 0x80.
---

diff --git a/src/input/mpegts/dvb_support.c b/src/input/mpegts/dvb_support.c
index e1718c120..2656c700a 100644
--- a/src/input/mpegts/dvb_support.c
+++ b/src/input/mpegts/dvb_support.c
@@ -421,15 +421,26 @@ atsc_utf16_to_utf8(const uint8_t *src, int len, char *buf, int buflen)
   *buf = 0;
 }
 
+/* Decode and convert ATSC Multiple String Structures to UTF-8.
+ * 
+ * refer to "ATSC Standard: Program and System Information Protocol for Terrestrial Broadcast and Cable"
+ *          (Document A65/2013), Section 6.10, pages 79-82
+ */
+
 lang_str_t *
 atsc_get_string
   (const uint8_t *src, size_t srclen)
 {
+  const int bufferSize = 255*3+1; /* Max length of a text segment (255) *
+                                   * Max size of a UTF-8 for Unicode characters 0x0 .. 0x33FF (3) +
+                                   * NULL terminator (1) */
   lang_str_t *ls = NULL;
-  int i, j, stringcount, segmentcount;
+  int i, j, k, stringcount, segmentcount;
   int compressiontype, mode, bytecount;
   char langcode[4];
-  char buf[256];
+  char buf[bufferSize]; /* UTF-8 string */
+  int utf8size, remainingbufleft;
+  char *bufpointer;
 
   stringcount = src[0];
   tvhtrace(LS_MPEGTS, "atsc-str: %d strings", stringcount);
@@ -450,7 +461,9 @@ atsc_get_string
     src    += 4;
     srclen -= 4;
 
+    /* Step through the list of text segments, decoding each and append them all together. */
     for (j = 0; j < segmentcount && srclen >= 3; j++) {
+      /* Decode text segment header */
       compressiontype = src[0];
       mode            = src[1];
       bytecount       = src[2];
@@ -461,24 +474,70 @@ atsc_get_string
       if (bytecount > srclen)
         return ls;
 
-      if (mode == 0 && compressiontype == 0) {
+      /* Only supports compression type == 0 (none) and 
+       * text modes == 0x0 .. 0x6, 0x9 .. 0x10, 0x20 .. 0x27, 0x30 .. 0x33 */
+
+      if (compressiontype == 0 && (     /* No Compression and one of these: */
+           (mode >= 0x0 && mode <= 0x6) ||   /* Unicode range 0x0000 .. 0x06FF */
+           (mode >= 0x9 && mode <= 0x10) ||  /* Unicode range 0x0900 .. 0x10FF */
+           (mode >= 0x20 && mode <= 0x27) || /* Unicode range 0x2000 .. 0x27FF */
+           (mode >= 0x30 && mode <= 0x33)    /* Unicode range 0x3000 .. 0x33FF */
+         )) {
         tvhtrace(LS_MPEGTS, "atsc-str:    %d: comptype 0x%02x, mode 0x%02x, %d bytes: '%.*s'",
                  j, compressiontype, mode, bytecount, bytecount, src);
-        memcpy(buf, src, bytecount);
-        buf[bytecount] = '\0';
+
+        /* Convert each decoded Unicode character to UTF-8. */
+        for(k = 0, bufpointer = buf, remainingbufleft = bufferSize; k < bytecount; k++) {
+          /* Make sure there is enough buffer left for the next (or last) UTF-8 character
+           * [Max # bytes in a single UTF-8 character (3) + NULL terminator (1)] */
+          if(remainingbufleft > (3+1)) {
+            /* Construct the Unicode character and convert to UTF-8. */
+            utf8size = put_utf8(bufpointer, (mode << 8) | src[k]);
+            bufpointer += utf8size;
+            remainingbufleft -= utf8size;
+          } else {
+            /* We have run out of buffer space for this text segment, then stop and truncate.
+             * This can only happen if 'bufferSize' is too small. */
+            tvhtrace(LS_MPEGTS, "atsc_get_string: bufferSize is too small");
+            break;
+          }
+        }
+        *bufpointer = '\0';
+
         if (ls == NULL)
           ls = lang_str_create();
         lang_str_append(ls, buf, langcode);
       } else {
+        /* Unsupported text segment types:
+         *
+         * - compression type == 0x1 (Huffman-like coding with a fixed codebook)
+         * - compression type == 0x2 (Huffman-like coding with another fixed codebook)
+         * - compression type == 0x3 .. 0xFF (reserved)
+         *
+         * - text mode == 0x3E (Standard Compression Scheme for Unicode)
+         * - text mode == 0x3F (Select Unicode, UTF-16 Form)
+         * - text mode == 0x40, 0x41 (ATSC Standard for Taiwan)
+         * - text mode == 0x48 (ATSC Standard for South Korea)
+         *
+         * - text mode == 0x7, 0x8, 0x11 .. 0x1F, 0x28 .. 0x2F, 0x34 .. 0x3D, 0x42 .. 0x47, 0x49 .. 0xFF (reserved)
+         */
+
         tvhtrace(LS_MPEGTS, "atsc-str:    %d: comptype 0x%02x, mode 0x%02x, %d bytes",
                  j, compressiontype, mode, bytecount);
+
+        /* For text segments that are not supported, write a terse diagnostic text indicating
+         * the unsupported type instead of the text segment. */
+        snprintf(buf, bufferSize - 1, "[comptype=0x%02X,mode=0x%02X]", compressiontype, mode);
+        if (ls == NULL)
+          ls = lang_str_create();
+        lang_str_append(ls, buf, langcode);
       }
 
-      /* FIXME: read compressed bytes */
-      src += bytecount; srclen -= bytecount; // skip for now
+      /* Move on to the next text segment. */
+      src += bytecount;
+      srclen -= bytecount;
     }
   }
-
   return ls;
 }