From: Norm Raden Date: Mon, 12 Sep 2022 17:30:30 +0000 (-0400) Subject: Added more 'text modes' to the ATSC Multiple String Structure decoder and convert... X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d25c19d673136fbf8572e901ed3c3e871e8b6dd4;p=thirdparty%2Ftvheadend.git Added more 'text modes' to the ATSC Multiple String Structure decoder and convert text to UTF-8. (Fixes #5162) - Added support for decoding ATSC's "Multiple String Structure" text modes 0x1-0x6, 0x9-0x10, 0x20-0x27, 0x30-0x33. - Convert decoded text to UTF-8 instead of ISO-8859-1. - For unsupported 'compression types' or 'text modes' return a text string "[comptype=0x??,mode=0x??]" indicating the attempted compression type and text mode instead of the text segment. Text output from ATSC's "Multiple String Structure" decoder should properly render in web browsers, specifically Unicode characters >= 0x80. --- diff --git a/src/input/mpegts/dvb_support.c b/src/input/mpegts/dvb_support.c index e1718c120..2656c700a 100644 --- a/src/input/mpegts/dvb_support.c +++ b/src/input/mpegts/dvb_support.c @@ -421,15 +421,26 @@ atsc_utf16_to_utf8(const uint8_t *src, int len, char *buf, int buflen) *buf = 0; } +/* Decode and convert ATSC Multiple String Structures to UTF-8. + * + * Refer to "ATSC Standard: Program and System Information Protocol for Terrestrial Broadcast and Cable" + * (Document A65/2013), Section 6.10, pages 79-82 + */ + lang_str_t * atsc_get_string (const uint8_t *src, size_t srclen) { + const int bufferSize = 255*3+1; /* Max length of a text segment (255) * + * Max size of a UTF-8 sequence for Unicode characters 0x0 .. 
0x33FF (3) + + * NUL terminator (1) */ lang_str_t *ls = NULL; - int i, j, stringcount, segmentcount; + int i, j, k, stringcount, segmentcount; int compressiontype, mode, bytecount; char langcode[4]; - char buf[256]; + char buf[bufferSize]; /* UTF-8 string */ + int utf8size, remainingbufleft; + char *bufpointer; stringcount = src[0]; tvhtrace(LS_MPEGTS, "atsc-str: %d strings", stringcount); @@ -450,7 +461,9 @@ atsc_get_string src += 4; srclen -= 4; + /* Step through the list of text segments, decoding each and appending them all together. */ for (j = 0; j < segmentcount && srclen >= 3; j++) { + /* Decode text segment header */ compressiontype = src[0]; mode = src[1]; bytecount = src[2]; @@ -461,24 +474,70 @@ atsc_get_string if (bytecount > srclen) return ls; - if (mode == 0 && compressiontype == 0) { + /* Only supports compression type == 0 (none) and + * text modes == 0x0 .. 0x6, 0x9 .. 0x10, 0x20 .. 0x27, 0x30 .. 0x33 */ + + if (compressiontype == 0 && ( /* No Compression and one of these: */ + (mode >= 0x0 && mode <= 0x6) || /* Unicode range 0x0000 .. 0x06FF */ + (mode >= 0x9 && mode <= 0x10) || /* Unicode range 0x0900 .. 0x10FF */ + (mode >= 0x20 && mode <= 0x27) || /* Unicode range 0x2000 .. 0x27FF */ + (mode >= 0x30 && mode <= 0x33) /* Unicode range 0x3000 .. 0x33FF */ + )) { tvhtrace(LS_MPEGTS, "atsc-str: %d: comptype 0x%02x, mode 0x%02x, %d bytes: '%.*s'", j, compressiontype, mode, bytecount, bytecount, src); - memcpy(buf, src, bytecount); - buf[bytecount] = '\0'; + + /* Convert each decoded Unicode character to UTF-8. */ + for(k = 0, bufpointer = buf, remainingbufleft = bufferSize; k < bytecount; k++) { + /* Make sure there is enough buffer left for the next (or last) UTF-8 character + * [Max # bytes in a single UTF-8 character (3) + NUL terminator (1)] */ + if(remainingbufleft > (3+1)) { + /* Construct the Unicode character and convert to UTF-8. 
*/ + utf8size = put_utf8(bufpointer, (mode << 8) | src[k]); + bufpointer += utf8size; + remainingbufleft -= utf8size; + } else { + /* We have run out of buffer space for this text segment, so stop and truncate. + * This can only happen if 'bufferSize' is too small. */ + tvhtrace(LS_MPEGTS, "atsc_get_string: bufferSize is too small"); + break; + } + } + *bufpointer = '\0'; + if (ls == NULL) ls = lang_str_create(); lang_str_append(ls, buf, langcode); } else { + /* Unsupported text segment types: + * + * - compression type == 0x1 (Huffman-like coding with a fixed codebook) + * - compression type == 0x2 (Huffman-like coding with another fixed codebook) + * - compression type == 0x3 .. 0xFF (reserved) + * + * - text mode == 0x3E (Standard Compression Scheme for Unicode) + * - text mode == 0x3F (Select Unicode, UTF-16 Form) + * - text mode == 0x40, 0x41 (ATSC Standard for Taiwan) + * - text mode == 0x48 (ATSC Standard for South Korea) + * + * - text mode == 0x7, 0x8, 0x11 .. 0x1F, 0x28 .. 0x2F, 0x34 .. 0x3D, 0x42 .. 0x47, 0x49 .. 0xFF (reserved) + */ + tvhtrace(LS_MPEGTS, "atsc-str: %d: comptype 0x%02x, mode 0x%02x, %d bytes", j, compressiontype, mode, bytecount); + + /* For text segments that are not supported, write a terse diagnostic string indicating + * the unsupported type instead of the text segment. */ + snprintf(buf, bufferSize - 1, "[comptype=0x%02X,mode=0x%02X]", compressiontype, mode); + if (ls == NULL) + ls = lang_str_create(); + lang_str_append(ls, buf, langcode); } - /* FIXME: read compressed bytes */ - src += bytecount; srclen -= bytecount; // skip for now + /* Move on to the next text segment. */ + src += bytecount; + srclen -= bytecount; } } - return ls; }