*buf = 0;
}
+/* Decode and convert ATSC Multiple String Structures to UTF-8.
+ *
+ * refer to "ATSC Standard: Program and System Information Protocol for Terrestrial Broadcast and Cable"
+ * (Document A65/2013), Section 6.10, pages 79-82
+ */
+
lang_str_t *
atsc_get_string
(const uint8_t *src, size_t srclen)
{
+ const int bufferSize = 255*3+1; /* Max length of a text segment (255) *
+ * Max size of a UTF-8 for Unicode characters 0x0 .. 0x33FF (3) +
+ * NULL terminator (1) */
lang_str_t *ls = NULL;
- int i, j, stringcount, segmentcount;
+ int i, j, k, stringcount, segmentcount;
int compressiontype, mode, bytecount;
char langcode[4];
- char buf[256];
+ char buf[bufferSize]; /* UTF-8 string */
+ int utf8size, remainingbufleft;
+ char *bufpointer;
stringcount = src[0];
tvhtrace(LS_MPEGTS, "atsc-str: %d strings", stringcount);
src += 4;
srclen -= 4;
+ /* Step through the list of text segments, decoding each and append them all together. */
for (j = 0; j < segmentcount && srclen >= 3; j++) {
+ /* Decode text segment header */
compressiontype = src[0];
mode = src[1];
bytecount = src[2];
if (bytecount > srclen)
return ls;
- if (mode == 0 && compressiontype == 0) {
+ /* Only supports compression type == 0 (none) and
+ * text modes == 0x0 .. 0x6, 0x9 .. 0x10, 0x20 .. 0x27, 0x30 .. 0x33 */
+
+ if (compressiontype == 0 && ( /* No Compression and one of these: */
+ (mode >= 0x0 && mode <= 0x6) || /* Unicode range 0x0000 .. 0x06FF */
+ (mode >= 0x9 && mode <= 0x10) || /* Unicode range 0x0900 .. 0x10FF */
+ (mode >= 0x20 && mode <= 0x27) || /* Unicode range 0x2000 .. 0x27FF */
+ (mode >= 0x30 && mode <= 0x33) /* Unicode range 0x3000 .. 0x33FF */
+ )) {
tvhtrace(LS_MPEGTS, "atsc-str: %d: comptype 0x%02x, mode 0x%02x, %d bytes: '%.*s'",
j, compressiontype, mode, bytecount, bytecount, src);
- memcpy(buf, src, bytecount);
- buf[bytecount] = '\0';
+
+ /* Convert each decoded Unicode character to UTF-8. */
+ for(k = 0, bufpointer = buf, remainingbufleft = bufferSize; k < bytecount; k++) {
+ /* Make sure there is enough buffer left for the next (or last) UTF-8 character
+ * [Max # bytes in a single UTF-8 character (3) + NULL terminator (1)] */
+ if(remainingbufleft > (3+1)) {
+ /* Construct the Unicode character and convert to UTF-8. */
+ utf8size = put_utf8(bufpointer, (mode << 8) | src[k]);
+ bufpointer += utf8size;
+ remainingbufleft -= utf8size;
+ } else {
+ /* We have run out of buffer space for this text segment, then stop and truncate.
+ * This can only happen if 'bufferSize' is too small. */
+ tvhtrace(LS_MPEGTS, "atsc_get_string: bufferSize is too small");
+ break;
+ }
+ }
+ *bufpointer = '\0';
+
if (ls == NULL)
ls = lang_str_create();
lang_str_append(ls, buf, langcode);
} else {
+ /* Unsupported text segment types:
+ *
+ * - compression type == 0x1 (Huffman-like coding with a fixed codebook)
+ * - compression type == 0x2 (Huffman-like coding with another fixed codebook)
+ * - compression type == 0x3 .. 0xFF (reserved)
+ *
+ * - text mode == 0x3E (Standard Compression Scheme for Unicode)
+ * - text mode == 0x3F (Select Unicode, UTF-16 Form)
+ * - text mode == 0x40, 0x41 (ATSC Standard for Taiwan)
+ * - text mode == 0x48 (ATSC Standard for South Korea)
+ *
+ * - text mode == 0x7, 0x8, 0x11 .. 0x1F, 0x28 .. 0x2F, 0x34 .. 0x3D, 0x42 .. 0x47, 0x49 .. 0xFF (reserved)
+ */
+
tvhtrace(LS_MPEGTS, "atsc-str: %d: comptype 0x%02x, mode 0x%02x, %d bytes",
j, compressiontype, mode, bytecount);
+
+ /* For text segments that are not supported, write a terse diagnostic text indicating
+ * the unsupported type instead of the text segment. */
+ snprintf(buf, bufferSize - 1, "[comptype=0x%02X,mode=0x%02X]", compressiontype, mode);
+ if (ls == NULL)
+ ls = lang_str_create();
+ lang_str_append(ls, buf, langcode);
}
- /* FIXME: read compressed bytes */
- src += bytecount; srclen -= bytecount; // skip for now
+ /* Move on to the next text segment. */
+ src += bytecount;
+ srclen -= bytecount;
}
}
-
return ls;
}