Handle bad UTF-8 in xmltv (#5909)

author E.Smith <31170571+azlm8t@users.noreply.github.com>

Wed, 10 Jun 2020 21:30:14 +0000 (22:30 +0100)

committer Flole998 <Flole998@users.noreply.github.com>

Mon, 6 Jul 2020 15:06:15 +0000 (17:06 +0200)
author E.Smith <31170571+azlm8t@users.noreply.github.com>
Wed, 10 Jun 2020 21:30:14 +0000 (22:30 +0100)
committer Flole998 <Flole998@users.noreply.github.com>
Mon, 6 Jul 2020 15:06:15 +0000 (17:06 +0200)
diff --git a/src/htsbuf.c b/src/htsbuf.c

index 50f22a5006ba64d425af455b14c2bd1752986b46..c7c9b10f8bbce2cca3e12593d6bd0f0a973ccc1a 100644 (file)
--- a/src/htsbuf.c
+++ b/src/htsbuf.c
@@ -353,15 +353,50 @@ htsbuf_append_and_escape_xml(htsbuf_queue_t *hq, const char *s)
  {
    const char *c = s;
    const char *e = s + strlen(s);
-  const char *esc;
+  const char *esc = 0;
    int h;
  
    if(e == s)
      return;
  
-  while(1) {
+  while(c<e) {
      h = *c++;
  
+    if (h & 0x80) {
+      // Start of UTF-8.  But we sometimes have bad UTF-8 (#5909).
+      // So validate and handle bad characters.
+
+      // Number of top bits set indicates the total number of  bytes.
+      const int num_bytes =
+        ((h & 240) == 240) ? 4 :
+        ((h & 224) == 224) ? 3 :
+        ((h & 192) == 192) ? 2 : 0;
+
+      if (!num_bytes) {
+        // Completely invalid sequence, so we replace it with a space.
+        htsbuf_append(hq, s, c - s - 1);
+        htsbuf_append(hq, " ", 1);
+        s=c;
+        continue;
+      } else {
+        // Start of valid UTF-8.
+        if (e - c < num_bytes - 1) {
+          // Invalid sequence - too few characters left in buffer for the sequence.
+          // Append what we already have accumulated and ignore remaining characters.
+          htsbuf_append(hq, s, c - s - 1);
+          break;
+        } else {
+          // We should probably check each character in the range is also valid.
+          htsbuf_append(hq, s, c - s - 1);
+          htsbuf_append(hq, c-1, num_bytes);
+          s=c-1;
+          s+=num_bytes;
+          c=s;
+          continue;
+        }
+      }
+    }
+
      switch(h) {
      case '<':  esc = "&lt;";   break;
      case '>':  esc = "&gt;";   break;
author	E.Smith <31170571+azlm8t@users.noreply.github.com>
	Wed, 10 Jun 2020 21:30:14 +0000 (22:30 +0100)
committer	Flole998 <Flole998@users.noreply.github.com>
	Mon, 6 Jul 2020 15:06:15 +0000 (17:06 +0200)