{
const char *c = s;
const char *e = s + strlen(s);
- const char *esc;
+ const char *esc = 0;
int h;
if(e == s)
return;
- while(1) {
+ while(c<e) {
h = *c++;
+ if (h & 0x80) {
+ // Start of UTF-8. But we sometimes have bad UTF-8 (#5909).
+ // So validate and handle bad characters.
+
+ // Number of top bits set indicates the total number of bytes.
+ const int num_bytes =
+ ((h & 240) == 240) ? 4 :
+ ((h & 224) == 224) ? 3 :
+ ((h & 192) == 192) ? 2 : 0;
+
+ if (!num_bytes) {
+ // Completely invalid sequence, so we replace it with a space.
+ htsbuf_append(hq, s, c - s - 1);
+ htsbuf_append(hq, " ", 1);
+ s=c;
+ continue;
+ } else {
+ // Start of valid UTF-8.
+ if (e - c < num_bytes - 1) {
+ // Invalid sequence - too few characters left in buffer for the sequence.
+ // Append what we already have accumulated and ignore remaining characters.
+ htsbuf_append(hq, s, c - s - 1);
+ break;
+ } else {
+ // We should probably check each character in the range is also valid.
+ htsbuf_append(hq, s, c - s - 1);
+ htsbuf_append(hq, c-1, num_bytes);
+ s=c-1;
+ s+=num_bytes;
+ c=s;
+ continue;
+ }
+ }
+ }
+
switch(h) {
case '<': esc = "<"; break;
case '>': esc = ">"; break;