From: Bob Halley
Date: Sun, 3 May 2020 13:34:53 +0000 (-0700)
Subject: Do not double-UTF-8 encode escapes in TXT-like records. [Issue #321]
X-Git-Tag: v2.0.0rc1~276
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=783deb045f82aa77a2229afe5001e59bbcfc553b;p=thirdparty%2Fdnspython.git

Do not double-UTF-8 encode escapes in TXT-like records. [Issue #321]
---

diff --git a/dns/rdtypes/txtbase.py b/dns/rdtypes/txtbase.py
index 9c13ed98..dd928e65 100644
--- a/dns/rdtypes/txtbase.py
+++ b/dns/rdtypes/txtbase.py
@@ -60,18 +60,14 @@ class TXTBase(dns.rdata.Rdata):
                   relativize_to=None):
         strings = []
         while 1:
-            token = tok.get().unescape()
+            token = tok.get().unescape_to_bytes()
             if token.is_eol_or_eof():
                 break
             if not (token.is_quoted_string() or token.is_identifier()):
                 raise dns.exception.SyntaxError("expected a string")
             if len(token.value) > 255:
                 raise dns.exception.SyntaxError("string too long")
-            value = token.value
-            if isinstance(value, bytes):
-                strings.append(value)
-            else:
-                strings.append(value.encode())
+            strings.append(token.value)
         if len(strings) == 0:
             raise dns.exception.UnexpectedEnd
         return cls(rdclass, rdtype, strings)
diff --git a/dns/tokenizer.py b/dns/tokenizer.py
index 4232b3f2..57d4a572 100644
--- a/dns/tokenizer.py
+++ b/dns/tokenizer.py
@@ -131,6 +131,65 @@ class Token(object):
                 unescaped += c
         return Token(self.ttype, unescaped)
 
+    def unescape_to_bytes(self):
+        # We used to use unescape() for TXT-like records, but this
+        # caused problems as we'd process DNS escapes into Unicode code
+        # points instead of byte values, and then a to_text() of the
+        # processed data would not equal the original input.  For
+        # example, \226 in the TXT record would have a to_text() of
+        # \195\162 because we applied UTF-8 encoding to Unicode code
+        # point 226.
+        #
+        # We now apply escapes while converting directly to bytes,
+        # avoiding this double encoding.
+        #
+        # This code also handles cases where the unicode input has
+        # non-ASCII code-points in it by converting it to UTF-8.  TXT
+        # records aren't defined for Unicode, but this is the best we
+        # can do to preserve meaning.  For example,
+        #
+        #     foo\u200bbar
+        #
+        # (where \u200b is Unicode code point 0x200b) will be treated
+        # as if the input had been the UTF-8 encoding of that string,
+        # namely:
+        #
+        #     foo\226\128\139bar
+        #
+        unescaped = b''
+        l = len(self.value)
+        i = 0
+        while i < l:
+            c = self.value[i]
+            i += 1
+            if c == '\\':
+                if i >= l:
+                    raise dns.exception.UnexpectedEnd
+                c = self.value[i]
+                i += 1
+                if c.isdigit():
+                    if i >= l:
+                        raise dns.exception.UnexpectedEnd
+                    c2 = self.value[i]
+                    i += 1
+                    if i >= l:
+                        raise dns.exception.UnexpectedEnd
+                    c3 = self.value[i]
+                    i += 1
+                    if not (c2.isdigit() and c3.isdigit()):
+                        raise dns.exception.SyntaxError
+                    unescaped += b'%c' % (int(c) * 100 + int(c2) * 10 + int(c3))
+                else:
+                    # Note that as mentioned above, if c is a Unicode
+                    # code point outside of the ASCII range, then this
+                    # += is converting that code point to its UTF-8
+                    # encoding and appending multiple bytes to
+                    # unescaped.
+                    unescaped += c.encode()
+            else:
+                unescaped += c.encode()
+        return Token(self.ttype, bytes(unescaped))
+
     # compatibility for old-style tuple tokens
 
     def __len__(self):
@@ -365,22 +424,7 @@ class Tokenizer(object):
                     else:
                         self._unget_char(c)
                         break
-                elif self.quoting:
-                    if c == '\\':
-                        c = self._get_char()
-                        if c == '':
-                            raise dns.exception.UnexpectedEnd
-                        if c.isdigit():
-                            c2 = self._get_char()
-                            if c2 == '':
-                                raise dns.exception.UnexpectedEnd
-                            c3 = self._get_char()
-                            if c == '':
-                                raise dns.exception.UnexpectedEnd
-                            if not (c2.isdigit() and c3.isdigit()):
-                                raise dns.exception.SyntaxError
-                            c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
-                    elif c == '\n':
-                        raise dns.exception.SyntaxError('newline in quoted string')
+                elif self.quoting and c == '\n':
+                    raise dns.exception.SyntaxError('newline in quoted string')
                 elif c == '\\':
                     #
diff --git a/tests/test_rdata.py b/tests/test_rdata.py
index 63a30baa..3c5a9eeb 100644
--- a/tests/test_rdata.py
+++ b/tests/test_rdata.py
@@ -77,5 +77,31 @@ class RdataTestCase(unittest.TestCase):
         self.assertEqual(str(ns.to_generic(origin=origin)),
                          r'\# 13 03666f6f076578616d706c6500')
 
+    def test_txt_unicode(self):
+        # TXT records are not defined for Unicode, but if we get
+        # Unicode we should convert it to UTF-8 to preserve meaning as
+        # best we can.  Note that when the TXT record is sent
+        # to_text(), it does NOT convert embedded UTF-8 back to
+        # Unicode; it's just treated as binary TXT data.  Probably
+        # there should be a TXT-like record with an encoding field.
+        rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+                                    '"foo\u200bbar"')
+        self.assertEqual(str(rdata), '"foo\\226\\128\\139bar"')
+        # We used to encode UTF-8 in UTF-8 because we processed
+        # escapes in quoted strings immediately.  This meant that the
+        # \\226 below would be inserted as Unicode code point 226, and
+        # then when we did to_text, we would UTF-8 encode that code
+        # point, emitting \\195\\162 instead of \\226, and thus
+        # from_text followed by to_text was not equal to the
+        # original input like it ought to be.
+        rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+                                    '"foo\\226\\128\\139bar"')
+        self.assertEqual(str(rdata), '"foo\\226\\128\\139bar"')
+        # Our fix for TXT-like records uses a new tokenizer method,
+        # unescape_to_bytes(), which both interprets DNS escapes and
+        # UTF-8 encodes non-ASCII code points in a single pass.
+        rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+                                    '"foo\u200b\\123bar"')
+        self.assertEqual(str(rdata), '"foo\\226\\128\\139{bar"')
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 7fe7f85f..39c04361 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -42,12 +42,13 @@ class TokenizerTestCase(unittest.TestCase):
     def testQuotedString3(self):
         tok = dns.tokenizer.Tokenizer(r'"\"foo\""')
         token = tok.get()
-        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, '"foo"'))
+        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, '\\"foo\\"'))
 
     def testQuotedString4(self):
         tok = dns.tokenizer.Tokenizer(r'"foo\010bar"')
         token = tok.get()
-        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, 'foo\x0abar'))
+        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING,
+                                      'foo\\010bar'))
 
     def testQuotedString5(self):
         def bad():
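---

For reviewers, a minimal sketch of the round-trip behavior this commit guarantees, distilled from the new tests above. This is not part of the patch; it assumes a dnspython build with this commit applied:

    import dns.rdata
    import dns.rdataclass
    import dns.rdatatype

    # DNS \DDD escapes now become byte values directly, so from_text
    # followed by to_text preserves the original escapes instead of
    # emitting \195\162 for \226 (the old double-UTF-8 encoding).
    rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
                                '"foo\\226\\128\\139bar"')
    assert str(rdata) == '"foo\\226\\128\\139bar"'

    # Unicode input is UTF-8 encoded and escapes are interpreted in the
    # same pass; chr(123) is '{', so \123 comes back as a literal brace.
    rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
                                '"foo\u200b\\123bar"')
    assert str(rdata) == '"foo\\226\\128\\139{bar"'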