From: Bob Halley
Date: Sun, 3 May 2020 13:34:53 +0000 (-0700)
Subject: Do not double-UTF-8 encode escapes in TXT-like records. [Issue #321]
X-Git-Tag: v2.0.0rc1~276
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=783deb045f82aa77a2229afe5001e59bbcfc553b;p=thirdparty%2Fdnspython.git

Do not double-UTF-8 encode escapes in TXT-like records. [Issue #321]
---

diff --git a/dns/rdtypes/txtbase.py b/dns/rdtypes/txtbase.py
index 9c13ed98..dd928e65 100644
--- a/dns/rdtypes/txtbase.py
+++ b/dns/rdtypes/txtbase.py
@@ -60,18 +60,14 @@ class TXTBase(dns.rdata.Rdata):
                   relativize_to=None):
         strings = []
         while 1:
-            token = tok.get().unescape()
+            token = tok.get().unescape_to_bytes()
             if token.is_eol_or_eof():
                 break
             if not (token.is_quoted_string() or token.is_identifier()):
                 raise dns.exception.SyntaxError("expected a string")
             if len(token.value) > 255:
                 raise dns.exception.SyntaxError("string too long")
-            value = token.value
-            if isinstance(value, bytes):
-                strings.append(value)
-            else:
-                strings.append(value.encode())
+            strings.append(token.value)
         if len(strings) == 0:
             raise dns.exception.UnexpectedEnd
         return cls(rdclass, rdtype, strings)
diff --git a/dns/tokenizer.py b/dns/tokenizer.py
index 4232b3f2..57d4a572 100644
--- a/dns/tokenizer.py
+++ b/dns/tokenizer.py
@@ -131,6 +131,65 @@ class Token(object):
                 unescaped += c
         return Token(self.ttype, unescaped)
 
+    def unescape_to_bytes(self):
+        # We used to use unescape() for TXT-like records, but this
+        # caused problems as we'd process DNS escapes into Unicode code
+        # points instead of byte values, and then a to_text() of the
+        # processed data would not equal the original input.  For
+        # example, \226 in the TXT record would have a to_text() of
+        # \195\162 because we applied UTF-8 encoding to Unicode code
+        # point 226.
+        #
+        # We now apply escapes while converting directly to bytes,
+        # avoiding this double encoding.
+        #
+        # This code also handles cases where the unicode input has
+        # non-ASCII code-points in it by converting it to UTF-8.  TXT
+        # records aren't defined for Unicode, but this is the best we
+        # can do to preserve meaning.  For example,
+        #
+        #     foo\u200bbar
+        #
+        # (where \u200b is Unicode code point 0x200b) will be treated
+        # as if the input had been the UTF-8 encoding of that string,
+        # namely:
+        #
+        #     foo\226\128\139bar
+        #
+        unescaped = b''
+        l = len(self.value)
+        i = 0
+        while i < l:
+            c = self.value[i]
+            i += 1
+            if c == '\\':
+                if i >= l:
+                    raise dns.exception.UnexpectedEnd
+                c = self.value[i]
+                i += 1
+                if c.isdigit():
+                    if i >= l:
+                        raise dns.exception.UnexpectedEnd
+                    c2 = self.value[i]
+                    i += 1
+                    if i >= l:
+                        raise dns.exception.UnexpectedEnd
+                    c3 = self.value[i]
+                    i += 1
+                    if not (c2.isdigit() and c3.isdigit()):
+                        raise dns.exception.SyntaxError
+                    unescaped += b'%c' % (int(c) * 100 + int(c2) * 10 + int(c3))
+                else:
+                    # Note that as mentioned above, if c is a Unicode
+                    # code point outside of the ASCII range, then this
+                    # += is converting that code point to its UTF-8
+                    # encoding and appending multiple bytes to
+                    # unescaped.
+                    unescaped += c.encode()
+            else:
+                unescaped += c.encode()
+        return Token(self.ttype, bytes(unescaped))
+
     # compatibility for old-style tuple tokens
 
     def __len__(self):
@@ -365,22 +424,7 @@ class Tokenizer(object):
                     else:
                         self._unget_char(c)
                         break
-                elif self.quoting:
-                    if c == '\\':
-                        c = self._get_char()
-                        if c == '':
-                            raise dns.exception.UnexpectedEnd
-                        if c.isdigit():
-                            c2 = self._get_char()
-                            if c2 == '':
-                                raise dns.exception.UnexpectedEnd
-                            c3 = self._get_char()
-                            if c == '':
-                                raise dns.exception.UnexpectedEnd
-                            if not (c2.isdigit() and c3.isdigit()):
-                                raise dns.exception.SyntaxError
-                            c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
-                    elif c == '\n':
-                        raise dns.exception.SyntaxError('newline in quoted string')
+                elif self.quoting and c == '\n':
+                    raise dns.exception.SyntaxError('newline in quoted string')
                 elif c == '\\':
                     #
diff --git a/tests/test_rdata.py b/tests/test_rdata.py
index 63a30baa..3c5a9eeb 100644
--- a/tests/test_rdata.py
+++ b/tests/test_rdata.py
@@ -77,5 +77,31 @@ class RdataTestCase(unittest.TestCase):
         self.assertEqual(str(ns.to_generic(origin=origin)),
                          r'\# 13 03666f6f076578616d706c6500')
 
+    def test_txt_unicode(self):
+        # TXT records are not defined for Unicode, but if we get
+        # Unicode we should convert it to UTF-8 to preserve meaning as
+        # best we can.  Note that when the TXT record is sent
+        # to_text(), it does NOT convert embedded UTF-8 back to
+        # Unicode; it's just treated as binary TXT data.  Probably
+        # there should be a TXT-like record with an encoding field.
+        rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+                                    '"foo\u200bbar"')
+        self.assertEqual(str(rdata), '"foo\\226\\128\\139bar"')
+        # We used to encode UTF-8 in UTF-8 because we processed
+        # escapes in quoted strings immediately.  This meant that the
+        # \\226 below would be inserted as Unicode code point 226, and
+        # then when we did to_text, we would UTF-8 encode that code
+        # point, emitting \\195\\162 instead of \\226, and thus
+        # from_text followed by to_text was not equal to the
+        # original input like it ought to be.
+        rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+                                    '"foo\\226\\128\\139bar"')
+        self.assertEqual(str(rdata), '"foo\\226\\128\\139bar"')
+        # Our fix for TXT-like records uses a new tokenizer method,
+        # unescape_to_bytes(), which both interprets DNS escapes and
+        # UTF-8 encodes non-ASCII code points in a single pass.
+        rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+                                    '"foo\u200b\\123bar"')
+        self.assertEqual(str(rdata), '"foo\\226\\128\\139{bar"')
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 7fe7f85f..39c04361 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -42,12 +42,13 @@ class TokenizerTestCase(unittest.TestCase):
     def testQuotedString3(self):
         tok = dns.tokenizer.Tokenizer(r'"\"foo\""')
         token = tok.get()
-        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, '"foo"'))
+        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, '\\"foo\\"'))
 
     def testQuotedString4(self):
         tok = dns.tokenizer.Tokenizer(r'"foo\010bar"')
         token = tok.get()
-        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, 'foo\x0abar'))
+        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING,
+                                      'foo\\010bar'))
 
     def testQuotedString5(self):
         def bad():
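---

For reviewers, a minimal sketch of the round-trip behavior this commit guarantees, distilled from the new tests above. This is not part of the patch; it assumes a dnspython build with this commit applied:

    import dns.rdata
    import dns.rdataclass
    import dns.rdatatype

    # DNS \DDD escapes now become byte values directly, so from_text
    # followed by to_text preserves the original escapes instead of
    # emitting \195\162 for \226 (the old double-UTF-8 encoding).
    rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
                                '"foo\\226\\128\\139bar"')
    assert str(rdata) == '"foo\\226\\128\\139bar"'

    # Unicode input is UTF-8 encoded and escapes are interpreted in the
    # same pass; chr(123) is '{', so \123 comes back as a literal brace.
    rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
                                '"foo\u200b\\123bar"')
    assert str(rdata) == '"foo\\226\\128\\139{bar"'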