git.ipfire.org Git - thirdparty/dnspython.git/commitdiff
Do not double-UTF-8 encode escapes in TXT-like records. [Issue #321]
author     Bob Halley <halley@dnspython.org>
           Sun, 3 May 2020 13:34:53 +0000 (06:34 -0700)
committer  Bob Halley <halley@dnspython.org>
           Sun, 3 May 2020 13:34:53 +0000 (06:34 -0700)
dns/rdtypes/txtbase.py
dns/tokenizer.py
tests/test_rdata.py
tests/test_tokenizer.py
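
In rough terms, the fix makes escaped TXT data round-trip: a decimal escape like \226 is now applied directly as byte value 226 instead of first becoming Unicode code point U+00E2 and then being UTF-8 encoded into the two bytes \195\162. A minimal check of that round trip using dnspython's public from_text()/to_text() entry points (a sketch of the expected behaviour, matching the new tests further down):

    import dns.rdata
    import dns.rdataclass
    import dns.rdatatype

    # \226\128\139 is the UTF-8 encoding of U+200B (zero-width space).
    rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
                                r'"foo\226\128\139bar"')
    # With this fix the escapes survive unchanged; previously \226 alone
    # came back as \195\162.
    print(rdata.to_text())  # "foo\226\128\139bar"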

diff --git a/dns/rdtypes/txtbase.py b/dns/rdtypes/txtbase.py
index 9c13ed9849d5bb300a57c84e0e844fcc4be79bce..dd928e65fdd3f6b664b94505c888f2eb73de5fea 100644
--- a/dns/rdtypes/txtbase.py
+++ b/dns/rdtypes/txtbase.py
@@ -60,18 +60,14 @@ class TXTBase(dns.rdata.Rdata):
                   relativize_to=None):
         strings = []
         while 1:
-            token = tok.get().unescape()
+            token = tok.get().unescape_to_bytes()
             if token.is_eol_or_eof():
                 break
             if not (token.is_quoted_string() or token.is_identifier()):
                 raise dns.exception.SyntaxError("expected a string")
             if len(token.value) > 255:
                 raise dns.exception.SyntaxError("string too long")
-            value = token.value
-            if isinstance(value, bytes):
-                strings.append(value)
-            else:
-                strings.append(value.encode())
+            strings.append(token.value)
         if len(strings) == 0:
             raise dns.exception.UnexpectedEnd
         return cls(rdclass, rdtype, strings)
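
After this change the strings passed to the TXT-like rdata constructor are always bytes: escapes have already been applied by unescape_to_bytes(), and any non-ASCII input has been UTF-8 encoded. A quick illustration (the .strings attribute is where TXTBase keeps the parsed <character-string>s; whether it is a list or a tuple depends on the dnspython version):

    import dns.rdata
    import dns.rdataclass
    import dns.rdatatype

    rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
                                '"hello" "w\u00f6rld"')
    # Each <character-string> is stored as bytes; the non-ASCII 'ö' is
    # kept as its UTF-8 encoding.
    print(rdata.strings)  # e.g. [b'hello', b'w\xc3\xb6rld']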
diff --git a/dns/tokenizer.py b/dns/tokenizer.py
index 4232b3f2057a3e0c99e6ba6d8cb139535e89c886..57d4a572ba0955b030137826e389d2c1dfbad45e 100644
--- a/dns/tokenizer.py
+++ b/dns/tokenizer.py
@@ -131,6 +131,65 @@ class Token(object):
             unescaped += c
         return Token(self.ttype, unescaped)
 
+    def unescape_to_bytes(self):
+        # We used to use unescape() for TXT-like records, but this
+        # caused problems as we'd process DNS escapes into Unicode code
+        # points instead of byte values, and then a to_text() of the
+        # processed data would not equal the original input.  For
+        # example, \226 in the TXT record would have a to_text() of
+        # \195\162 because we applied UTF-8 encoding to Unicode code
+        # point 226.
+        #
+        # We now apply escapes while converting directly to bytes,
+        # avoiding this double encoding.
+        #
+        # This code also handles cases where the unicode input has
+        # non-ASCII code-points in it by converting it to UTF-8.  TXT
+        # records aren't defined for Unicode, but this is the best we
+        # can do to preserve meaning.  For example,
+        #
+        #     foo\u200bbar
+        #
+        # (where \u200b is Unicode code point 0x200b) will be treated
+        # as if the input had been the UTF-8 encoding of that string,
+        # namely:
+        #
+        #     foo\226\128\139bar
+        #
+        unescaped = b''
+        l = len(self.value)
+        i = 0
+        while i < l:
+            c = self.value[i]
+            i += 1
+            if c == '\\':
+                if i >= l:
+                    raise dns.exception.UnexpectedEnd
+                c = self.value[i]
+                i += 1
+                if c.isdigit():
+                    if i >= l:
+                        raise dns.exception.UnexpectedEnd
+                    c2 = self.value[i]
+                    i += 1
+                    if i >= l:
+                        raise dns.exception.UnexpectedEnd
+                    c3 = self.value[i]
+                    i += 1
+                    if not (c2.isdigit() and c3.isdigit()):
+                        raise dns.exception.SyntaxError
+                    unescaped += b'%c' % (int(c) * 100 + int(c2) * 10 + int(c3))
+                else:
+                    # Note that as mentioned above, if c is a Unicode
+                    # code point outside of the ASCII range, then this
+                    # += is converting that code point to its UTF-8
+                    # encoding and appending multiple bytes to
+                    # unescaped.
+                    unescaped += c.encode()
+            else:
+                unescaped += c.encode()
+        return Token(self.ttype, bytes(unescaped))
+
     # compatibility for old-style tuple tokens
 
     def __len__(self):
@@ -365,22 +424,7 @@ class Tokenizer(object):
                 else:
                     self._unget_char(c)
                 break
-            elif self.quoting:
-                if c == '\\':
-                    c = self._get_char()
-                    if c == '':
-                        raise dns.exception.UnexpectedEnd
-                    if c.isdigit():
-                        c2 = self._get_char()
-                        if c2 == '':
-                            raise dns.exception.UnexpectedEnd
-                        c3 = self._get_char()
-                        if c == '':
-                            raise dns.exception.UnexpectedEnd
-                        if not (c2.isdigit() and c3.isdigit()):
-                            raise dns.exception.SyntaxError
-                        c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
-                elif c == '\n':
+            elif self.quoting and c == '\n':
                     raise dns.exception.SyntaxError('newline in quoted string')
             elif c == '\\':
                 #
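
Taken together, the tokenizer changes move escape handling out of get(): inside quoted strings the backslash sequences are now returned verbatim, and callers pick unescape() for a str result or the new unescape_to_bytes() for a bytes result. A short sketch of the resulting behaviour, mirroring the updated tokenizer tests below:

    import dns.tokenizer

    token = dns.tokenizer.Tokenizer(r'"foo\010bar"').get()
    # get() now leaves the escape in place inside the quoted string.
    assert token.value == 'foo\\010bar'
    # unescape() applies \010 and stays in str.
    assert token.unescape().value == 'foo\nbar'
    # unescape_to_bytes() applies \010 and converts to bytes.
    assert token.unescape_to_bytes().value == b'foo\nbar'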
diff --git a/tests/test_rdata.py b/tests/test_rdata.py
index 63a30baa934554246ff8efd357a758ba38a85d23..3c5a9eeb9cc4f0c7dcd766a767f75c919372958f 100644
--- a/tests/test_rdata.py
+++ b/tests/test_rdata.py
@@ -77,5 +77,31 @@ class RdataTestCase(unittest.TestCase):
         self.assertEqual(str(ns.to_generic(origin=origin)),
                          r'\# 13 03666f6f076578616d706c6500')
 
+    def test_txt_unicode(self):
+        # TXT records are not defined for Unicode, but if we get
+        # Unicode we should convert it to UTF-8 to preserve meaning as
+        # best we can.  Note that when the TXT record is rendered
+        # with to_text(), it does NOT convert embedded UTF-8 back to
+        # Unicode; it's just treated as binary TXT data.  Probably
+        # there should be a TXT-like record with an encoding field.
+        rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+                                    '"foo\u200bbar"')
+        self.assertEqual(str(rdata), '"foo\\226\\128\\139bar"')
+        # We used to encode UTF-8 in UTF-8 because we processed
+        # escapes in quoted strings immediately.  This meant that the
+        # \\226 below would be inserted as Unicode code point 226, and
+        # then when we did to_text, we would UTF-8 encode that code
+        # point, emitting \\195\\162 instead of \\226, and thus
+        # from_text followed by to_text was not equal to the
+        # original input as it ought to be.
+        rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+                                    '"foo\\226\\128\\139bar"')
+        self.assertEqual(str(rdata), '"foo\\226\\128\\139bar"')
+        # Our fix for TXT-like records uses a new tokenizer method,
+        # unescape_to_bytes(), which both interprets \DDD escapes and UTF-8 encodes any Unicode.
+        rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+                                    '"foo\u200b\\123bar"')
+        self.assertEqual(str(rdata), '"foo\\226\\128\\139{bar"')
+
 if __name__ == '__main__':
     unittest.main()
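
One practical consequence, implied by test_txt_unicode: raw Unicode input and the equivalent \DDD spelling of its UTF-8 bytes now parse to the same rdata, so the two forms compare equal (rdata equality in dnspython comes down to the wire-format data):

    import dns.rdata
    import dns.rdataclass
    import dns.rdatatype

    a = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
                            '"foo\u200bbar"')
    b = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
                            r'"foo\226\128\139bar"')
    assert a == b  # both hold the bytes b'foo\xe2\x80\x8bbar'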
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 7fe7f85f92bb3bed9b71edd9174800aa7333785d..39c04361d915179c367fb8e9db1de5342bf09b5f 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -42,12 +42,13 @@ class TokenizerTestCase(unittest.TestCase):
     def testQuotedString3(self):
         tok = dns.tokenizer.Tokenizer(r'"\"foo\""')
         token = tok.get()
-        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, '"foo"'))
+        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, '\\"foo\\"'))
 
     def testQuotedString4(self):
         tok = dns.tokenizer.Tokenizer(r'"foo\010bar"')
         token = tok.get()
-        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, 'foo\x0abar'))
+        self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING,
+                                      'foo\\010bar'))
 
     def testQuotedString5(self):
         def bad():