relativize_to=None):
strings = []
while 1:
- token = tok.get().unescape()
+ token = tok.get().unescape_to_bytes()
if token.is_eol_or_eof():
break
if not (token.is_quoted_string() or token.is_identifier()):
raise dns.exception.SyntaxError("expected a string")
if len(token.value) > 255:
raise dns.exception.SyntaxError("string too long")
- value = token.value
- if isinstance(value, bytes):
- strings.append(value)
- else:
- strings.append(value.encode())
+ strings.append(token.value)
if len(strings) == 0:
raise dns.exception.UnexpectedEnd
return cls(rdclass, rdtype, strings)
unescaped += c
return Token(self.ttype, unescaped)
+ def unescape_to_bytes(self):
+ # We used to use unescape() for TXT-like records, but this
+ # caused problems as we'd process DNS escapes into Unicode code
+ # points instead of byte values, and then a to_text() of the
+ # processed data would not equal the original input. For
+ # example, \226 in the TXT record would have a to_text() of
+ # \195\162 because we applied UTF-8 encoding to Unicode code
+ # point 226.
+ #
+ # We now apply escapes while converting directly to bytes,
+ # avoiding this double encoding.
+ #
+ # This code also handles cases where the Unicode input has
+ # non-ASCII code points in it by converting it to UTF-8. TXT
+ # records aren't defined for Unicode, but this is the best we
+ # can do to preserve meaning. For example,
+ #
+ # foo\u200bbar
+ #
+ # (where \u200b is Unicode code point 0x200b) will be treated
+ # as if the input had been the UTF-8 encoding of that string,
+ # namely:
+ #
+ # foo\226\128\139bar
+ #
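+ # As an illustrative sketch (not part of this change), both
+ # raw Unicode and escaped inputs should yield the same bytes:
+ #
+ # Token(IDENTIFIER, 'foo\u200bbar').unescape_to_bytes().value
+ # == b'foo\xe2\x80\x8bbar'
+ # Token(IDENTIFIER, 'foo\\226\\128\\139bar').unescape_to_bytes().value
+ # == b'foo\xe2\x80\x8bbar'
+ #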
+ unescaped = b''
+ l = len(self.value)
+ i = 0
+ while i < l:
+ c = self.value[i]
+ i += 1
+ if c == '\\':
+ if i >= l:
+ raise dns.exception.UnexpectedEnd
+ c = self.value[i]
+ i += 1
+ if c.isdigit():
+ if i >= l:
+ raise dns.exception.UnexpectedEnd
+ c2 = self.value[i]
+ i += 1
+ if i >= l:
+ raise dns.exception.UnexpectedEnd
+ c3 = self.value[i]
+ i += 1
+ if not (c2.isdigit() and c3.isdigit()):
+ raise dns.exception.SyntaxError
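+ # \DDD is a three-digit decimal escape for a single byte
+ # (RFC 1035 section 5.1), so the value must fit in 0-255.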
+ codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
+ if codepoint > 255:
+     raise dns.exception.SyntaxError("escape codepoint out of range")
+ unescaped += b'%c' % codepoint
+ else:
+ # Note that as mentioned above, if c is a Unicode
+ # code point outside of the ASCII range, then this
+ # += is converting that code point to its UTF-8
+ # encoding and appending multiple bytes to
+ # unescaped.
+ unescaped += c.encode()
+ else:
+ unescaped += c.encode()
+ return Token(self.ttype, bytes(unescaped))
+
# compatibility for old-style tuple tokens
def __len__(self):
else:
self._unget_char(c)
break
- elif self.quoting:
- if c == '\\':
- c = self._get_char()
- if c == '':
- raise dns.exception.UnexpectedEnd
- if c.isdigit():
- c2 = self._get_char()
- if c2 == '':
- raise dns.exception.UnexpectedEnd
- c3 = self._get_char()
- if c == '':
- raise dns.exception.UnexpectedEnd
- if not (c2.isdigit() and c3.isdigit()):
- raise dns.exception.SyntaxError
- c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
- elif c == '\n':
+ elif self.quoting and c == '\n':
raise dns.exception.SyntaxError('newline in quoted string')
elif c == '\\':
#
self.assertEqual(str(ns.to_generic(origin=origin)),
r'\# 13 03666f6f076578616d706c6500')
+ def test_txt_unicode(self):
+ # TXT records are not defined for Unicode, but if we get
+ # Unicode we should convert it to UTF-8 to preserve meaning as
+ # best we can. Note that when the TXT record is rendered with
+ # to_text(), embedded UTF-8 is NOT converted back to
+ # Unicode; it's just treated as binary TXT data. Probably
+ # there should be a TXT-like record with an encoding field.
+ rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+ '"foo\u200bbar"')
+ self.assertEqual(str(rdata), '"foo\\226\\128\\139bar"')
+ # We used to encode UTF-8 in UTF-8 because we processed
+ # escapes in quoted strings immediately. This meant that the
+ # \\226 below would be inserted as Unicode code point 226, and
+ # then when we did to_text, we would UTF-8 encode that code
+ # point, emitting \\195\\162 instead of \\226, and thus
+ # from_text followed by to_text was not equal to the
+ # original input, as it ought to be.
+ rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+ '"foo\\226\\128\\139bar"')
+ self.assertEqual(str(rdata), '"foo\\226\\128\\139bar"')
+ # Our fix for TXT-like records uses a new tokenizer method,
+ # unescape_to_bytes(), which both interprets \DDD escapes as
+ # byte values and encodes non-ASCII code points as UTF-8, so
+ # inputs mixing Unicode and escapes round-trip correctly.
+ rdata = dns.rdata.from_text(dns.rdataclass.IN, dns.rdatatype.TXT,
+ '"foo\u200b\\123bar"')
+ self.assertEqual(str(rdata), '"foo\\226\\128\\139{bar"')
+
if __name__ == '__main__':
unittest.main()
def testQuotedString3(self):
tok = dns.tokenizer.Tokenizer(r'"\"foo\""')
token = tok.get()
- self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, '"foo"'))
+ self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, '\\"foo\\"'))
def testQuotedString4(self):
tok = dns.tokenizer.Tokenizer(r'"foo\010bar"')
token = tok.get()
- self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING, 'foo\x0abar'))
+ self.assertEqual(token, Token(dns.tokenizer.QUOTED_STRING,
+ 'foo\\010bar'))
def testQuotedString5(self):
def bad():