From: Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) Date: Tue, 10 Jan 2023 14:25:06 +0000 (+0000) Subject: Pull request #3713: js_norm: decode UTF-16BE to UTF-8 for JS in PDF X-Git-Tag: 3.1.51.0~4 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a6aabc4eb5a8a35ee30369320c2048828ccdf0e3;p=thirdparty%2Fsnort3.git Pull request #3713: js_norm: decode UTF-16BE to UTF-8 for JS in PDF Merge in SNORT/snort3 from ~OSERHIIE/snort3:jsn_pdf_utf16 to master Squashed commit of the following: commit 0687ef21316f44f413bdfe8287d8893ce5138e3c Author: Oleksandr Serhiienko Date: Thu Dec 15 15:41:25 2022 +0100 js_norm: decode UTF-16BE to UTF-8 for JS in PDF * js_norm: support UTF-16BE in text strings, hexadecimal strings and streams * js_norm: add unit test coverage * lua: fixup in snort_defaults.lua --- diff --git a/lua/snort_defaults.lua b/lua/snort_defaults.lua index af51352d1..76e845971 100644 --- a/lua/snort_defaults.lua +++ b/lua/snort_defaults.lua @@ -1176,7 +1176,7 @@ default_low_port_scan = } --------------------------------------------------------------------------- --- default http configuration +-- default js_norm configuration --------------------------------------------------------------------------- -- ECMAScript Standard Built-in Objects and Functions Names (Identifiers) diff --git a/src/js_norm/pdf_tokenizer.h b/src/js_norm/pdf_tokenizer.h index bd4c20b0b..ab7bc2170 100644 --- a/src/js_norm/pdf_tokenizer.h +++ b/src/js_norm/pdf_tokenizer.h @@ -64,6 +64,9 @@ private: PDFRet h_lit_unescape(); PDFRet h_lit_oct2chr(); PDFRet h_hex_hex2chr(); + PDFRet h_hex_hex2chr_u16(); + PDFRet h_lit_u16(); + PDFRet h_lit_u16_unescape(); PDFRet h_stream_open(); PDFRet h_stream(); bool h_stream_close(); @@ -71,6 +74,13 @@ private: void h_ref(); void h_ind_obj_open(); inline void h_ind_obj_close(); + void h_u16_start(); + void h_u16_break(); + void h_u16_hex_start(); + void h_u16_hex_break(); + + PDFRet u16_eval(uint8_t byte); + void u16_to_u8(uint32_t code); struct ObjectString { @@ -117,6 +127,14 @@ private: DictionaryEntry obj_entry; Stream obj_stream; std::unordered_set js_stream_refs; + + // represents UTF-16BE code point + struct + { + uint16_t high = 0; + uint16_t low = 0; + int cur_byte = 0; + } u16_state; }; bool PDFTokenizer::h_lit_str() diff --git a/src/js_norm/pdf_tokenizer.l b/src/js_norm/pdf_tokenizer.l index 6224c39bb..6c13a9cdd 100644 --- a/src/js_norm/pdf_tokenizer.l +++ b/src/js_norm/pdf_tokenizer.l @@ -101,6 +101,13 @@ LIT_STR_ESC_EOL \\[\x0d\x0a]|\\\x0d\x0a LIT_STR_EOL [\x0d\x0a]|\x0d\x0a LIT_STR_BODY [^\\\(\)]{1,64} +/* 7.9.2.2 Text String Type, UTF-16BE */ +/* RFC 2781: 4.3 Interpreting text labelled as UTF-16 */ +U16_BOM \xfe\xff +U16_BOM_HEX FE{HEX_STR_SKIP}*FF +LIT_STR_U16_UNESC \\[(\)\\nrtbf] +LIT_STR_U16_BODY [^\\\(\)]{1,16} + /* 7.3.4.3 Hexadecimal Strings */ HEX_STR_BODY [0-9A-Fa-f]{1,64} HEX_STR_SKIP [^0-9A-Fa-f>]{1,64} @@ -138,7 +145,7 @@ OBJ_REFERENCE {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE} SKIP [^[:digit:]%]{1,64}|. WHITESPACE {GRP_WHITESPACE}{1,64} -/* Start conditions: INITIAL or inside dictionary, literal string, hexadecimal string */ +/* Start conditions: INITIAL or inside dictionary, literal string, hexadecimal string, stream */ %x indobj %x stream %x dictnr @@ -148,6 +155,13 @@ WHITESPACE {GRP_WHITESPACE}{1,64} %x jshstr %x jsstream +/* Start conditions: UTF-16BE BOM, UTF-16BE literal string, UTF-16BE hexadecimal string, UTF-16BE stream */ +%x u16 +%x u16hex +%x jsstru16 +%x jshstru16 +%x jsstreamu16 + %% {SKIP} { } @@ -158,11 +172,13 @@ WHITESPACE {GRP_WHITESPACE}{1,64} {WHITESPACE} { } {INDIRECT_OBJ_CLOSE} { POP(); h_ind_obj_close(); } -{OBJ_STREAM_OPEN} { EXEC(h_stream_open()) PUSH(obj_stream.is_js ? jsstream : stream); } +{OBJ_STREAM_OPEN} { EXEC(h_stream_open()) PUSH(obj_stream.is_js ? u16 : stream); } {OBJ_STREAM_SKIP} { EXEC(h_stream()) } {OBJ_STREAM_SKIP} { EXEC(h_stream()) ECHO; } +{OBJ_STREAM_SKIP} { EXEC(h_stream()) EXEC(h_lit_u16()) } {OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } {OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } +{OBJ_STREAM_CLOSE} { if (h_stream_close()) POP(); } {OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()) } {OBJ_DICT_OPEN} { PUSH(dictnr); EXEC(h_dict_open()) } @@ -195,7 +211,7 @@ WHITESPACE {GRP_WHITESPACE}{1,64} {HEX_STR_BODY} { } {HEX_STR_SKIP} { } -{OBJ_LIT_STR_OPEN} { if (!h_lit_open()) ECHO; } +{OBJ_LIT_STR_OPEN} { if (!h_lit_open()) ECHO; else PUSH(u16); } {OBJ_LIT_STR_CLOSE} { if (h_lit_close()) POP(); else ECHO; } {LIT_STR_ESC} { EXEC(h_lit_unescape()) } {LIT_STR_ESC_OCT} { EXEC(h_lit_oct2chr()) } @@ -203,10 +219,22 @@ WHITESPACE {GRP_WHITESPACE}{1,64} {LIT_STR_EOL} { ECHO; } {LIT_STR_BODY} { ECHO; } -{OBJ_HEX_STR_OPEN} { } -{OBJ_HEX_STR_CLOSE} { POP(); } +{U16_BOM} { h_u16_start(); } +.|\n { h_u16_break(); } + +{OBJ_LIT_STR_CLOSE} { if (h_lit_close()) POP(); } +{LIT_STR_ESC_EOL} { } +{LIT_STR_U16_UNESC} { EXEC(h_lit_u16_unescape()) } +{LIT_STR_U16_BODY} { EXEC(h_lit_u16()) } + +{U16_BOM_HEX} { h_u16_hex_start(); } +.|\n { h_u16_hex_break(); } + +{OBJ_HEX_STR_OPEN} { PUSH(u16hex); } +{OBJ_HEX_STR_CLOSE} { POP(); } {HEX_STR_BODY} { EXEC(h_hex_hex2chr()) } -{HEX_STR_SKIP} { } +{HEX_STR_BODY} { EXEC(h_hex_hex2chr_u16()) } +{HEX_STR_SKIP} { } <*><> { return PDFRet::EOS; } @@ -276,28 +304,26 @@ PDFTokenizer::PDFRet PDFTokenizer::h_dict_name() return PDFRet::EOS; } -PDFTokenizer::PDFRet PDFTokenizer::h_lit_unescape() +constexpr char literal_unescape(const char& input) { - assert(yyleng == 2); - assert(yytext[0] == '\\'); - - char c; - // 7.3.4.2 Literal Strings, Table 3 Escape sequences in literal strings - switch (yytext[1]) + switch (input) { - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - case 'b': c = '\b'; break; - case 'f': c = '\f'; break; - case '(': c = '('; break; - case ')': c = ')'; break; - case '\\': c = '\\'; break; - default: c = yytext[1]; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'b': return '\b'; + case 'f': return '\f'; + default: return input; } +} + +PDFTokenizer::PDFRet PDFTokenizer::h_lit_unescape() +{ + assert(yyleng == 2); + assert(yytext[0] == '\\'); - yyout << c; + yyout << literal_unescape(yytext[1]); return PDFRet::EOS; } @@ -344,6 +370,63 @@ PDFTokenizer::PDFRet PDFTokenizer::h_hex_hex2chr() return PDFRet::EOS; } +PDFTokenizer::PDFRet PDFTokenizer::h_hex_hex2chr_u16() +{ + int len = yyleng & ~1; + const char* ptr = yytext; + const char* end = yytext + len; + + while (ptr < end) + { + unsigned v; + sscanf(ptr, "%02x", &v); + EXEC(u16_eval((uint8_t)v)) + ptr += 2; + } + + if (len != yyleng) + { + unsigned v; + sscanf(ptr, "%01x", &v); + EXEC(u16_eval((uint8_t)(v << 4))) + } + + debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, + "literal string, in hex (UTF-16BE): %s\n", yytext); + + return PDFRet::EOS; +} + +PDFTokenizer::PDFRet PDFTokenizer::h_lit_u16() +{ + const uint8_t* ptr = (uint8_t*)yytext; + const uint8_t* end = ptr + yyleng; + + while (ptr < end) + { + EXEC(u16_eval(*ptr)) + ++ptr; + } + + debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, + "string, in UTF-16BE: %s\n", yytext); + + return PDFRet::EOS; +} + +PDFTokenizer::PDFRet PDFTokenizer::h_lit_u16_unescape() +{ + assert(yyleng == 2); + + // the reverse solidus behaves as a split point in this case and should be removed + EXEC(u16_eval(literal_unescape(yytext[1]))) + + debug_logf(6, js_trace, TRACE_PDF_DUMP, nullptr, + "string, in UTF-16BE, escaped: %s\n", yytext); + + return PDFRet::EOS; +} + PDFTokenizer::PDFRet PDFTokenizer::h_stream_open() { if (obj_stream.rem_length < 0) @@ -395,6 +478,136 @@ void PDFTokenizer::h_ind_obj_open() obj_stream.is_js = true; } +void PDFTokenizer::h_u16_start() +{ + POP(); + + switch (YY_START) + { + case jslstr: + POP(); + PUSH(jsstru16); + break; + case indobj: + POP(); + PUSH(jsstreamu16); + break; + default: + assert(false); + } +} + +void PDFTokenizer::h_u16_break() +{ + POP(); + yyless(0); + + switch (YY_START) + { + case indobj: + PUSH(jsstream); + break; + case jslstr: + break; + default: + assert(false); + } +} + +void PDFTokenizer::h_u16_hex_start() +{ + POP(); + + assert(YY_START == jshstr); + POP(); + PUSH(jshstru16); +} + +void PDFTokenizer::h_u16_hex_break() +{ + POP(); + yyless(0); + assert(YY_START == jshstr); +} + +/* RFC 2781: 2.1 Encoding UTF-16 2.2, Decoding UTF-16, 4.3 Interpreting text labelled as UTF-16 */ +PDFTokenizer::PDFRet PDFTokenizer::u16_eval(uint8_t byte) +{ + switch(u16_state.cur_byte) + { + case 0: + u16_state.high = byte; + u16_state.cur_byte = 1; + + break; + case 1: + { + u16_state.high = (u16_state.high << 8) | byte; + if (u16_state.high < 0xd800) + { + u16_to_u8(u16_state.high); + u16_state.cur_byte = 0; + } + else + { + u16_state.high = (u16_state.high - 0xd800) * 0x400; + u16_state.cur_byte = 2; + } + + break; + } + case 2: + u16_state.low = byte; + u16_state.cur_byte = 3; + + break; + case 3: + u16_state.low = (u16_state.low << 8) | byte; + u16_state.cur_byte = 0; + + if (u16_state.low < 0xdc00) + return PDFRet::UNEXPECTED_SYMBOL; + + u16_state.low = u16_state.low - 0xdc00; + u16_to_u8((u16_state.high | u16_state.low) + 0x10000); + + break; + default: + assert(false); + } + + return PDFRet::EOS; +} + +void PDFTokenizer::u16_to_u8(uint32_t code) +{ + assert(code <= 0x1fffff); + std::string out; + + if (code <= 0x7f) + out = (char)code; + else if (code <= 0x7ff) + { + out += (char)(0xc0 | (code >> 6)); + out += (char)(0x80 | (code & 0x3f)); + } + else if (code <= 0xffff) + { + out += (char)(0xe0 | (code >> 12)); + out += (char)(0x80 | ((code >> 6) & 0x3f)); + out += (char)(0x80 | (code & 0x3f)); + } + else if (code <= 0x1fffff) + { + out += (char)(0xf0 | (code >> 18)); + out += (char)(0x80 | ((code >> 12) & 0x3f)); + out += (char)(0x80 | ((code >> 6) & 0x3f)); + out += (char)(0x80 | (code & 0x3f)); + } + + yyout << out; +} + PDFTokenizer::PDFTokenizer(std::istream& in, std::ostream& out) : yyFlexLexer(in, out) { diff --git a/src/js_norm/test/pdf_tokenizer_test.cc b/src/js_norm/test/pdf_tokenizer_test.cc index db1801336..db7929a5f 100644 --- a/src/js_norm/test/pdf_tokenizer_test.cc +++ b/src/js_norm/test/pdf_tokenizer_test.cc @@ -31,6 +31,7 @@ using namespace jsn; using namespace std; +using namespace std::string_literals; typedef pair Chunk; @@ -171,14 +172,27 @@ TEST_CASE("basic", "[PDFTokenizer]") SECTION("escapes in string") { test_pdf_proc( - "(() \\n\\r\\t\\b\\f\\(\\)\\\\ \\123 \\A\\B\\C \\x\\y\\z)", + "1 0 obj\n" + "<< /S (() \\n\\r\\t\\b\\f\\(\\)\\\\ \\123 \\A\\B\\C \\x\\y\\z \\\n \\\r\n) >>\n" + "endobj\n", + "" + ); + } + SECTION("EOL in string") + { + test_pdf_proc( + "1 0 obj\n" + "<< /S (\r\n) >>\n" + "endobj\n", "" ); } SECTION("hex string") { test_pdf_proc( - "<000102030405>", + "1 0 obj\n" + "<< /S <0001020304 05> >> \n" + "endobj\n", "" ); } @@ -804,3 +818,297 @@ TEST_CASE("stream object over PDU", "[PDFTokenizer]") }); } } + +TEST_CASE("UTF-16, basic", "[PDFTokenizer]") +{ + SECTION("basic string") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0f\0o\0o)"s, + "foo"s + ); + } + SECTION("non-ASCII character") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\xd8=\xdc=)"s, + "\xf0\x9f\x90\xbd"s + ); + } + SECTION("Latin-1 character") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0\xc6)"s, + "\xc3\x86"s + ); + } + SECTION("mixed charset") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0f\0o\0o\xd8=\xdc=\0\x20\0b\0a\0r)"s, + "foo\xf0\x9f\x90\xbd bar"s + ); + } + SECTION("stream") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS 2 0 R" + ">>\n" + "endobj\n" + "2 0 obj\n" + "<>\n" + "stream\n" + "\xfe\xff\0f\0o\0o\n" + "endstream\n" + "endobj"s, + "foo"s + ); + } + SECTION("hexadecimal string") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS "s, + "foo"s + ); + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS "s, + "foo "s + ); + } + SECTION("escaped slash") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0\\\\\0f\0o\0o)"s, + "\\foo"s + ); + } + SECTION("escaped slash-like byte of a CJK character") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\\\\\0)"s, + "\xe5\xb0\x80"s + ); + } + SECTION("newline: CR") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0f\0o\0o\0\r\0b\0a\0r)"s, + "foo\r" + "bar"s + ); + } + SECTION("newline: LF") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0f\0o\0o\0\n\0b\0a\0r)"s, + "foo\n" + "bar"s + ); + } + SECTION("escaped newline: CR") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0f\0o\0o\0\\r\0b\0a\0r)"s, + "foo\r" + "bar"s + ); + } + SECTION("escaped newline: LF") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0f\0o\0o\0\\n\0b\0a\0r)"s, + "foo\n" + "bar"s + ); + } + SECTION("escaped newline: PDF line wrap") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0f\0o\0o\\\n" + "\0b\0a\0r)"s, + "foobar"s + ); + } + SECTION("slash in stream") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS 2 0 R" + ">>\n" + "endobj\n" + "2 0 obj\n" + "<>\n" + "stream\n" + "\xfe\xff\0\\\0f\0o\0o\n" + "endstream\n" + "endobj"s, + "\\foo"s + ); + } + SECTION("unexpected symbol") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\0\\(\0a\0()"s, + "(a"s, + PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } + SECTION("invalid high surrogate pair") + { + test_pdf_proc( + "1 0 obj\n" + "<<" + "/S /JavaScript" + "/JS (\xfe\xff\xd8=\0=)"s, + ""s, + PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL + ); + } +} + +TEST_CASE("UTF-16, cross-PDU", "[PDFTokenizer]") +{ + SECTION("split between symbols") + { + test_pdf_proc({ + { + "10 0 obj\n" + "<>\n" + "endobj"s, + "bar"s + } + }); + } + SECTION("split inside the symbol between code units") + { + test_pdf_proc({ + { + "10 0 obj\n" + "<>\n" + "endobj"s, + "\xf0\x9f\x90\xbd"s + } + }); + } + SECTION("split inside the code unit") + { + test_pdf_proc({ + { + "10 0 obj\n" + "<>\n" + "endobj"s, + "\xf0\x9f\x90\xbd"s + } + }); + } + SECTION("split inside escaped slash: first byte escaped") + { + test_pdf_proc({ + { + "10 0 obj\n" + "<>\n" + "endobj"s, + "\xe5\xb0\x80"s + } + }); + } + SECTION("split in hexadecimal string") + { + test_pdf_proc({ + { + "10 0 obj\n" + "<>>\n" + "endobj"s, + "foo"s + } + }); + } + SECTION("split in stream") + { + test_pdf_proc({ + { + "1 0 obj\n" + "<>\n" + "endobj\n" + "2 0 obj\n" + "<>\n" + "stream\n" + "\xfe\xff\0f\0o\0o\0"s, + "foo"s + }, + { + "b\0a\0r\n" + "endstream\n" + "endobj"s, + "bar"s + } + }); + } +}