PDFRet h_lit_u16_unescape();
PDFRet h_stream_open(); // checks that a stream may start here (length known or given by reference, no bare reference met before it)
PDFRet h_stream();
+ PDFRet h_array_nesting(); // UNEXPECTED_SYMBOL when the array nesting level is non-zero, EOS otherwise
bool h_stream_close(); // subtracts the matched text length from the remaining stream length
void h_stream_length(); // stores the numeric value of a /Length entry
void h_ref(); // handles a reference value of a /JS or /Length entry
char key[PDFTOKENIZER_NAME_MAX_SIZE] = {0};
};
+ struct IndirectObject // parsing state kept for the indirect object currently being read
+ {
+ void clear() // back to the initial state; invoked when the object closes and when a stream is rejected
+ { ref_met = false; }
+
+ bool ref_met = false; // set once an object reference is matched directly inside the indirect object
+ };
+
struct Stream // state of the stream belonging to the current indirect object
{
int rem_length = -1; // stream length still to be consumed; negative while no numeric /Length has been seen
bool is_js = false; // true when the enclosing object number was recorded from a /JS reference
+ bool is_ref_len = false; // true when /Length was given as a reference, so no numeric length is available
};
ObjectString obj_string;
ObjectDictionary obj_dictionary; // dictionary state; its array_level is compared with the array nesting level on dictionary close
DictionaryEntry obj_entry; // current dictionary entry; its key selects the /JS and /Length handling
Stream obj_stream; // stream state of the current indirect object
+ IndirectObject indirect_obj; // per-object state, cleared when the indirect object closes
std::unordered_set<unsigned int> js_stream_refs; // numbers collected from /JS references; consulted when an indirect object opens
// represents UTF-16BE code point
void PDFTokenizer::h_ind_obj_close() // resets per-object state so nothing leaks into the next indirect object
{
+ indirect_obj.clear(); // forget any reference met inside this object
obj_stream.is_js = false; // the JavaScript mark applies to one object only
+ obj_stream.is_ref_len = false; // so does the "length given by reference" mark
}
}
#include "js_norm/pdf_tokenizer.h"
#include "log/messages.h"
#include "trace/trace_api.h"
+#include "utils/util_cstring.h"
extern THREAD_LOCAL const snort::Trace* js_trace;
INDIRECT_OBJ_CLOSE endobj
-OBJ_REFERENCE {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R
+OBJ_REFERENCE {OBJ_INT_NUM}{GRP_WHITESPACE}+{OBJ_INT_NUM}{GRP_WHITESPACE}+R
/* Not object start, not comments */
-SKIP [^[:digit:]%]{1,16}|.
-WHITESPACE {GRP_WHITESPACE}{1,16}
+SKIP [^[:digit:]%]{1,16}|.
+WHITESPACE {GRP_WHITESPACE}{1,16}
/* Start conditions: structures: comment, indirect object, dictionary or array */
%x comment
<INITIAL>{INDIRECT_OBJ_OPEN} { PUSH(indobj); h_ind_obj_open(); }
<indobj>{WHITESPACE} { }
-<indobj>{INDIRECT_OBJ_CLOSE} { POP(); h_ind_obj_close(); }
+<indobj>{INDIRECT_OBJ_CLOSE} { POP(); h_ind_obj_close(); EXEC(h_array_nesting()) }
+<indobj>{OBJ_ARRAY_OPEN} { ++obj_array.nesting_level; }
+<indobj>{OBJ_ARRAY_CLOSE} { --obj_array.nesting_level; }
+<indobj>{OBJ_REFERENCE} { indirect_obj.ref_met = true; }
+<indobj>{OBJ_BOOLEAN} { }
+<indobj>{OBJ_INT_NUM} { }
+<indobj>{OBJ_REL_NUM} { }
+<indobj>{OBJ_NULL} { }
+<indobj>{OBJ_NAME} { }
<indobj>{OBJ_STREAM_OPEN} { EXEC(h_stream_open()) PUSH(obj_stream.is_js ? u16 : stream); }
<stream>{OBJ_STREAM_SKIP} { EXEC(h_stream()) }
<dictnr>{OBJ_LIT_STR_OPEN} { EXEC(h_dict_other()) if (h_lit_str()) PUSH(jslstr); else PUSH(litstr); yyless(0); }
<dictnr>{OBJ_HEX_STR_OPEN} { EXEC(h_dict_other()) if (h_hex_str()) PUSH(jshstr); else PUSH(hexstr); yyless(0); }
<dictnr>{OBJ_DICT_SKIP} { }
+<dictnr>{INDIRECT_OBJ_CLOSE} { return PDFRet::UNEXPECTED_SYMBOL; }
<indobj>{OBJ_LIT_STR_OPEN} { if (h_lit_open()) PUSH(litstr); }
<litstr>{OBJ_LIT_STR_OPEN} { h_lit_open(); }
PDFTokenizer::PDFRet PDFTokenizer::h_dict_close()
{
- obj_dictionary.clear();
-
debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
"dictionary close, at array level %d\n", obj_array.nesting_level);
- if (obj_dictionary.array_level != obj_array.nesting_level)
+ auto dict_arr_lvl = obj_dictionary.array_level;
+ obj_dictionary.clear();
+
+ if (dict_arr_lvl != obj_array.nesting_level)
return PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY;
return PDFRet::EOS;
return PDFRet::EOS;
}
+// Reports whether array brackets are balanced: a non-zero nesting level
+// yields UNEXPECTED_SYMBOL, a zero level yields EOS.
+PDFTokenizer::PDFRet PDFTokenizer::h_array_nesting()
+{
+ const bool balanced = !obj_array.nesting_level;
+ return balanced ? PDFRet::EOS : PDFRet::UNEXPECTED_SYMBOL;
+}
+
PDFTokenizer::PDFRet PDFTokenizer::h_stream_open()
{
- if (obj_stream.rem_length < 0)
+ if (obj_stream.rem_length < 0 and !obj_stream.is_ref_len)
return PDFRet::STREAM_NO_LENGTH;
+ if (indirect_obj.ref_met)
+ {
+ indirect_obj.clear();
+ return PDFRet::UNEXPECTED_SYMBOL; // indirect streams must have direct dictionaries
+ }
+
debug_logf(6, js_trace, TRACE_PDF_PROC, nullptr,
"Starting %s stream, length %d\n", obj_stream.is_js ? "JavaScript" : "skipping", obj_stream.rem_length);
bool PDFTokenizer::h_stream_close()
{
obj_stream.rem_length -= yyleng;
+
if (obj_stream.rem_length <= 0)
{
if (YY_START == jsstream)
if (YY_START == jsstream)
ECHO;
- return false;
+ return obj_stream.is_ref_len;
}
void PDFTokenizer::h_stream_length() // records the stream length when the current dictionary key is /Length
{
if (!strcmp(obj_entry.key, "/Length")) // any other key is ignored
- obj_stream.rem_length = atoi(yytext);
+ obj_stream.rem_length = snort::SnortStrtol(yytext, nullptr, 10); // matched text parsed as a base-10 number
}
void PDFTokenizer::h_ref() // handles a reference used as the value of the current dictionary entry
{
if (!strcmp(obj_entry.key, "/JS")) // remember the referenced number so that object can later be marked as JavaScript
- js_stream_refs.insert(atoi(yytext));
+ js_stream_refs.insert(snort::SnortStrtoul(yytext, nullptr, 10)); // base-10 parse of the leading number in the matched text (presumably the object number)
+ else if (!strcmp(obj_entry.key, "/Length")) // the length lives in another object
+ {
+ obj_stream.is_ref_len = true; // lets h_stream_open accept a stream without a numeric length
+ obj_stream.rem_length = -1; // no numeric length available
+ }
}
void PDFTokenizer::h_ind_obj_open() // marks the stream state as JavaScript when this object was referenced by a /JS entry
{
- int value = atoi(yytext);
+ unsigned int value = snort::SnortStrtoul(yytext, nullptr, 10); // leading number of the matched text (presumably the object number); unsigned to match js_stream_refs
if (js_stream_refs.count(value) > 0) // number was recorded earlier by h_ref
obj_stream.is_js = true;
}
""
);
}
- SECTION("indirect object")
+ SECTION("indirect dictionary")
{
test_pdf_proc(
"19 0 obj"
""
);
}
+
+ SECTION("indirect array")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "["
+ "null 1 2 3.14 (string) << /SubDict [/Sub /Array] >> true 2 0 R"
+ "]"
+ "endobj",
+ ""
+ );
+ }
+
+ SECTION("indirect imbalanced array")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "["
+ "1 2 3\n"
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+
+ SECTION("indirect number")
+ {
+ test_pdf_proc(
+ "1 0 obj\n"
+ "1\n"
+ "endobj\n"
+ "2 0 obj\n"
+ "3.14\n"
+ "endobj",
+ ""
+ );
+ }
+
+ SECTION("indirect ref")
+ {
+ test_pdf_proc(
+ "1 0 obj\n"
+ "2 0 R\n"
+ "endobj",
+ ""
+ );
+ }
+
+ SECTION("indirect bool")
+ {
+ test_pdf_proc(
+ "1 0 obj\n"
+ "false\n"
+ "endobj\n",
+ ""
+ );
+ }
+
+ SECTION("indirect name")
+ {
+ test_pdf_proc(
+ "1 0 obj\n"
+ "/name\n"
+ "endobj",
+ ""
+ );
+ }
+
+ SECTION("indirect null")
+ {
+ test_pdf_proc(
+ "1 0 obj\n"
+ "null\n"
+ "endobj\n",
+ ""
+ );
+ }
+
SECTION("records")
{
test_pdf_proc(
"", PDFTokenizer::PDFRet::NOT_NAME_IN_DICTIONARY_KEY
);
}
- SECTION("incomplete array")
+ SECTION("token too long")
+ {
+ test_pdf_proc(
+ "1"s + std::string(16 * 1024,' ') + " 0 obj"
+ "<< >>"
+ "endobj"s,
+ "", PDFTokenizer::PDFRet::TOKEN_TOO_LONG
+ );
+ }
+}
+
+TEST_CASE("brackets balancing", "[PDFTokenizer]")
+{
+ SECTION("imbalanced array") // both cases leave the array nesting counter non-zero when endobj is reached
+ {
+ SECTION("missing end") // '[' is never closed
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "[ 0 "
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ SECTION("redundant end") // one ']' more than there are '['
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "[ 0 ]]"
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ }
+ SECTION("imbalanced dictionary")
+ {
+ SECTION("missing end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<< /dict "
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ SECTION("redundant end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<< /dict >> >>"
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ }
+ SECTION("balanced array in array")
{
test_pdf_proc(
"1 0 obj"
- "<<"
- "/K1 [ /V1 /V2 /V3 "
- ">>"
+ "["
+ "[ /nested /array ]"
+ "]"
"endobj",
- "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY
+ ""
);
}
- SECTION("token too long")
+ SECTION("imbalanced array in array")
+ {
+ SECTION("missing end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "["
+ "[ /nested /array "
+ "]"
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ SECTION("redundant end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "["
+ "[ /nested /array ] ]"
+ "]"
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ }
+ SECTION("balanced dictionary in array")
{
test_pdf_proc(
- "1"s + std::string(16 * 1024,' ') + " 0 obj"
- "<< >>"
- "endobj"s,
- "", PDFTokenizer::PDFRet::TOKEN_TOO_LONG
+ "1 0 obj"
+ "["
+ "<< /nested /dict >>"
+ "]"
+ "endobj",
+ ""
+ );
+ }
+ SECTION("imbalanced dictionary in array")
+ {
+ SECTION("missing end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "["
+ "<< /nested /dict "
+ "]"
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ SECTION("redundant end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "["
+ "<< /nested /dict >> >>"
+ "]"
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ }
+ SECTION("balanced array in dictionary")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<< /array [] >>"
+ "endobj",
+ ""
+ );
+ }
+ SECTION("imbalanced array in dictionary")
+ {
+ SECTION("missing end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<<"
+ "/K1 [ /V1 /V2 /V3 "
+ ">>"
+ "endobj",
+ "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY
+ );
+ }
+ SECTION("redundant end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<<"
+ "/K1 [ /V1 /V2 /V3 ]]"
+ ">>"
+ "endobj",
+ "", PDFTokenizer::PDFRet::INCOMPLETE_ARRAY_IN_DICTIONARY
+ );
+ }
+ }
+ SECTION("balanced strings")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "( a string with ( parentheses ) in it )"
+ "endobj",
+ ""
+ );
+ }
+ SECTION("imbalanced strings")
+ {
+ SECTION("missing end")
+ {
+ // NOTE: such syntax doesn't generate an error, because it's possible
+ // to have a string continuation in next PDUs. Same holds true for
+ // hex strings too
+ test_pdf_proc(
+ "1 0 obj"
+ "( a string with ( parentheses in it )"
+ "endobj",
+ ""
+ );
+ }
+ SECTION("redundant end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "( a string with ( parentheses in it )))"
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ }
+ SECTION("balanced hex strings")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<FE FF 00 66 006F 00 6F>"
+ "endobj",
+ ""
);
}
+ SECTION("imbalanced hex strings")
+ {
+ SECTION("missing end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<FE FF 00 66 006F 00 6F "
+ "endobj",
+ ""
+ );
+ }
+ SECTION("redundant end")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<FE FF 00 66 006F 00 6F>>"
+ "endobj",
+ "", PDFTokenizer::PDFRet::UNEXPECTED_SYMBOL
+ );
+ }
+ }
+ SECTION("multiple tokens inter-nesting")
+ {
+ SECTION("array-array-array")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "[ [ [ null ] ] ]"
+ "endobj",
+ ""
+ );
+ }
+ SECTION("array-array-dict")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "[ [ << /key /value >> ] ]"
+ "endobj",
+ ""
+ );
+ }
+ SECTION("dict-dict-array")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<< /key1 << /key2 [ null ] >> >>"
+ "endobj",
+ ""
+ );
+ }
+ SECTION("dict-dict-dict")
+ {
+ test_pdf_proc(
+ "1 0 obj"
+ "<< /key1 << /key2 << /key3 /val3 >> >> >>"
+ "endobj",
+ ""
+ );
+ }
+ }
}
TEST_CASE("JS location", "[PDFTokenizer]")
"bar\n"
);
}
+ SECTION("reference as length") // /Length is a reference, so no numeric length is known when the stream opens
+ {
+ test_pdf_proc(
+ "1 0 obj\n"
+ "<</S /JavaScript /JS 2 0 R>>\n" // makes object 2 a JavaScript stream
+ "endobj\n"
+ "2 0 obj\n"
+ "<<"
+ "/Length 3 0 R" // length stored in object 3, defined after the stream
+ ">>\n"
+ "stream\n"
+ "foo\n"
+ "endstream\n"
+ "endobj\n"
+ "3 0 obj\n"
+ "3\n"
+ "endobj\n",
+ "foo\n", PDFTokenizer::PDFRet::EOS // stream content is still extracted
+ );
+ }
SECTION("special symbols in a stream")
{
test_pdf_proc(