[thirdparty/squid.git] / src / http / one / Parser.cc

/*
 * Copyright (C) 1996-2018 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

#include "squid.h"
#include "Debug.h"
#include "http/one/Parser.h"
#include "http/one/Tokenizer.h"
#include "mime_header.h"
#include "SquidConfig.h"

/// RFC 7230 section 2.6 - 7 magic octets
/// The constant holds the "HTTP/1." prefix only; the minor version digit
/// that follows it in a protocol label is not part of this value.
const SBuf Http::One::Parser::Http1magic("HTTP/1.");

/// \returns a reference to a function-local static SBuf holding
/// the two-octet CR LF line terminator
const SBuf &
Http::One::CrLf()
{
    static const SBuf terminator("\r\n");
    return terminator;
}

/// Forgets everything learned about the current message: the stored mime
/// header block, the detected protocol version, the input buffer, and the
/// parsing stage all go back to their initial values.
void
Http::One::Parser::clear()
{
    // message-derived data
    mimeHeaderBlock_.clear();
    msgProtocol_ = AnyP::ProtocolVersion();

    // input and progress tracking
    buf_ = NULL;
    parsingStage_ = HTTP_PARSE_NONE;
}

/// The delimiter octets that HTTP allows a tolerant parser to accept.
/// \returns a function-local static set named "relaxed-WSP"
static const CharacterSet &
RelaxedDelimiterCharacters()
{
    // RFC 7230 section 3.5:
    // a tolerant parser MAY treat SP, HTAB, VT (%x0B), FF (%x0C),
    // or a bare CR as whitespace between request-line fields
    static const CharacterSet tolerantDelimiters =
        (CharacterSet::SP + CharacterSet::HTAB +
         CharacterSet("VT,FF","\x0B\x0C") + CharacterSet::CR).rename("relaxed-WSP");

    return tolerantDelimiters;
}

/// \returns the relaxed delimiter set when Config.onoff.relaxed_header_parser
/// is enabled (any non-zero value); CharacterSet::WSP otherwise
const CharacterSet &
Http::One::Parser::WhitespaceCharacters()
{
    if (Config.onoff.relaxed_header_parser)
        return RelaxedDelimiterCharacters();

    return CharacterSet::WSP;
}

/// \returns the relaxed delimiter set when Config.onoff.relaxed_header_parser
/// is enabled (any non-zero value); CharacterSet::SP otherwise
const CharacterSet &
Http::One::Parser::DelimiterCharacters()
{
    if (Config.onoff.relaxed_header_parser)
        return RelaxedDelimiterCharacters();

    return CharacterSet::SP;
}

/// Consumes one line terminator at the current tokenizer position.
/// \retval true a CRLF (or, with the relaxed parser enabled, a bare LF) was skipped
/// \retval false the input ends here, or holds only a lone CR; more data is needed
/// \throws a TexcHere-built exception when something other than a terminator is found
bool
Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
{
    // the strict terminator is always acceptable
    if (tok.skip(Http1::CrLf()))
        return true;

    // a bare LF is tolerated by the relaxed parser only
    if (Config.onoff.relaxed_header_parser) {
        if (tok.skipOne(CharacterSet::LF))
            return true;
    }

    // nothing buffered yet
    if (tok.atEnd())
        return false; // need more data

    // a single CR may be the first half of a CRLF that has not fully arrived
    const auto &leftovers = tok.remaining();
    if (leftovers.length() == 1 && leftovers.at(0) == '\r')
        return false; // need more data

    throw TexcHere("garbage instead of CRLF line terminator");
    return false; // unreachable, but make naive compilers happy
}

/// Every octet that may appear inside a line, i.e. anything but LF.
/// \returns a function-local static set named "non-LF"
static const CharacterSet &
LineCharacters()
{
    static const CharacterSet nonLf = CharacterSet::LF.complement("non-LF");
    return nonLf;
}

/**
 * Remove invalid lines (if any) from the mime prefix
 *
 * RFC 7230 section 3:
 * "A recipient that receives whitespace between the start-line and
 * the first header field MUST ... consume each whitespace-preceded
 * line without further processing of it."
 *
 * We need to always use the relaxed delimiters here to prevent
 * line smuggling through strict parsers.
 *
 * Note that 'whitespace' in RFC 7230 includes CR. So that means
 * sequences of CRLF will be pruned, but not sequences of bare-LF.
 */
void
Http::One::Parser::cleanMimePrefix()
{
    Http1::Tokenizer tok(mimeHeaderBlock_);
    while (tok.skipOne(RelaxedDelimiterCharacters())) {
        (void)tok.skipAll(LineCharacters()); // optional line content
        // LF terminator is required.
        // trust headersEnd() to ensure that we have at least one LF
        (void)tok.skipOne(CharacterSet::LF);
    }

    // If mimeHeaderBlock_ had just whitespace line(s) followed by CRLF,
    // then we skipped everything, including that terminating LF.
    // Restore the terminating CRLF if needed.
    if (tok.atEnd())
        mimeHeaderBlock_ = Http1::CrLf();
    else
        mimeHeaderBlock_ = tok.remaining();
    // now mimeHeaderBlock_ has 0+ fields followed by the LF terminator
}

/**
 * Replace obs-fold with a single SP,
 *
 * RFC 7230 section 3.2.4
 * "A server that receives an obs-fold in a request message that is not
 *  within a message/http container MUST ... replace
 *  each received obs-fold with one or more SP octets prior to
 *  interpreting the field value or forwarding the message downstream."
 *
 * "A proxy or gateway that receives an obs-fold in a response message
 *  that is not within a message/http container MUST ... replace each
 *  received obs-fold with one or more SP octets prior to interpreting
 *  the field value or forwarding the message downstream."
 */
void
Http::One::Parser::unfoldMime()
{
    Http1::Tokenizer tok(mimeHeaderBlock_);
    const auto szLimit = mimeHeaderBlock_.length();
    mimeHeaderBlock_.clear();
    // prevent the mime sender being able to make append() realloc/grow multiple times.
    mimeHeaderBlock_.reserveSpace(szLimit);

    static const CharacterSet nonCRLF = (CharacterSet::CR + CharacterSet::LF).complement().rename("non-CRLF");

    while (!tok.atEnd()) {
        const SBuf all(tok.remaining());
        const auto blobLen = tok.skipAll(nonCRLF); // may not be there
        const auto crLen = tok.skipAll(CharacterSet::CR); // may not be there
        const auto lfLen = tok.skipOne(CharacterSet::LF); // may not be there

        if (lfLen && tok.skipAll(CharacterSet::WSP)) { // obs-fold!
            mimeHeaderBlock_.append(all.substr(0, blobLen));
            mimeHeaderBlock_.append(' '); // replace one obs-fold with one SP
        } else
            mimeHeaderBlock_.append(all.substr(0, blobLen + crLen + lfLen));
    }
}

bool
Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
{
    // MIME headers block exist in (only) HTTP/1.x and ICY
    const bool expectMime = (msgProtocol_.protocol == AnyP::PROTO_HTTP && msgProtocol_.major == 1) ||
                            msgProtocol_.protocol == AnyP::PROTO_ICY ||
                            hackExpectsMime_;

    if (expectMime) {
        /* NOTE: HTTP/0.9 messages do not have a mime header block.
         *       So the rest of the code will need to deal with '0'-byte headers
         *       (ie, none, so don't try parsing em)
         */
        bool containsObsFold;
        if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_, containsObsFold)) {

            // Squid could handle these headers, but admin does not want to
            if (firstLineSize() + mimeHeaderBytes >= limit) {
                debugs(33, 5, "Too large " << which);
                parseStatusCode = Http::scHeaderTooLarge;
                buf_.consume(mimeHeaderBytes);
                parsingStage_ = HTTP_PARSE_DONE;
                return false;
            }

            mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
            cleanMimePrefix();
            if (containsObsFold)
                unfoldMime();

            debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");

        } else { // headersEnd() == 0
            if (buf_.length()+firstLineSize() >= limit) {
                debugs(33, 5, "Too large " << which);
                parseStatusCode = Http::scHeaderTooLarge;
                parsingStage_ = HTTP_PARSE_DONE;
            } else
                debugs(33, 5, "Incomplete " << which << ", waiting for end of headers");
            return false;
        }

    } else
        debugs(33, 3, "Missing HTTP/1.x identifier");

    // NP: we do not do any further stages here yet so go straight to DONE
    parsingStage_ = HTTP_PARSE_DONE;

    return true;
}

// Arbitrary size, in bytes, of the buffer that
// Http::One::Parser::getHeaderField() copies a field value into.
// One byte is kept for the terminating NUL, so longer values are truncated.
#define GET_HDR_SZ  1024

// BUG: returns only the first header line with given name,
//      ignores multi-line headers and obs-fold headers
char *
Http::One::Parser::getHeaderField(const char *name)
{
    if (!headerBlockSize() || !name)
        return NULL;

    LOCAL_ARRAY(char, header, GET_HDR_SZ);
    const int namelen = strlen(name);

    debugs(25, 5, "looking for " << name);

    // while we can find more LF in the SBuf
    Http1::Tokenizer tok(mimeHeaderBlock_);
    SBuf p;

    while (tok.prefix(p, LineCharacters())) {
        if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
            break; // error. reached invalid octet or end of buffer insted of an LF ??

        // header lines must start with the name (case insensitive)
        if (p.substr(0, namelen).caseCmp(name, namelen))
            continue;

        // then a COLON
        if (p[namelen] != ':')
            continue;

        // drop any trailing *CR sequence
        p.trim(Http1::CrLf(), false, true);

        debugs(25, 5, "checking " << p);
        p.consume(namelen + 1);

        // TODO: optimize SBuf::trim to take CharacterSet directly
        Http1::Tokenizer t(p);
        t.skipAll(CharacterSet::WSP);
        p = t.remaining();

        // prevent buffer overrun on char header[];
        p.chop(0, sizeof(header)-1);

        // return the header field-value
        SBufToCstring(header, p);
        debugs(25, 5, "returning " << header);
        return header;
    }

    return NULL;
}

/// \returns the debugging level for reporting tolerated protocol violations:
/// DBG_IMPORTANT when Config.onoff.relaxed_header_parser is negative, 5 otherwise
int
Http::One::ErrorLevel()
{
    if (Config.onoff.relaxed_header_parser < 0)
        return DBG_IMPORTANT;

    return 5;
}

// BWS = *( SP / HTAB ) ; WhitespaceCharacters() may relax this RFC 7230 rule
/// Skips any BWS at the current tokenizer position, logging when some is found.
/// \returns true always: an empty BWS sequence counts as successfully parsed
bool
Http::One::ParseBws(Tokenizer &tok)
{
    const auto bwsOctets = tok.skipAll(Parser::WhitespaceCharacters());

    // Generating BWS is a MUST-level violation so warn about it as needed.
    // RFC 7230 says we MUST parse BWS, so it is accepted even if
    // Config.onoff.relaxed_header_parser is off.
    if (bwsOctets)
        debugs(33, ErrorLevel(), "found " << bwsOctets << " BWS octets");

    return true;
}
Commit	Line	Data
48a37aee	1	/*
5b74111a	2	* Copyright (C) 1996-2018 The Squid Software Foundation and contributors
48a37aee AJ	3	*
	4	* Squid software is distributed under GPLv2+ license and includes
	5	* contributions from numerous individuals and organizations.
	6	* Please see the COPYING and CONTRIBUTORS files for details.
	7	*/
	8
c99510dd AJ	9	#include "squid.h"
	10	#include "Debug.h"
	11	#include "http/one/Parser.h"
f29718b0	12	#include "http/one/Tokenizer.h"
f1d5359e	13	#include "mime_header.h"
b8f86fd2	14	#include "SquidConfig.h"
c99510dd	15
9651320a AJ	16	/// RFC 7230 section 2.6 - 7 magic octets
	17	const SBuf Http::One::Parser::Http1magic("HTTP/1.");
	18
00237269 AJ	19	const SBuf &Http::One::CrLf()
	20	{
	21	static const SBuf crlf("\r\n");
	22	return crlf;
	23	}
	24
c99510dd AJ	25	void
	26	Http::One::Parser::clear()
	27	{
	28	parsingStage_ = HTTP_PARSE_NONE;
b749de75	29	buf_ = NULL;
c99510dd AJ	30	msgProtocol_ = AnyP::ProtocolVersion();
	31	mimeHeaderBlock_.clear();
	32	}
	33
00237269 AJ	34	/// characters HTTP permits tolerant parsers to accept as delimiters
	35	static const CharacterSet &
	36	RelaxedDelimiterCharacters()
	37	{
	38	// RFC 7230 section 3.5
	39	// tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C),
	40	// or bare CR as whitespace between request-line fields
	41	static const CharacterSet RelaxedDels =
	42	(CharacterSet::SP +
	43	CharacterSet::HTAB +
	44	CharacterSet("VT,FF","\x0B\x0C") +
	45	CharacterSet::CR).rename("relaxed-WSP");
	46
	47	return RelaxedDels;
	48	}
	49
26f0a359 AR	50	const CharacterSet &
	51	Http::One::Parser::WhitespaceCharacters()
	52	{
	53	return Config.onoff.relaxed_header_parser ?
	54	RelaxedDelimiterCharacters() : CharacterSet::WSP;
	55	}
	56
00237269 AJ	57	const CharacterSet &
	58	Http::One::Parser::DelimiterCharacters()
	59	{
	60	return Config.onoff.relaxed_header_parser ?
	61	RelaxedDelimiterCharacters() : CharacterSet::SP;
	62	}
	63
f1d5359e	64	bool
f29718b0	65	Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
f1d5359e	66	{
00237269	67	if (tok.skip(Http1::CrLf()))
b8f86fd2 AJ	68	return true;
	69
	70	if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
	71	return true;
	72
188ad27f AJ	73	if (tok.atEnd() \|\| (tok.remaining().length() == 1 && tok.remaining().at(0) == '\r'))
	74	return false; // need more data
	75
	76	throw TexcHere("garbage instead of CRLF line terminator");
	77	return false; // unreachable, but make naive compilers happy
b8f86fd2 AJ	78	}
b8f86fd2 AJ	79
00237269 AJ	80	/// all characters except the LF line terminator
	81	static const CharacterSet &
	82	LineCharacters()
	83	{
	84	static const CharacterSet line = CharacterSet::LF.complement("non-LF");
	85	return line;
	86	}
	87
	88	/**
	89	* Remove invalid lines (if any) from the mime prefix
	90	*
	91	* RFC 7230 section 3:
	92	* "A recipient that receives whitespace between the start-line and
	93	* the first header field MUST ... consume each whitespace-preceded
	94	* line without further processing of it."
	95	*
	96	* We need to always use the relaxed delimiters here to prevent
	97	* line smuggling through strict parsers.
	98	*
	99	* Note that 'whitespace' in RFC 7230 includes CR. So that means
	100	* sequences of CRLF will be pruned, but not sequences of bare-LF.
	101	*/
	102	void
	103	Http::One::Parser::cleanMimePrefix()
	104	{
	105	Http1::Tokenizer tok(mimeHeaderBlock_);
	106	while (tok.skipOne(RelaxedDelimiterCharacters())) {
	107	(void)tok.skipAll(LineCharacters()); // optional line content
	108	// LF terminator is required.
	109	// trust headersEnd() to ensure that we have at least one LF
	110	(void)tok.skipOne(CharacterSet::LF);
	111	}
	112
	113	// If mimeHeaderBlock_ had just whitespace line(s) followed by CRLF,
	114	// then we skipped everything, including that terminating LF.
	115	// Restore the terminating CRLF if needed.
	116	if (tok.atEnd())
	117	mimeHeaderBlock_ = Http1::CrLf();
	118	else
	119	mimeHeaderBlock_ = tok.remaining();
	120	// now mimeHeaderBlock_ has 0+ fields followed by the LF terminator
	121	}
	122
	123	/**
	124	* Replace obs-fold with a single SP,
	125	*
	126	* RFC 7230 section 3.2.4
	127	* "A server that receives an obs-fold in a request message that is not
	128	* within a message/http container MUST ... replace
	129	* each received obs-fold with one or more SP octets prior to
	130	* interpreting the field value or forwarding the message downstream."
	131	*
	132	* "A proxy or gateway that receives an obs-fold in a response message
	133	* that is not within a message/http container MUST ... replace each
	134	* received obs-fold with one or more SP octets prior to interpreting
	135	* the field value or forwarding the message downstream."
	136	*/
	137	void
	138	Http::One::Parser::unfoldMime()
	139	{
	140	Http1::Tokenizer tok(mimeHeaderBlock_);
	141	const auto szLimit = mimeHeaderBlock_.length();
	142	mimeHeaderBlock_.clear();
	143	// prevent the mime sender being able to make append() realloc/grow multiple times.
144	mimeHeaderBlock_.reserveSpace(szLimit);
145
146	static const CharacterSet nonCRLF = (CharacterSet::CR + CharacterSet::LF).complement().rename("non-CRLF");
147
148	while (!tok.atEnd()) {
149	const SBuf all(tok.remaining());
150	const auto blobLen = tok.skipAll(nonCRLF); // may not be there
151	const auto crLen = tok.skipAll(CharacterSet::CR); // may not be there
152	const auto lfLen = tok.skipOne(CharacterSet::LF); // may not be there
153
154	if (lfLen && tok.skipAll(CharacterSet::WSP)) { // obs-fold!
155	mimeHeaderBlock_.append(all.substr(0, blobLen));
156	mimeHeaderBlock_.append(' '); // replace one obs-fold with one SP
157	} else
158	mimeHeaderBlock_.append(all.substr(0, blobLen + crLen + lfLen));
159	}
160	}
161
b8f86fd2	162	bool
f8cab755	163	Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
b8f86fd2 AJ	164	{
	165	// MIME headers block exist in (only) HTTP/1.x and ICY
	166	const bool expectMime = (msgProtocol_.protocol == AnyP::PROTO_HTTP && msgProtocol_.major == 1) \|\|
e47e0802 AJ	167	msgProtocol_.protocol == AnyP::PROTO_ICY \|\|
e47e0802 AJ	168	hackExpectsMime_;
b8f86fd2 AJ	169
b8f86fd2 AJ	170	if (expectMime) {
f1d5359e AJ	171	/* NOTE: HTTP/0.9 messages do not have a mime header block.
	172	* So the rest of the code will need to deal with '0'-byte headers
	173	* (ie, none, so don't try parsing em)
	174	*/
00237269 AJ	175	bool containsObsFold;
00237269 AJ	176	if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_, containsObsFold)) {
f8cab755 AJ	177
	178	// Squid could handle these headers, but admin does not want to
	179	if (firstLineSize() + mimeHeaderBytes >= limit) {
	180	debugs(33, 5, "Too large " << which);
	181	parseStatusCode = Http::scHeaderTooLarge;
	182	buf_.consume(mimeHeaderBytes);
	183	parsingStage_ = HTTP_PARSE_DONE;
	184	return false;
	185	}
	186
	187	mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
00237269 AJ	188	cleanMimePrefix();
	189	if (containsObsFold)
	190	unfoldMime();
	191
f8cab755 AJ	192	debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");
	193
	194	} else { // headersEnd() == 0
f1d5359e AJ	195	if (buf_.length()+firstLineSize() >= limit) {
	196	debugs(33, 5, "Too large " << which);
	197	parseStatusCode = Http::scHeaderTooLarge;
	198	parsingStage_ = HTTP_PARSE_DONE;
	199	} else
	200	debugs(33, 5, "Incomplete " << which << ", waiting for end of headers");
	201	return false;
	202	}
b8f86fd2	203
f1d5359e AJ	204	} else
	205	debugs(33, 3, "Missing HTTP/1.x identifier");
	206
	207	// NP: we do not do any further stages here yet so go straight to DONE
	208	parsingStage_ = HTTP_PARSE_DONE;
	209
f1d5359e AJ	210	return true;
	211	}
	212
c99510dd	213	// arbitrary maximum-length for headers which can be found by Http1Parser::getHeaderField()
f53969cc	214	#define GET_HDR_SZ 1024
c99510dd	215
687696c1 AJ	216	// BUG: returns only the first header line with given name,
687696c1 AJ	217	// ignores multi-line headers and obs-fold headers
c99510dd AJ	218	char *
	219	Http::One::Parser::getHeaderField(const char *name)
	220	{
c99510dd AJ	221	if (!headerBlockSize() \|\| !name)
	222	return NULL;
	223
687696c1	224	LOCAL_ARRAY(char, header, GET_HDR_SZ);
1296170f	225	const int namelen = strlen(name);
687696c1	226
f6c7fa03	227	debugs(25, 5, "looking for " << name);
c99510dd	228
f6c7fa03	229	// while we can find more LF in the SBuf
f29718b0	230	Http1::Tokenizer tok(mimeHeaderBlock_);
687696c1	231	SBuf p;
c99510dd	232
00237269	233	while (tok.prefix(p, LineCharacters())) {
2d40b13f AJ	234	if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
2d40b13f AJ	235	break; // error. reached invalid octet or end of buffer insted of an LF ??
c99510dd	236
687696c1 AJ	237	// header lines must start with the name (case insensitive)
687696c1 AJ	238	if (p.substr(0, namelen).caseCmp(name, namelen))
c99510dd AJ	239	continue;
c99510dd AJ	240
687696c1 AJ	241	// then a COLON
687696c1 AJ	242	if (p[namelen] != ':')
c99510dd AJ	243	continue;
c99510dd AJ	244
687696c1	245	// drop any trailing *CR sequence
00237269	246	p.trim(Http1::CrLf(), false, true);
c99510dd	247
687696c1 AJ	248	debugs(25, 5, "checking " << p);
687696c1 AJ	249	p.consume(namelen + 1);
c99510dd	250
687696c1	251	// TODO: optimize SBuf::trim to take CharacterSet directly
f29718b0	252	Http1::Tokenizer t(p);
9bafa70d	253	t.skipAll(CharacterSet::WSP);
687696c1	254	p = t.remaining();
c99510dd	255
687696c1 AJ	256	// prevent buffer overrun on char header[];
687696c1 AJ	257	p.chop(0, sizeof(header)-1);
c99510dd	258
687696c1	259	// return the header field-value
3f0e38d6	260	SBufToCstring(header, p);
f6c7fa03	261	debugs(25, 5, "returning " << header);
687696c1	262	return header;
c99510dd AJ	263	}
	264
	265	return NULL;
	266	}
f53969cc	267
9a4b5048	268	int
26f0a359	269	Http::One::ErrorLevel()
9a4b5048 AJ	270	{
	271	return Config.onoff.relaxed_header_parser < 0 ? DBG_IMPORTANT : 5;
	272	}
2c4e5226	273
26f0a359 AR	274	// BWS = *( SP / HTAB ) ; WhitespaceCharacters() may relax this RFC 7230 rule
	275	bool
	276	Http::One::ParseBws(Tokenizer &tok)
	277	{
	278	if (const auto count = tok.skipAll(Parser::WhitespaceCharacters())) {
	279	// Generating BWS is a MUST-level violation so warn about it as needed.
	280	debugs(33, ErrorLevel(), "found " << count << " BWS octets");
	281	// RFC 7230 says we MUST parse BWS, so we fall through even if
	282	// Config.onoff.relaxed_header_parser is off.
	283	}
	284	// else we successfully "parsed" an empty BWS sequence
	285
	286	return true;
	287	}
cae5602c	288