[thirdparty/squid.git] / src / http / one / Parser.cc

/*
 * Copyright (C) 1996-2017 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

#include "squid.h"
#include "Debug.h"
#include "http/one/Parser.h"
#include "http/one/Tokenizer.h"
#include "mime_header.h"
#include "SquidConfig.h"

/// RFC 7230 section 2.6 - 7 magic octets
/// (the literal text "HTTP/1." stored as an SBuf)
const SBuf Http::One::Parser::Http1magic("HTTP/1.");

/// \returns the two-octet CRLF sequence, built once on first use
const SBuf &
Http::One::CrLf()
{
    static const SBuf lineTerminator("\r\n");
    return lineTerminator;
}

/// Forgets everything learned so far: the stored mime header block, the
/// detected protocol version, the input buffer and the parsing stage.
void
Http::One::Parser::clear()
{
    mimeHeaderBlock_.clear();
    msgProtocol_ = AnyP::ProtocolVersion();
    buf_ = NULL;
    parsingStage_ = HTTP_PARSE_NONE;
}

/// the delimiter octets a tolerant HTTP parser is permitted to accept
static const CharacterSet &
RelaxedDelimiterCharacters()
{
    // RFC 7230 section 3.5: between request-line fields a tolerant parser
    // MAY treat SP, HTAB, VT (%x0B), FF (%x0C) or a bare CR as whitespace.
    static const CharacterSet tolerated =
        (CharacterSet::SP + CharacterSet::HTAB + CharacterSet("VT,FF","\x0B\x0C") + CharacterSet::CR)
        .rename("relaxed-WSP");

    return tolerated;
}

/// The characters separating HTTP fields: the relaxed set when
/// relaxed_header_parser is enabled, a lone SP otherwise.
const CharacterSet &
Http::One::Parser::DelimiterCharacters()
{
    if (Config.onoff.relaxed_header_parser)
        return RelaxedDelimiterCharacters();

    return CharacterSet::SP;
}

/**
 * Skips one line terminator at the tokenizer's current position.
 *
 * \retval true  CRLF was skipped, or a bare LF was skipped while
 *               relaxed_header_parser is enabled
 * \retval false the input ended, or ends with a single CR that may still
 *               turn out to be the first half of a CRLF; more data is needed
 * \throws the exception built by TexcHere() when any other input is found
 */
bool
Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
{
    // the strict form is tried first; bare LF is accepted only in relaxed mode
    if (tok.skip(Http1::CrLf()) ||
            (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF)))
        return true;

    if (tok.atEnd())
        return false; // need more data

    const auto &leftovers = tok.remaining();
    const bool loneCr = leftovers.length() == 1 && leftovers.at(0) == '\r';
    if (loneCr)
        return false; // need more data

    throw TexcHere("garbage instead of CRLF line terminator");
    return false; // unreachable, but make naive compilers happy
}

/// every octet that may appear inside a line, i.e. anything but LF
static const CharacterSet &
LineCharacters()
{
    static const CharacterSet anythingButLf = CharacterSet::LF.complement("non-LF");
    return anythingButLf;
}

/**
 * Remove invalid lines (if any) from the mime prefix
 *
 * RFC 7230 section 3:
 * "A recipient that receives whitespace between the start-line and
 * the first header field MUST ... consume each whitespace-preceded
 * line without further processing of it."
 *
 * We need to always use the relaxed delimiters here to prevent
 * line smuggling through strict parsers.
 *
 * Note that 'whitespace' in RFC 7230 includes CR. So that means
 * sequences of CRLF will be pruned, but not sequences of bare-LF.
 */
void
Http::One::Parser::cleanMimePrefix()
{
    Http1::Tokenizer tok(mimeHeaderBlock_);
    while (tok.skipOne(RelaxedDelimiterCharacters())) {
        (void)tok.skipAll(LineCharacters()); // optional line content
        // LF terminator is required.
        // trust headersEnd() to ensure that we have at least one LF
        (void)tok.skipOne(CharacterSet::LF);
    }

    // If mimeHeaderBlock_ had just whitespace line(s) followed by CRLF,
    // then we skipped everything, including that terminating LF.
    // Restore the terminating CRLF if needed.
    if (tok.atEnd())
        mimeHeaderBlock_ = Http1::CrLf();
    else
        mimeHeaderBlock_ = tok.remaining();
    // now mimeHeaderBlock_ has 0+ fields followed by the LF terminator
}

/**
 * Replace obs-fold with a single SP,
 *
 * RFC 7230 section 3.2.4
 * "A server that receives an obs-fold in a request message that is not
 *  within a message/http container MUST ... replace
 *  each received obs-fold with one or more SP octets prior to
 *  interpreting the field value or forwarding the message downstream."
 *
 * "A proxy or gateway that receives an obs-fold in a response message
 *  that is not within a message/http container MUST ... replace each
 *  received obs-fold with one or more SP octets prior to interpreting
 *  the field value or forwarding the message downstream."
 */
void
Http::One::Parser::unfoldMime()
{
    Http1::Tokenizer tok(mimeHeaderBlock_);
    const auto szLimit = mimeHeaderBlock_.length();
    mimeHeaderBlock_.clear();
    // prevent the mime sender being able to make append() realloc/grow multiple times.
    mimeHeaderBlock_.reserveSpace(szLimit);

    static const CharacterSet nonCRLF = (CharacterSet::CR + CharacterSet::LF).complement().rename("non-CRLF");

    while (!tok.atEnd()) {
        const SBuf all(tok.remaining());
        const auto blobLen = tok.skipAll(nonCRLF); // may not be there
        const auto crLen = tok.skipAll(CharacterSet::CR); // may not be there
        const auto lfLen = tok.skipOne(CharacterSet::LF); // may not be there

        if (lfLen && tok.skipAll(CharacterSet::WSP)) { // obs-fold!
            mimeHeaderBlock_.append(all.substr(0, blobLen));
            mimeHeaderBlock_.append(' '); // replace one obs-fold with one SP
        } else
            mimeHeaderBlock_.append(all.substr(0, blobLen + crLen + lfLen));
    }
}

/**
 * Move the mime header block from the front of buf_ into mimeHeaderBlock_.
 *
 * A block is only looked for when the message protocol is HTTP/1.x or ICY,
 * or when hackExpectsMime_ is set. A grabbed block has whitespace-led
 * prefix lines removed and, if headersEnd() reported any, obs-folds replaced.
 *
 * \param which  label for the message kind; used in debugging output only
 * \param limit  size which the first line plus the header bytes must stay below
 *
 * \retval true  the block was grabbed, or no block is expected;
 *               parsingStage_ is HTTP_PARSE_DONE
 * \retval false the end of headers has not been received yet, or the
 *               headers reached the limit. In the latter case
 *               parseStatusCode is scHeaderTooLarge and parsingStage_ is
 *               HTTP_PARSE_DONE.
 */
bool
Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
{
    // MIME headers block exist in (only) HTTP/1.x and ICY
    const bool expectMime = (msgProtocol_.protocol == AnyP::PROTO_HTTP && msgProtocol_.major == 1) ||
                            msgProtocol_.protocol == AnyP::PROTO_ICY ||
                            hackExpectsMime_;

    if (expectMime) {
        /* NOTE: HTTP/0.9 messages do not have a mime header block.
         *       So the rest of the code will need to deal with '0'-byte headers
         *       (ie, none, so don't try parsing em)
         */
        // Initialized so the later read is well-defined even if headersEnd()
        // were to return without assigning its out-parameter.
        bool containsObsFold = false;
        if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_, containsObsFold)) {

            // Squid could handle these headers, but admin does not want to
            if (firstLineSize() + mimeHeaderBytes >= limit) {
                debugs(33, 5, "Too large " << which);
                parseStatusCode = Http::scHeaderTooLarge;
                // the oversized block is still removed from the input buffer
                buf_.consume(mimeHeaderBytes);
                parsingStage_ = HTTP_PARSE_DONE;
                return false;
            }

            mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
            cleanMimePrefix();
            if (containsObsFold)
                unfoldMime();

            debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");

        } else { // headersEnd() == 0
            // no end of headers yet; give up if what we have is already too big
            if (buf_.length()+firstLineSize() >= limit) {
                debugs(33, 5, "Too large " << which);
                parseStatusCode = Http::scHeaderTooLarge;
                parsingStage_ = HTTP_PARSE_DONE;
            } else
                debugs(33, 5, "Incomplete " << which << ", waiting for end of headers");
            return false;
        }

    } else
        debugs(33, 3, "Missing HTTP/1.x identifier");

    // NP: we do not do any further stages here yet so go straight to DONE
    parsingStage_ = HTTP_PARSE_DONE;

    return true;
}

// Arbitrary size of the buffer that Http::One::Parser::getHeaderField()
// returns its result in. Field values are cut to at most GET_HDR_SZ-1
// octets before being copied into that buffer.
#define GET_HDR_SZ  1024

// BUG: returns only the first header line with given name,
//      ignores multi-line headers and obs-fold headers
char *
Http::One::Parser::getHeaderField(const char *name)
{
    if (!headerBlockSize() || !name)
        return NULL;

    LOCAL_ARRAY(char, header, GET_HDR_SZ);
    const int namelen = strlen(name);

    debugs(25, 5, "looking for " << name);

    // while we can find more LF in the SBuf
    Http1::Tokenizer tok(mimeHeaderBlock_);
    SBuf p;

    while (tok.prefix(p, LineCharacters())) {
        if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
            break; // error. reached invalid octet or end of buffer insted of an LF ??

        // header lines must start with the name (case insensitive)
        if (p.substr(0, namelen).caseCmp(name, namelen))
            continue;

        // then a COLON
        if (p[namelen] != ':')
            continue;

        // drop any trailing *CR sequence
        p.trim(Http1::CrLf(), false, true);

        debugs(25, 5, "checking " << p);
        p.consume(namelen + 1);

        // TODO: optimize SBuf::trim to take CharacterSet directly
        Http1::Tokenizer t(p);
        t.skipAll(CharacterSet::WSP);
        p = t.remaining();

        // prevent buffer overrun on char header[];
        p.chop(0, sizeof(header)-1);

        // return the header field-value
        SBufToCstring(header, p);
        debugs(25, 5, "returning " << header);
        return header;
    }

    return NULL;
}

#if USE_HTTP_VIOLATIONS
/// \returns DBG_IMPORTANT when relaxed_header_parser is configured with a
/// negative value, and 5 otherwise
int
Http::One::Parser::violationLevel() const
{
    if (Config.onoff.relaxed_header_parser < 0)
        return DBG_IMPORTANT;

    return 5;
}
#endif
Commit	Line	Data
	1	/*
	2	* Copyright (C) 1996-2017 The Squid Software Foundation and contributors
	3	*
	4	* Squid software is distributed under GPLv2+ license and includes
	5	* contributions from numerous individuals and organizations.
	6	* Please see the COPYING and CONTRIBUTORS files for details.
	7	*/
	8
	9	#include "squid.h"
	10	#include "Debug.h"
	11	#include "http/one/Parser.h"
	12	#include "http/one/Tokenizer.h"
	13	#include "mime_header.h"
	14	#include "SquidConfig.h"
	15
	16	/// RFC 7230 section 2.6 - 7 magic octets
	17	const SBuf Http::One::Parser::Http1magic("HTTP/1.");
	18
	19	const SBuf &Http::One::CrLf()
	20	{
	21	static const SBuf crlf("\r\n");
	22	return crlf;
	23	}
	24
	25	void
	26	Http::One::Parser::clear()
	27	{
	28	parsingStage_ = HTTP_PARSE_NONE;
	29	buf_ = NULL;
	30	msgProtocol_ = AnyP::ProtocolVersion();
	31	mimeHeaderBlock_.clear();
	32	}
	33
	34	/// characters HTTP permits tolerant parsers to accept as delimiters
	35	static const CharacterSet &
	36	RelaxedDelimiterCharacters()
	37	{
	38	// RFC 7230 section 3.5
	39	// tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C),
	40	// or bare CR as whitespace between request-line fields
	41	static const CharacterSet RelaxedDels =
	42	(CharacterSet::SP +
	43	CharacterSet::HTAB +
	44	CharacterSet("VT,FF","\x0B\x0C") +
	45	CharacterSet::CR).rename("relaxed-WSP");
	46
	47	return RelaxedDels;
	48	}
	49
	50	/// characters used to separate HTTP fields
	51	const CharacterSet &
	52	Http::One::Parser::DelimiterCharacters()
	53	{
	54	return Config.onoff.relaxed_header_parser ?
	55	RelaxedDelimiterCharacters() : CharacterSet::SP;
	56	}
	57
	58	bool
	59	Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
	60	{
	61	if (tok.skip(Http1::CrLf()))
	62	return true;
	63
	64	if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
	65	return true;
	66
	67	if (tok.atEnd() \|\| (tok.remaining().length() == 1 && tok.remaining().at(0) == '\r'))
	68	return false; // need more data
	69
	70	throw TexcHere("garbage instead of CRLF line terminator");
	71	return false; // unreachable, but make naive compilers happy
	72	}
	73
	74	/// all characters except the LF line terminator
	75	static const CharacterSet &
	76	LineCharacters()
	77	{
	78	static const CharacterSet line = CharacterSet::LF.complement("non-LF");
	79	return line;
	80	}
	81
	82	/**
	83	* Remove invalid lines (if any) from the mime prefix
	84	*
	85	* RFC 7230 section 3:
	86	* "A recipient that receives whitespace between the start-line and
	87	* the first header field MUST ... consume each whitespace-preceded
	88	* line without further processing of it."
	89	*
	90	* We need to always use the relaxed delimiters here to prevent
	91	* line smuggling through strict parsers.
	92	*
	93	* Note that 'whitespace' in RFC 7230 includes CR. So that means
	94	* sequences of CRLF will be pruned, but not sequences of bare-LF.
	95	*/
	96	void
	97	Http::One::Parser::cleanMimePrefix()
	98	{
	99	Http1::Tokenizer tok(mimeHeaderBlock_);
	100	while (tok.skipOne(RelaxedDelimiterCharacters())) {
	101	(void)tok.skipAll(LineCharacters()); // optional line content
	102	// LF terminator is required.
	103	// trust headersEnd() to ensure that we have at least one LF
	104	(void)tok.skipOne(CharacterSet::LF);
	105	}
	106
	107	// If mimeHeaderBlock_ had just whitespace line(s) followed by CRLF,
	108	// then we skipped everything, including that terminating LF.
	109	// Restore the terminating CRLF if needed.
	110	if (tok.atEnd())
	111	mimeHeaderBlock_ = Http1::CrLf();
	112	else
	113	mimeHeaderBlock_ = tok.remaining();
	114	// now mimeHeaderBlock_ has 0+ fields followed by the LF terminator
	115	}
	116
	117	/**
	118	* Replace obs-fold with a single SP,
	119	*
	120	* RFC 7230 section 3.2.4
	121	* "A server that receives an obs-fold in a request message that is not
	122	* within a message/http container MUST ... replace
	123	* each received obs-fold with one or more SP octets prior to
	124	* interpreting the field value or forwarding the message downstream."
	125	*
	126	* "A proxy or gateway that receives an obs-fold in a response message
	127	* that is not within a message/http container MUST ... replace each
	128	* received obs-fold with one or more SP octets prior to interpreting
	129	* the field value or forwarding the message downstream."
	130	*/
	131	void
	132	Http::One::Parser::unfoldMime()
	133	{
	134	Http1::Tokenizer tok(mimeHeaderBlock_);
	135	const auto szLimit = mimeHeaderBlock_.length();
	136	mimeHeaderBlock_.clear();
	137	// prevent the mime sender being able to make append() realloc/grow multiple times.
	138	mimeHeaderBlock_.reserveSpace(szLimit);
	139
	140	static const CharacterSet nonCRLF = (CharacterSet::CR + CharacterSet::LF).complement().rename("non-CRLF");
	141
	142	while (!tok.atEnd()) {
	143	const SBuf all(tok.remaining());
	144	const auto blobLen = tok.skipAll(nonCRLF); // may not be there
	145	const auto crLen = tok.skipAll(CharacterSet::CR); // may not be there
	146	const auto lfLen = tok.skipOne(CharacterSet::LF); // may not be there
	147
	148	if (lfLen && tok.skipAll(CharacterSet::WSP)) { // obs-fold!
	149	mimeHeaderBlock_.append(all.substr(0, blobLen));
	150	mimeHeaderBlock_.append(' '); // replace one obs-fold with one SP
	151	} else
	152	mimeHeaderBlock_.append(all.substr(0, blobLen + crLen + lfLen));
	153	}
	154	}
	155
	156	bool
	157	Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
	158	{
	159	// MIME headers block exist in (only) HTTP/1.x and ICY
	160	const bool expectMime = (msgProtocol_.protocol == AnyP::PROTO_HTTP && msgProtocol_.major == 1) \|\|
	161	msgProtocol_.protocol == AnyP::PROTO_ICY \|\|
	162	hackExpectsMime_;
	163
	164	if (expectMime) {
	165	/* NOTE: HTTP/0.9 messages do not have a mime header block.
	166	* So the rest of the code will need to deal with '0'-byte headers
	167	* (ie, none, so don't try parsing em)
	168	*/
	169	bool containsObsFold;
	170	if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_, containsObsFold)) {
	171
	172	// Squid could handle these headers, but admin does not want to
	173	if (firstLineSize() + mimeHeaderBytes >= limit) {
	174	debugs(33, 5, "Too large " << which);
	175	parseStatusCode = Http::scHeaderTooLarge;
	176	buf_.consume(mimeHeaderBytes);
	177	parsingStage_ = HTTP_PARSE_DONE;
	178	return false;
	179	}
	180
	181	mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
	182	cleanMimePrefix();
	183	if (containsObsFold)
	184	unfoldMime();
	185
	186	debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");
	187
	188	} else { // headersEnd() == 0
	189	if (buf_.length()+firstLineSize() >= limit) {
	190	debugs(33, 5, "Too large " << which);
	191	parseStatusCode = Http::scHeaderTooLarge;
	192	parsingStage_ = HTTP_PARSE_DONE;
	193	} else
	194	debugs(33, 5, "Incomplete " << which << ", waiting for end of headers");
	195	return false;
	196	}
	197
	198	} else
	199	debugs(33, 3, "Missing HTTP/1.x identifier");
	200
	201	// NP: we do not do any further stages here yet so go straight to DONE
	202	parsingStage_ = HTTP_PARSE_DONE;
	203
	204	return true;
	205	}
	206
	207	// arbitrary maximum-length for headers which can be found by Http1Parser::getHeaderField()
	208	#define GET_HDR_SZ 1024
	209
	210	// BUG: returns only the first header line with given name,
	211	// ignores multi-line headers and obs-fold headers
	212	char *
	213	Http::One::Parser::getHeaderField(const char *name)
	214	{
	215	if (!headerBlockSize() \|\| !name)
	216	return NULL;
	217
	218	LOCAL_ARRAY(char, header, GET_HDR_SZ);
	219	const int namelen = strlen(name);
	220
	221	debugs(25, 5, "looking for " << name);
	222
	223	// while we can find more LF in the SBuf
	224	Http1::Tokenizer tok(mimeHeaderBlock_);
	225	SBuf p;
	226
	227	while (tok.prefix(p, LineCharacters())) {
	228	if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
	229	break; // error. reached invalid octet or end of buffer insted of an LF ??
	230
	231	// header lines must start with the name (case insensitive)
	232	if (p.substr(0, namelen).caseCmp(name, namelen))
	233	continue;
	234
	235	// then a COLON
	236	if (p[namelen] != ':')
	237	continue;
	238
	239	// drop any trailing *CR sequence
	240	p.trim(Http1::CrLf(), false, true);
	241
	242	debugs(25, 5, "checking " << p);
	243	p.consume(namelen + 1);
	244
	245	// TODO: optimize SBuf::trim to take CharacterSet directly
	246	Http1::Tokenizer t(p);
	247	t.skipAll(CharacterSet::WSP);
	248	p = t.remaining();
	249
	250	// prevent buffer overrun on char header[];
	251	p.chop(0, sizeof(header)-1);
	252
	253	// return the header field-value
	254	SBufToCstring(header, p);
	255	debugs(25, 5, "returning " << header);
	256	return header;
	257	}
	258
	259	return NULL;
	260	}
	261
	262	#if USE_HTTP_VIOLATIONS
	263	int
	264	Http::One::Parser::violationLevel() const
	265	{
	266	return Config.onoff.relaxed_header_parser < 0 ? DBG_IMPORTANT : 5;
	267	}
	268	#endif
	269