[thirdparty/squid.git] / src / http / one / Parser.cc

/*
 * Copyright (C) 1996-2022 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

#include "squid.h"
#include "base/CharacterSet.h"
#include "debug/Stream.h"
#include "http/one/Parser.h"
#include "mime_header.h"
#include "parser/Tokenizer.h"
#include "SquidConfig.h"

/// RFC 7230 section 2.6 - the 7 magic octets ("HTTP/1.") shared by
/// every HTTP/1.x version label
const SBuf Http::One::Parser::Http1magic("HTTP/1.");

/// \returns a reference to a single shared CRLF ("\r\n") buffer,
/// constructed on first use
const SBuf &
Http::One::CrLf()
{
    static const SBuf terminator("\r\n");
    return terminator;
}

/// Forget everything about the message parsed so far: the buffered input,
/// the extracted mime header block, the detected protocol version, and
/// the parsing stage (which goes back to HTTP_PARSE_NONE).
void
Http::One::Parser::clear()
{
    // drop the data first ...
    buf_ = nullptr;
    mimeHeaderBlock_.clear();

    // ... then the state derived from it
    msgProtocol_ = AnyP::ProtocolVersion();
    parsingStage_ = HTTP_PARSE_NONE;
}

/// The delimiter characters that HTTP allows a tolerant parser to accept.
/// The set is built once, on first use, and labelled "relaxed-WSP".
static const CharacterSet &
RelaxedDelimiterCharacters()
{
    // RFC 7230 section 3.5
    // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C),
    // or bare CR as whitespace between request-line fields
    static const CharacterSet tolerated = [] {
        const CharacterSet verticalSpace("VT,FF","\x0B\x0C");
        auto chars = CharacterSet::SP + CharacterSet::HTAB + verticalSpace + CharacterSet::CR;
        chars.rename("relaxed-WSP");
        return chars;
    }();

    return tolerated;
}

/// \returns the characters treated as whitespace: the relaxed delimiter
/// set when Config.onoff.relaxed_header_parser is enabled, and
/// CharacterSet::WSP otherwise
const CharacterSet &
Http::One::Parser::WhitespaceCharacters()
{
    if (Config.onoff.relaxed_header_parser)
        return RelaxedDelimiterCharacters();

    return CharacterSet::WSP;
}

/// \returns the characters treated as field delimiters: the relaxed
/// delimiter set when Config.onoff.relaxed_header_parser is enabled, and
/// just CharacterSet::SP otherwise
const CharacterSet &
Http::One::Parser::DelimiterCharacters()
{
    if (Config.onoff.relaxed_header_parser)
        return RelaxedDelimiterCharacters();

    return CharacterSet::SP;
}

/// Advance tok past one line terminator: CRLF always, or a bare LF when
/// Config.onoff.relaxed_header_parser is enabled.
/// \throws InsufficientInput if tok is exhausted or holds only a lone CR
///         (the matching LF may still arrive)
/// \throws TextException (via TexcHere) if anything else is found
void
Http::One::Parser::skipLineTerminator(Tokenizer &tok) const
{
    if (tok.skip(Http1::CrLf()))
        return; // the standard terminator

    if (Config.onoff.relaxed_header_parser) {
        if (tok.skipOne(CharacterSet::LF))
            return; // tolerated bare LF
    }

    // nothing to look at yet
    if (tok.atEnd())
        throw InsufficientInput();

    // a single CR might be the first half of a CRLF that is still in transit
    const SBuf leftovers(tok.remaining());
    if (leftovers.length() == 1 && leftovers.at(0) == '\r')
        throw InsufficientInput();

    throw TexcHere("garbage instead of CRLF line terminator");
}

/// Every character except LF (the line terminator); i.e. whatever may
/// appear inside a single line. Built once, on first use.
static const CharacterSet &
LineCharacters()
{
    static const CharacterSet nonLf = CharacterSet::LF.complement("non-LF");
    return nonLf;
}

/**
 * Remove invalid lines (if any) from the mime prefix
 *
 * RFC 7230 section 3:
 * "A recipient that receives whitespace between the start-line and
 * the first header field MUST ... consume each whitespace-preceded
 * line without further processing of it."
 *
 * We need to always use the relaxed delimiters here to prevent
 * line smuggling through strict parsers.
 *
 * Note that 'whitespace' in RFC 7230 includes CR. So that means
 * sequences of CRLF will be pruned, but not sequences of bare-LF.
 */
void
Http::One::Parser::cleanMimePrefix()
{
    Tokenizer tok(mimeHeaderBlock_);
    while (tok.skipOne(RelaxedDelimiterCharacters())) {
        (void)tok.skipAll(LineCharacters()); // optional line content
        // LF terminator is required.
        // trust headersEnd() to ensure that we have at least one LF
        (void)tok.skipOne(CharacterSet::LF);
    }

    // If mimeHeaderBlock_ had just whitespace line(s) followed by CRLF,
    // then we skipped everything, including that terminating LF.
    // Restore the terminating CRLF if needed.
    if (tok.atEnd())
        mimeHeaderBlock_ = Http1::CrLf();
    else
        mimeHeaderBlock_ = tok.remaining();
    // now mimeHeaderBlock_ has 0+ fields followed by the LF terminator
}

/**
 * Replace obs-fold with a single SP,
 *
 * RFC 7230 section 3.2.4
 * "A server that receives an obs-fold in a request message that is not
 *  within a message/http container MUST ... replace
 *  each received obs-fold with one or more SP octets prior to
 *  interpreting the field value or forwarding the message downstream."
 *
 * "A proxy or gateway that receives an obs-fold in a response message
 *  that is not within a message/http container MUST ... replace each
 *  received obs-fold with one or more SP octets prior to interpreting
 *  the field value or forwarding the message downstream."
 */
void
Http::One::Parser::unfoldMime()
{
    Tokenizer tok(mimeHeaderBlock_);
    const auto szLimit = mimeHeaderBlock_.length();
    mimeHeaderBlock_.clear();
    // prevent the mime sender being able to make append() realloc/grow multiple times.
    mimeHeaderBlock_.reserveSpace(szLimit);

    static const CharacterSet nonCRLF = (CharacterSet::CR + CharacterSet::LF).complement().rename("non-CRLF");

    while (!tok.atEnd()) {
        const SBuf all(tok.remaining());
        const auto blobLen = tok.skipAll(nonCRLF); // may not be there
        const auto crLen = tok.skipAll(CharacterSet::CR); // may not be there
        const auto lfLen = tok.skipOne(CharacterSet::LF); // may not be there

        if (lfLen && tok.skipAll(CharacterSet::WSP)) { // obs-fold!
            mimeHeaderBlock_.append(all.substr(0, blobLen));
            mimeHeaderBlock_.append(' '); // replace one obs-fold with one SP
        } else
            mimeHeaderBlock_.append(all.substr(0, blobLen + crLen + lfLen));
    }
}

bool
Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
{
    // MIME headers block exist in (only) HTTP/1.x and ICY
    const bool expectMime = (msgProtocol_.protocol == AnyP::PROTO_HTTP && msgProtocol_.major == 1) ||
                            msgProtocol_.protocol == AnyP::PROTO_ICY ||
                            hackExpectsMime_;

    if (expectMime) {
        /* NOTE: HTTP/0.9 messages do not have a mime header block.
         *       So the rest of the code will need to deal with '0'-byte headers
         *       (ie, none, so don't try parsing em)
         */
        bool containsObsFold;
        if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_, containsObsFold)) {

            // Squid could handle these headers, but admin does not want to
            if (firstLineSize() + mimeHeaderBytes >= limit) {
                debugs(33, 5, "Too large " << which);
                parseStatusCode = Http::scHeaderTooLarge;
                buf_.consume(mimeHeaderBytes);
                parsingStage_ = HTTP_PARSE_DONE;
                return false;
            }

            mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
            cleanMimePrefix();
            if (containsObsFold)
                unfoldMime();

            debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");

        } else { // headersEnd() == 0
            if (buf_.length()+firstLineSize() >= limit) {
                debugs(33, 5, "Too large " << which);
                parseStatusCode = Http::scHeaderTooLarge;
                parsingStage_ = HTTP_PARSE_DONE;
            } else
                debugs(33, 5, "Incomplete " << which << ", waiting for end of headers");
            return false;
        }

    } else
        debugs(33, 3, "Missing HTTP/1.x identifier");

    // NP: we do not do any further stages here yet so go straight to DONE
    parsingStage_ = HTTP_PARSE_DONE;

    return true;
}

// arbitrary maximum-length for headers which can be found by Http1Parser::getHostHeaderField()
#define GET_HDR_SZ  1024

// BUG: returns only the first header line with given name,
//      ignores multi-line headers and obs-fold headers
char *
Http::One::Parser::getHostHeaderField()
{
    if (!headerBlockSize())
        return nullptr;

    LOCAL_ARRAY(char, header, GET_HDR_SZ);
    const char *name = "Host";
    const int namelen = strlen(name);

    debugs(25, 5, "looking for " << name);

    // while we can find more LF in the SBuf
    Tokenizer tok(mimeHeaderBlock_);
    SBuf p;

    while (tok.prefix(p, LineCharacters())) {
        if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
            break; // error. reached invalid octet or end of buffer instead of an LF ??

        // header lines must start with the name (case insensitive)
        if (p.substr(0, namelen).caseCmp(name, namelen))
            continue;

        // then a COLON
        if (p[namelen] != ':')
            continue;

        // drop any trailing *CR sequence
        p.trim(Http1::CrLf(), false, true);

        debugs(25, 5, "checking " << p);
        p.consume(namelen + 1);

        // TODO: optimize SBuf::trim to take CharacterSet directly
        Tokenizer t(p);
        t.skipAll(CharacterSet::WSP);
        p = t.remaining();

        // prevent buffer overrun on char header[];
        p.chop(0, sizeof(header)-1);

        // currently only used for pre-parse Host header, ensure valid domain[:port] or ip[:port]
        static const auto hostChars = CharacterSet("host",":[].-_") + CharacterSet::ALPHA + CharacterSet::DIGIT;
        if (p.findFirstNotOf(hostChars) != SBuf::npos)
            break; // error. line contains character not accepted in Host header

        // return the header field-value
        SBufToCstring(header, p);
        debugs(25, 5, "returning " << header);
        return header;
    }

    return nullptr;
}

/// \returns the debugging level for reporting tolerated protocol
/// violations: DBG_IMPORTANT when Config.onoff.relaxed_header_parser is
/// negative, and 5 otherwise
int
Http::One::ErrorLevel()
{
    if (Config.onoff.relaxed_header_parser < 0)
        return DBG_IMPORTANT;

    return 5;
}

// BWS = *( SP / HTAB ) ; WhitespaceCharacters() may relax this RFC 7230 rule
/// Skips an optional BWS sequence at the start of tok.
/// \throws InsufficientInput if tok has nothing left after the BWS, since
///         more BWS octets could still arrive
void
Http::One::ParseBws(Parser::Tokenizer &tok)
{
    const auto bwsLength = tok.skipAll(Parser::WhitespaceCharacters());

    // applies even if some BWS octets were skipped
    if (tok.atEnd())
        throw InsufficientInput();

    // success: no more BWS characters expected

    if (!bwsLength)
        return; // we successfully "parsed" an empty BWS sequence

    // Generating BWS is a MUST-level violation so warn about it as needed.
    // RFC 7230 says we MUST parse BWS, so the sequence is accepted even if
    // Config.onoff.relaxed_header_parser is off.
    debugs(33, ErrorLevel(), "found " << bwsLength << " BWS octets");
}
Commit	Line	Data
48a37aee	1	/*
bf95c10a	2	* Copyright (C) 1996-2022 The Squid Software Foundation and contributors
48a37aee AJ	3	*
	4	* Squid software is distributed under GPLv2+ license and includes
	5	* contributions from numerous individuals and organizations.
	6	* Please see the COPYING and CONTRIBUTORS files for details.
	7	*/
	8
c99510dd	9	#include "squid.h"
417da400	10	#include "base/CharacterSet.h"
675b8408	11	#include "debug/Stream.h"
c99510dd	12	#include "http/one/Parser.h"
f1d5359e	13	#include "mime_header.h"
417da400	14	#include "parser/Tokenizer.h"
b8f86fd2	15	#include "SquidConfig.h"
c99510dd	16
9651320a AJ	17	/// RFC 7230 section 2.6 - 7 magic octets
	18	const SBuf Http::One::Parser::Http1magic("HTTP/1.");
	19
00237269 AJ	20	const SBuf &Http::One::CrLf()
	21	{
	22	static const SBuf crlf("\r\n");
	23	return crlf;
	24	}
	25
c99510dd AJ	26	void
	27	Http::One::Parser::clear()
	28	{
	29	parsingStage_ = HTTP_PARSE_NONE;
aee3523a	30	buf_ = nullptr;
c99510dd AJ	31	msgProtocol_ = AnyP::ProtocolVersion();
	32	mimeHeaderBlock_.clear();
	33	}
	34
00237269 AJ	35	/// characters HTTP permits tolerant parsers to accept as delimiters
	36	static const CharacterSet &
	37	RelaxedDelimiterCharacters()
	38	{
	39	// RFC 7230 section 3.5
	40	// tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C),
	41	// or bare CR as whitespace between request-line fields
	42	static const CharacterSet RelaxedDels =
	43	(CharacterSet::SP +
	44	CharacterSet::HTAB +
	45	CharacterSet("VT,FF","\x0B\x0C") +
	46	CharacterSet::CR).rename("relaxed-WSP");
	47
	48	return RelaxedDels;
	49	}
	50
26f0a359 AR	51	const CharacterSet &
	52	Http::One::Parser::WhitespaceCharacters()
	53	{
	54	return Config.onoff.relaxed_header_parser ?
	55	RelaxedDelimiterCharacters() : CharacterSet::WSP;
	56	}
	57
00237269 AJ	58	const CharacterSet &
	59	Http::One::Parser::DelimiterCharacters()
	60	{
	61	return Config.onoff.relaxed_header_parser ?
	62	RelaxedDelimiterCharacters() : CharacterSet::SP;
	63	}
	64
417da400 EB	65	void
417da400 EB	66	Http::One::Parser::skipLineTerminator(Tokenizer &tok) const
f1d5359e	67	{
00237269	68	if (tok.skip(Http1::CrLf()))
417da400	69	return;
b8f86fd2 AJ	70
b8f86fd2 AJ	71	if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
417da400	72	return;
b8f86fd2	73
188ad27f	74	if (tok.atEnd() \|\| (tok.remaining().length() == 1 && tok.remaining().at(0) == '\r'))
417da400	75	throw InsufficientInput();
188ad27f AJ	76
188ad27f AJ	77	throw TexcHere("garbage instead of CRLF line terminator");
b8f86fd2 AJ	78	}
b8f86fd2 AJ	79
00237269 AJ	80	/// all characters except the LF line terminator
	81	static const CharacterSet &
	82	LineCharacters()
	83	{
	84	static const CharacterSet line = CharacterSet::LF.complement("non-LF");
	85	return line;
	86	}
	87
	88	/**
	89	* Remove invalid lines (if any) from the mime prefix
	90	*
	91	* RFC 7230 section 3:
	92	* "A recipient that receives whitespace between the start-line and
	93	* the first header field MUST ... consume each whitespace-preceded
	94	* line without further processing of it."
	95	*
	96	* We need to always use the relaxed delimiters here to prevent
	97	* line smuggling through strict parsers.
	98	*
	99	* Note that 'whitespace' in RFC 7230 includes CR. So that means
	100	* sequences of CRLF will be pruned, but not sequences of bare-LF.
	101	*/
	102	void
	103	Http::One::Parser::cleanMimePrefix()
	104	{
417da400	105	Tokenizer tok(mimeHeaderBlock_);
00237269 AJ	106	while (tok.skipOne(RelaxedDelimiterCharacters())) {
	107	(void)tok.skipAll(LineCharacters()); // optional line content
	108	// LF terminator is required.
	109	// trust headersEnd() to ensure that we have at least one LF
	110	(void)tok.skipOne(CharacterSet::LF);
	111	}
	112
	113	// If mimeHeaderBlock_ had just whitespace line(s) followed by CRLF,
	114	// then we skipped everything, including that terminating LF.
	115	// Restore the terminating CRLF if needed.
	116	if (tok.atEnd())
	117	mimeHeaderBlock_ = Http1::CrLf();
	118	else
	119	mimeHeaderBlock_ = tok.remaining();
	120	// now mimeHeaderBlock_ has 0+ fields followed by the LF terminator
	121	}
	122
	123	/**
	124	* Replace obs-fold with a single SP,
	125	*
	126	* RFC 7230 section 3.2.4
	127	* "A server that receives an obs-fold in a request message that is not
	128	* within a message/http container MUST ... replace
	129	* each received obs-fold with one or more SP octets prior to
	130	* interpreting the field value or forwarding the message downstream."
	131	*
	132	* "A proxy or gateway that receives an obs-fold in a response message
	133	* that is not within a message/http container MUST ... replace each
	134	* received obs-fold with one or more SP octets prior to interpreting
	135	* the field value or forwarding the message downstream."
	136	*/
	137	void
	138	Http::One::Parser::unfoldMime()
	139	{
417da400	140	Tokenizer tok(mimeHeaderBlock_);
00237269 AJ	141	const auto szLimit = mimeHeaderBlock_.length();
	142	mimeHeaderBlock_.clear();
	143	// prevent the mime sender being able to make append() realloc/grow multiple times.
	144	mimeHeaderBlock_.reserveSpace(szLimit);
	145
	146	static const CharacterSet nonCRLF = (CharacterSet::CR + CharacterSet::LF).complement().rename("non-CRLF");
	147
	148	while (!tok.atEnd()) {
	149	const SBuf all(tok.remaining());
	150	const auto blobLen = tok.skipAll(nonCRLF); // may not be there
	151	const auto crLen = tok.skipAll(CharacterSet::CR); // may not be there
	152	const auto lfLen = tok.skipOne(CharacterSet::LF); // may not be there
	153
	154	if (lfLen && tok.skipAll(CharacterSet::WSP)) { // obs-fold!
	155	mimeHeaderBlock_.append(all.substr(0, blobLen));
	156	mimeHeaderBlock_.append(' '); // replace one obs-fold with one SP
	157	} else
	158	mimeHeaderBlock_.append(all.substr(0, blobLen + crLen + lfLen));
	159	}
	160	}
	161
b8f86fd2	162	bool
f8cab755	163	Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
b8f86fd2 AJ	164	{
	165	// MIME headers block exist in (only) HTTP/1.x and ICY
	166	const bool expectMime = (msgProtocol_.protocol == AnyP::PROTO_HTTP && msgProtocol_.major == 1) \|\|
e47e0802 AJ	167	msgProtocol_.protocol == AnyP::PROTO_ICY \|\|
e47e0802 AJ	168	hackExpectsMime_;
b8f86fd2 AJ	169
b8f86fd2 AJ	170	if (expectMime) {
f1d5359e AJ	171	/* NOTE: HTTP/0.9 messages do not have a mime header block.
	172	* So the rest of the code will need to deal with '0'-byte headers
	173	* (ie, none, so don't try parsing em)
	174	*/
00237269 AJ	175	bool containsObsFold;
00237269 AJ	176	if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_, containsObsFold)) {
f8cab755 AJ	177
	178	// Squid could handle these headers, but admin does not want to
	179	if (firstLineSize() + mimeHeaderBytes >= limit) {
	180	debugs(33, 5, "Too large " << which);
	181	parseStatusCode = Http::scHeaderTooLarge;
	182	buf_.consume(mimeHeaderBytes);
	183	parsingStage_ = HTTP_PARSE_DONE;
	184	return false;
	185	}
	186
	187	mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
00237269 AJ	188	cleanMimePrefix();
	189	if (containsObsFold)
	190	unfoldMime();
	191
f8cab755 AJ	192	debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");
	193
	194	} else { // headersEnd() == 0
f1d5359e AJ	195	if (buf_.length()+firstLineSize() >= limit) {
	196	debugs(33, 5, "Too large " << which);
	197	parseStatusCode = Http::scHeaderTooLarge;
	198	parsingStage_ = HTTP_PARSE_DONE;
	199	} else
	200	debugs(33, 5, "Incomplete " << which << ", waiting for end of headers");
	201	return false;
	202	}
b8f86fd2	203
f1d5359e AJ	204	} else
	205	debugs(33, 3, "Missing HTTP/1.x identifier");
	206
	207	// NP: we do not do any further stages here yet so go straight to DONE
	208	parsingStage_ = HTTP_PARSE_DONE;
	209
f1d5359e AJ	210	return true;
	211	}
	212
2a51e34e	213	// arbitrary maximum-length for headers which can be found by Http1Parser::getHostHeaderField()
f53969cc	214	#define GET_HDR_SZ 1024
c99510dd	215
687696c1 AJ	216	// BUG: returns only the first header line with given name,
687696c1 AJ	217	// ignores multi-line headers and obs-fold headers
c99510dd	218	char *
2a51e34e	219	Http::One::Parser::getHostHeaderField()
c99510dd	220	{
2a51e34e	221	if (!headerBlockSize())
aee3523a	222	return nullptr;
c99510dd	223
687696c1	224	LOCAL_ARRAY(char, header, GET_HDR_SZ);
2a51e34e	225	const char *name = "Host";
1296170f	226	const int namelen = strlen(name);
687696c1	227
f6c7fa03	228	debugs(25, 5, "looking for " << name);
c99510dd	229
f6c7fa03	230	// while we can find more LF in the SBuf
417da400	231	Tokenizer tok(mimeHeaderBlock_);
687696c1	232	SBuf p;
c99510dd	233
00237269	234	while (tok.prefix(p, LineCharacters())) {
2d40b13f	235	if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
2f8abb64	236	break; // error. reached invalid octet or end of buffer instead of an LF ??
c99510dd	237
687696c1 AJ	238	// header lines must start with the name (case insensitive)
687696c1 AJ	239	if (p.substr(0, namelen).caseCmp(name, namelen))
c99510dd AJ	240	continue;
c99510dd AJ	241
687696c1 AJ	242	// then a COLON
687696c1 AJ	243	if (p[namelen] != ':')
c99510dd AJ	244	continue;
c99510dd AJ	245
687696c1	246	// drop any trailing *CR sequence
00237269	247	p.trim(Http1::CrLf(), false, true);
c99510dd	248
687696c1 AJ	249	debugs(25, 5, "checking " << p);
687696c1 AJ	250	p.consume(namelen + 1);
c99510dd	251
687696c1	252	// TODO: optimize SBuf::trim to take CharacterSet directly
417da400	253	Tokenizer t(p);
9bafa70d	254	t.skipAll(CharacterSet::WSP);
687696c1	255	p = t.remaining();
c99510dd	256
687696c1 AJ	257	// prevent buffer overrun on char header[];
687696c1 AJ	258	p.chop(0, sizeof(header)-1);
c99510dd	259
2a51e34e AJ	260	// currently only used for pre-parse Host header, ensure valid domain[:port] or ip[:port]
	261	static const auto hostChars = CharacterSet("host",":[].-_") + CharacterSet::ALPHA + CharacterSet::DIGIT;
	262	if (p.findFirstNotOf(hostChars) != SBuf::npos)
	263	break; // error. line contains character not accepted in Host header
	264
687696c1	265	// return the header field-value
3f0e38d6	266	SBufToCstring(header, p);
f6c7fa03	267	debugs(25, 5, "returning " << header);
687696c1	268	return header;
c99510dd AJ	269	}
c99510dd AJ	270
aee3523a	271	return nullptr;
c99510dd	272	}
f53969cc	273
9a4b5048	274	int
26f0a359	275	Http::One::ErrorLevel()
9a4b5048 AJ	276	{
	277	return Config.onoff.relaxed_header_parser < 0 ? DBG_IMPORTANT : 5;
	278	}
2c4e5226	279
26f0a359	280	// BWS = *( SP / HTAB ) ; WhitespaceCharacters() may relax this RFC 7230 rule
417da400 EB	281	void
417da400 EB	282	Http::One::ParseBws(Parser::Tokenizer &tok)
26f0a359	283	{
417da400 EB	284	const auto count = tok.skipAll(Parser::WhitespaceCharacters());
	285
	286	if (tok.atEnd())
	287	throw InsufficientInput(); // even if count is positive
	288
	289	if (count) {
26f0a359 AR	290	// Generating BWS is a MUST-level violation so warn about it as needed.
	291	debugs(33, ErrorLevel(), "found " << count << " BWS octets");
	292	// RFC 7230 says we MUST parse BWS, so we fall through even if
	293	// Config.onoff.relaxed_header_parser is off.
	294	}
	295	// else we successfully "parsed" an empty BWS sequence
	296
417da400	297	// success: no more BWS characters expected
26f0a359	298	}
cae5602c	299