]> git.ipfire.org Git - thirdparty/AWStats.git/commitdiff
Decode RFC 3986 "unreserved chars" in URLs. 92/head
author: Tomaz Solc <tomaz.solc@tablix.org>
Sun, 25 Feb 2018 19:25:23 +0000 (20:25 +0100)
committer: Tomaz Solc <tomaz.solc@tablix.org>
Wed, 7 Mar 2018 17:20:21 +0000 (18:20 +0100)
This makes awstats treat "/foo" and "/%66%6f%6f" as equivalent.

This change only affects the RFC 3986 "unreserved" characters (ASCII letters,
digits, "-", ".", "_" and "~"). It doesn't do any kind of UTF-8 decoding (as per RFC 3986, Section 2.3).

wwwroot/cgi-bin/awstats.pl

index b04ed6f8fe931d36b4e17a9c2f2ade504033b8f5..5a7d809efb5717d2b0272fb950a182d3a21f9938 100755 (executable)
@@ -7906,6 +7906,22 @@ sub DecodeEncodedString {
        return $stringtodecode;
 }
 
+#------------------------------------------------------------------------------
+# Function:     Similar to DecodeEncodedString, but decode only percent-encoded
+#               RFC3986 "unreserved characters" (A-Z a-z 0-9 - . _ ~)
+# Parameters:   stringtodecode
+# Input:        None
+# Output:       None
+# Return:       decodedstring (every other %XX sequence is left untouched)
+#------------------------------------------------------------------------------
+sub DecodeRFC3986UnreservedString {
+       my $stringtodecode = shift;
+       # %41-%4F/%61-%6F and %50-%5A/%70-%7A are letters, %30-%39 digits, then - . _ ~ (/i: hex digits in either case)
+       $stringtodecode =~ s/%([46][1-9A-F]|[57][0-9A]|3[0-9]|2D|2E|5F|7E)/pack("C", hex($1))/ieg;
+
+       return $stringtodecode;
+}
+
 #------------------------------------------------------------------------------
 # Function:     Decode a precompiled regex value to a common regex value
 # Parameters:   compiledregextodecode
@@ -18718,6 +18734,14 @@ if ( $UpdateStats && $FrameName ne 'index' && $FrameName ne 'mainleft' )
        # We keep a clean $field[$pos_url] and
        # we store original value for urlwithnoquery, tokenquery and standalonequery
        #---------------------------------------------------------------------------
+
+               # Decode percent-encoded "unreserved characters": a URI that encodes
+               # these common ASCII characters is equivalent to its unencoded form.
+               #
+               # See Section 2.3 of RFC 3986.
+
+               $field[$pos_url] = DecodeRFC3986UnreservedString($field[$pos_url]);
+
                if ($URLNotCaseSensitive) { $field[$pos_url] = lc( $field[$pos_url] ); }
 
 # Possible URL syntax for $field[$pos_url]: /mydir/mypage.ext?param1=x&param2=y#aaa, /mydir/mypage.ext#aaa, /