From: Daniel Stenberg Date: Fri, 5 Dec 2025 10:45:35 +0000 (+0100) Subject: mdlinkcheck: detect and check "raw" links X-Git-Tag: rc-8_18_0-1~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=51587f6f14af22e932d2448ed3a7b0052e880ea1;p=thirdparty%2Fcurl.git mdlinkcheck: detect and check "raw" links - URLs specified outside of the markdown []() are now extracted and checked - also check TODO, FAQ and KNOWN_BUGS - more aggressive avoiding to check github.com/curl/curl, all uses of example domains and some more established URLs on the curl.se site - list all errors in the end to make them easier to spot in CI logs Closes #19848 --- diff --git a/.github/workflows/checkdocs.yml b/.github/workflows/checkdocs.yml index 60d4230194..ba3858f66c 100644 --- a/.github/workflows/checkdocs.yml +++ b/.github/workflows/checkdocs.yml @@ -15,8 +15,7 @@ name: 'Docs' paths: - '.github/workflows/checkdocs.yml' - '.github/scripts/**' - - '.github/scripts/mdlinkcheck' - - '/scripts/**' + - 'scripts/**' - '**.md' - 'docs/*' pull_request: @@ -25,8 +24,7 @@ name: 'Docs' paths: - '.github/workflows/checkdocs.yml' - '.github/scripts/**' - - '.github/scripts/mdlinkcheck' - - '/scripts/**' + - 'scripts/**' - '**.md' - 'docs/*' diff --git a/scripts/mdlinkcheck b/scripts/mdlinkcheck index 6b648786f3..734617949d 100755 --- a/scripts/mdlinkcheck +++ b/scripts/mdlinkcheck @@ -27,7 +27,10 @@ use strict; use warnings; my %whitelist = ( + 'https://curl.se' => 1, 'https://curl.se/' => 1, + 'https://curl.se/bug/' => 1, + 'https://curl.se/bug/view.cgi' => 1, 'https://curl.se/changes.html' => 1, 'https://curl.se/dev/advisory.html' => 1, 'https://curl.se/dev/builds.html' => 1, @@ -40,19 +43,25 @@ my %whitelist = ( 'https://curl.se/docs/bugbounty.html' => 1, 'https://curl.se/docs/caextract.html' => 1, 'https://curl.se/docs/copyright.html' => 1, + 'https://curl.se/docs/http-cookies.html' => 1, 'https://curl.se/docs/install.html' => 1, 'https://curl.se/docs/knownbugs.html' => 1, 
'https://curl.se/docs/manpage.html' => 1, + 'https://curl.se/docs/releases.html' => 1, 'https://curl.se/docs/security.html' => 1, + 'https://curl.se/docs/ssl-ciphers.html' => 1, + 'https://curl.se/docs/ssl-compared.html' => 1, 'https://curl.se/docs/sslcerts.html' => 1, 'https://curl.se/docs/thanks.html' => 1, 'https://curl.se/docs/todo.html' => 1, 'https://curl.se/docs/vulnerabilities.html' => 1, + 'https://curl.se/download.html' => 1, 'https://curl.se/libcurl/' => 1, - 'https://curl.se/libcurl/c/CURLOPT_SSLVERSION.html' => 1, 'https://curl.se/libcurl/c/CURLOPT_SSL_CIPHER_LIST.html' => 1, + 'https://curl.se/libcurl/c/CURLOPT_SSLVERSION.html' => 1, 'https://curl.se/libcurl/c/CURLOPT_TLS13_CIPHERS.html' => 1, 'https://curl.se/libcurl/c/libcurl.html' => 1, + 'https://curl.se/libcurl/c/threadsafe.html' => 1, 'https://curl.se/logo/curl-logo.svg' => 1, 'https://curl.se/mail/' => 1, 'https://curl.se/mail/etiquette.html' => 1, @@ -62,14 +71,15 @@ my %whitelist = ( 'https://curl.se/rfc/rfc2255.txt' => 1, 'https://curl.se/sponsors.html' => 1, 'https://curl.se/support.html' => 1, + 'https://curl.se/windows' => 1, + 'https://curl.se/windows/' => 1, + + 'https://testclutch.curl.se/' => 1, - 'https://github.com/curl/curl' => 1, 'https://github.com/curl/curl-fuzzer' => 1, 'https://github.com/curl/curl-www' => 1, - 'https://github.com/curl/curl/discussions' => 1, - 'https://github.com/curl/curl/issues' => 1, - 'https://github.com/curl/curl/labels/help%20wanted' => 1, - 'https://github.com/curl/curl/pulls' => 1, + 'https://github.com/curl/curl.git' => 1, + 'https://github.com/curl/curl/wcurl' => 1, ); @@ -77,7 +87,7 @@ my %url; my %flink; # list all .md files in the repo -my @files=`git ls-files '**.md'`; +my @files=`git ls-files '**.md' docs/TODO docs/KNOWN_BUGS docs/FAQ`; sub storelink { my ($f, $line, $link) = @_; @@ -91,7 +101,29 @@ sub storelink { $link =~ s:\#.*\z::; if($link =~ /^(https|http):/) { - $url{$link} .= "$f:$line "; + if($whitelist{$link}) { + #print "-- 
whitelisted: $link\n"; + } + # example.com is just example + elsif($link =~ /^https:\/\/(.*)example.(com|org|net)/) { + #print "-- example: $link\n"; + } + # so is using the .example TLD + elsif($link =~ /^https:\/\/(.*)\.example(\/|$|:)/) { + #print "-- .example: $link\n"; + } + # so is using anything on localhost + elsif($link =~ /^http(s|):\/\/localhost/) { + #print "-- localhost: $link\n"; + } + # ignore all links to curl's github repo + elsif($link =~ /^https:\/\/github.com\/curl\/curl(\/|$)/) { + #print "-- curl github repo: $link\n"; + } + else { + #print "ADD '$link'\n"; + $url{$link} .= "$f:$line "; + } return; } @@ -119,11 +151,19 @@ sub findlinks { return; while(<F>) { + chomp; if(/\]\(([^)]*)/) { my $link = $1; #print "$f:$line $link\n"; storelink($f, $line, $link); } + # ignore trailing: dot, quote, asterisk, hash, comma, question mark, + # colon, closing parenthesis, closing angle bracket, whitespace, pipe, + # backtick, semicolon + elsif(/(https:\/\/[a-z0-9.\/:%_-]+[^."*\#,?:\)> \t|`;])/i) { + #print "RAW "; + storelink($f, $line, $1); + } $line++; } close(F); @@ -133,11 +173,10 @@ sub checkurl { my ($url) = @_; if($whitelist{$url}) { - #print "$url is whitelisted\n"; + #print STDERR "$url is whitelisted\n"; return 0; } - print "check $url\n"; $url =~ s/\+/%2B/g; my @content; if(open(my $fh, '-|', 'curl', '-ILfsm10', '--retry', '2', '--retry-delay', '5', @@ -146,9 +185,10 @@ sub checkurl { close $fh; } if(!$content[0]) { - print STDERR "FAIL\n"; + print "FAIL: $url\n"; return 1; # fail } + print "OK: $url\n"; return 0; # ok } @@ -157,14 +197,19 @@ for my $f (@files) { findlinks($f); } -my $error; +#for my $u (sort keys %url) { +# print "$u\n"; +#} +#exit; +my $error; +my @errlist; for my $u (sort keys %url) { my $r = checkurl($u); if($r) { for my $f (split(/ /, $url{$u})) { - printf "%s ERROR links to missing URL %s\n", $f, $u; + push @errlist, sprintf "%s ERROR links to missing URL %s\n", $f, $u; $error++; } } @@ -173,10 +218,17 @@ for my $u (sort keys 
%url) { for my $l (sort keys %flink) { if(! -r $l) { for my $f (split(/ /, $flink{$l})) { - printf "%s ERROR links to missing file %s\n", $f, $l; + push @errlist, sprintf "%s ERROR links to missing file %s\n", $f, $l; $error++; } } } +printf "Checked %d URLs\n", scalar(keys %url); +if($error) { + print "$error URLs had problems:\n"; + for(@errlist) { + print $_; + } +} exit 1 if($error);