git.ipfire.org Git - thirdparty/curl.git/commitdiff
GHA: spellcheck curl symbols better
author: Daniel Stenberg <daniel@haxx.se>
Thu, 27 Feb 2025 10:17:42 +0000 (11:17 +0100)
committer: Daniel Stenberg <daniel@haxx.se>
Thu, 27 Feb 2025 12:15:21 +0000 (13:15 +0100)
This now makes sure to trim off exact matches for curl symbols and long
curl command line options instead of using pattern matching as before.
This should catch typoed names (that still follow the pattern) better.

The cleanspell.pl script is no longer used. cleancmd.pl is used for all
markdown files.

Closes #16504

.github/scripts/cleancmd.pl
.github/scripts/cleanspell.pl [deleted file]
.github/scripts/spellcheck.curl [new file with mode: 0644]
.github/workflows/checkdocs.yml

index 355a845e4c5e5627a3666e400fec08dabc292af2..283a9f4e934f4f8613ef718b9309db1113fe5d22 100755 (executable)
 #
 # SPDX-License-Identifier: curl
 #
-# Input: a cmdline docs markdown, it gets modified *in place*
+# Input: cmdline docs markdown files, they get modified *in place*
 #
-# The main purpose is to strip off the leading meta-data part, but also to
-# clean up whatever else the spell checker might have a problem with that we
-# still deem is fine.
-
-my $header = 1;
-while(1) {
-    # set this if the markdown has no meta-data header to skip
-    if($ARGV[0] eq "--no-header") {
-        shift @ARGV;
-        $header = 0;
+# Strip off the leading meta-data/header part, remove all known curl symbols
+# and long command line options. Also clean up whatever else the spell checker
+# might have a problem with that we still deem is fine.
+#
+
+open(S, "<./docs/libcurl/symbols-in-versions")
+    || die "can't find symbols-in-versions";
+while(<S>) {
+    if(/^([^ ]*) /) {
+        push @asyms, $1;
+    }
+}
+close(S);
+
+# init the opts table with "special" options not easy to figure out
+my @aopts = (
+    '--ftp-ssl-reqd', # old alias
+    );
+
+open(O, "<./docs/options-in-versions")
+    || die "can't find options-in-versions";
+while(<O>) {
+    chomp;
+    if(/^([^ ]+)/) {
+        my $o = $1;
+        push @aopts, $o;
+        if($o =~ /^--no-(.*)/) {
+            # for the --no options, also make one without it
+            push @aopts, "--$1";
+        }
+        elsif($o =~ /^--disable-(.*)/) {
+            # for the --disable options, also make the special ones
+            push @aopts, "--$1";
+            push @aopts, "--no-$1";
+        }
+    }
+}
+close(O);
+
+open(C, "<./.github/scripts/spellcheck.curl")
+    || die "can't find spellcheck.curl";
+while(<C>) {
+    if(/^\#/) {
+        next;
     }
-    else {
-        last;
+    chomp;
+    if(/^([^ ]+)/) {
+        push @asyms, $1;
     }
 }
+close(C);
 
-my $f = $ARGV[0];
+# longest symbols first
+my @syms = sort { length($b) <=> length($a) } @asyms;
 
-open(F, "<$f") or die;
+# longest cmdline options first
+my @opts = sort { length($b) <=> length($a) } @aopts;
 
-my $ignore = $header;
-my $sepcount = 0;
-my @out;
-while(<F>) {
-    if(/^---/ && $header) {
-        if(++$sepcount == 2) {
+sub process {
+    my ($f) = @_;
+
+    my $ignore = 0;
+    my $sepcount = 0;
+    my $out;
+    my $line = 0;
+    open(F, "<$f") or die;
+
+    while(<F>) {
+        $line++;
+        if(/^---/ && ($line == 1)) {
+            $ignore = 1;
+            next;
+        }
+        elsif(/^---/ && $ignore) {
             $ignore = 0;
+            next;
         }
-        next;
-    }
-    next if($ignore);
+        next if($ignore);
+
+        my $l = $_;
 
-    # strip out backticked words
-    $_ =~ s/`[^`]+`//g;
+        # strip out backticked words
+        $l =~ s/`[^`]+`//g;
 
-    # strip out all long command line options
-    $_ =~ s/--[a-z0-9-]+//g;
+        # **bold**
+        $l =~ s/\*\*(\S.*?)\*\*//g;
+        # *italics*
+        $l =~ s/\*(\S.*?)\*//g;
 
-    # strip out https URLs, we don't want them spellchecked
-    $_ =~ s!https://[a-z0-9\#_/.-]+!!gi;
+        # strip out https URLs, we don't want them spellchecked
+        $l =~ s!https://[a-z0-9\#_/.-]+!!gi;
 
-    push @out, $_;
+        $out .= $l;
+    }
+    close(F);
+
+    # cut out all known curl cmdline options
+    map { $out =~ s/$_//g; } (@opts);
+
+    # cut out all known curl symbols
+    map { $out =~ s/\b$_\b//g; } (@syms);
+
+    if(!$ignore) {
+        open(O, ">$f") or die;
+        print O $out;
+        close(O);
+    }
 }
-close(F);
 
-if(!$ignore) {
-    open(O, ">$f") or die;
-    print O @out;
-    close(O);
+for my $f (@ARGV) {
+    process($f);
 }
diff --git a/.github/scripts/cleanspell.pl b/.github/scripts/cleanspell.pl
deleted file mode 100755 (executable)
index bfa07dc..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env perl
-# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
-#
-# SPDX-License-Identifier: curl
-#
-# Given: a libcurl curldown man page
-# Outputs: the same file, minus the SYNOPSIS and the EXAMPLE sections
-#
-
-my $f = $ARGV[0];
-
-open(F, "<$f") or die;
-
-my @out;
-my $ignore = 0;
-while(<F>) {
-    if($_ =~ /^# (SYNOPSIS|EXAMPLE)/) {
-        $ignore = 1;
-    }
-    elsif($ignore && ($_ =~ /^# [A-Z]/)) {
-        $ignore = 0;
-    }
-    elsif(!$ignore) {
-        # **bold**
-        $_ =~ s/\*\*(\S.*?)\*\*//g;
-        # *italics*
-        $_ =~ s/\*(\S.*?)\*//g;
-
-        $_ =~ s/CURL(M|SH|U|H)code//g;
-        $_ =~ s/CURL_[A-Z0-9_]*//g;
-        $_ =~ s/CURLALTSVC_[A-Z0-9_]*//g;
-        $_ =~ s/CURLAUTH_[A-Z0-9_]*//g;
-        $_ =~ s/CURLE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFORM_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTP_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTPAUTH_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTPMETHOD_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTPSSL_[A-Z0-9_]*//g;
-        $_ =~ s/CURLGSSAPI_[A-Z0-9_]*//g;
-        $_ =~ s/CURLHEADER_[A-Z0-9_]*//g;
-        $_ =~ s/CURLINFO_[A-Z0-9_]*//g;
-        $_ =~ s/CURLM_[A-Z0-9_]*//g;
-        $_ =~ s/CURLMIMEOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLMOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPIPE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPROTO_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPROXY_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPX_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSHE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSHOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSSLOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSSH_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSSLBACKEND_[A-Z0-9_]*//g;
-        $_ =~ s/CURLU_[A-Z0-9_]*//g;
-        $_ =~ s/CURLUPART_[A-Z0-9_]*//g;
-        #$_ =~ s/\bCURLU\b//g; # stand-alone CURLU
-        $_ =~ s/CURLUE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLHE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLWS_[A-Z0-9_]*//g;
-        $_ =~ s/CURLKH[A-Z0-9_]*//g;
-        $_ =~ s/CURLUPART_[A-Z0-9_]*//g;
-        $_ =~ s/CURLUSESSL_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPAUSE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLHSTS_[A-Z0-9_]*//g;
-        $_ =~ s/curl_global_([a-z_]*)//g;
-        $_ =~ s/curl_(strequal|strnequal|formadd|waitfd|formget|getdate|formfree)//g;
-        $_ =~ s/curl_easy_([a-z]*)//g;
-        $_ =~ s/curl_multi_([a-z_]*)//g;
-        $_ =~ s/curl_mime_(subparts|addpart|filedata|data_cb)//g;
-        $_ =~ s/curl_ws_(send|recv|meta)//g;
-        $_ =~ s/curl_url_(dup)//g;
-        $_ =~ s/curl_pushheader_by(name|num)//g;
-        $_ =~ s/libcurl-(env|ws)//g;
-        $_ =~ s/libcurl\\-(env|ws)//g;
-        $_ =~ s/(^|\W)((tftp|https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&'()*+,;=\\]+)//gi;
-        push @out, $_;
-    }
-}
-close(F);
-
-open(O, ">$f") or die;
-for my $l (@out) {
-    print O $l;
-}
-close(O);
diff --git a/.github/scripts/spellcheck.curl b/.github/scripts/spellcheck.curl
new file mode 100644 (file)
index 0000000..4de9d86
--- /dev/null
@@ -0,0 +1,151 @@
+# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
+#
+# SPDX-License-Identifier: curl
+#
+# common variable types + structs
+# callback typedefs
+# public functions names
+# some man page names
+curl_fileinfo
+curl_forms
+curl_hstsentry
+curl_httppost
+curl_index
+curl_khkey
+curl_pushheaders
+curl_waitfd
+CURLcode
+CURLformoption
+CURLHcode
+CURLMcode
+CURLMsg
+CURLSHcode
+CURLUcode
+curl_calloc_callback
+curl_chunk_bgn_callback
+curl_chunk_end_callback
+curl_conv_callback
+curl_debug_callback
+curl_fnmatch_callback
+curl_formget_callback
+curl_free_callback
+curl_hstsread_callback
+curl_hstswrite_callback
+curl_ioctl_callback
+curl_malloc_callback
+curl_multi_timer_callback
+curl_opensocket_callback
+curl_prereq_callback
+curl_progress_callback
+curl_push_callback
+curl_read_callback
+curl_realloc_callback
+curl_resolver_start_callback
+curl_seek_callback
+curl_socket_callback
+curl_sockopt_callback
+curl_ssl_ctx_callback
+curl_strdup_callback
+curl_trailer_callback
+curl_write_callback
+curl_xferinfo_callback
+curl_strequal
+curl_strnequal
+curl_mime_init
+curl_mime_free
+curl_mime_addpart
+curl_mime_name
+curl_mime_filename
+curl_mime_type
+curl_mime_encoder
+curl_mime_data
+curl_mime_filedata
+curl_mime_data_cb
+curl_mime_subparts
+curl_mime_headers
+curl_formadd
+curl_formget
+curl_formfree
+curl_getdate
+curl_getenv
+curl_version
+curl_easy_escape
+curl_escape
+curl_easy_unescape
+curl_unescape
+curl_free
+curl_global_init
+curl_global_init_mem
+curl_global_cleanup
+curl_global_trace
+curl_global_sslset
+curl_slist_append
+curl_slist_free_all
+curl_getdate
+curl_share_init
+curl_share_setopt
+curl_share_cleanup
+curl_version_info
+curl_easy_strerror
+curl_share_strerror
+curl_easy_pause
+curl_easy_ssls_import
+curl_easy_ssls_export
+curl_easy_init
+curl_easy_setopt
+curl_easy_perform
+curl_easy_cleanup
+curl_easy_getinfo
+curl_easy_duphandle
+curl_easy_reset
+curl_easy_recv
+curl_easy_send
+curl_easy_upkeep
+curl_easy_header
+curl_easy_nextheader
+curl_mprintf
+curl_mfprintf
+curl_msprintf
+curl_msnprintf
+curl_mvprintf
+curl_mvfprintf
+curl_mvsprintf
+curl_mvsnprintf
+curl_maprintf
+curl_mvaprintf
+curl_multi_init
+curl_multi_add_handle
+curl_multi_remove_handle
+curl_multi_fdset
+curl_multi_waitfds
+curl_multi_wait
+curl_multi_poll
+curl_multi_wakeup
+curl_multi_perform
+curl_multi_cleanup
+curl_multi_info_read
+curl_multi_strerror
+curl_multi_socket
+curl_multi_socket_action
+curl_multi_socket_all
+curl_multi_timeout
+curl_multi_setopt
+curl_multi_assign
+curl_multi_get_handles
+curl_pushheader_bynum
+curl_pushheader_byname
+curl_multi_waitfds
+curl_easy_option_by_name
+curl_easy_option_by_id
+curl_easy_option_next
+curl_url
+curl_url_cleanup
+curl_url_dup
+curl_url_get
+curl_url_set
+curl_url_strerror
+curl_ws_recv
+curl_ws_send
+curl_ws_meta
+libcurl-env
+libcurl-ws
index 753883e8eb76af03db55b383c5eddc50d4270467..dd52efab146539d8cdf01a168b7e535bda78ef55 100644 (file)
@@ -107,20 +107,8 @@ jobs:
           persist-credentials: false
         name: checkout
 
-      - name: trim all man page *.md files
-        run: find docs -name "*.md" ! -name "_*" -print0 | xargs -0 -n1 .github/scripts/cleancmd.pl
-
-      - name: trim libcurl man page *.md files
-        run: find docs/libcurl \( -name "curl_*.md" -o -name "libcurl*.md" \) -print0 | xargs -0 -n1 .github/scripts/cleanspell.pl
-
-      - name: trim libcurl option man page *.md files
-        run: find docs/libcurl/opts -name "CURL*.md" -print0 | xargs -0 -n1 .github/scripts/cleanspell.pl
-
-      - name: trim cmdline docs markdown _*.md files
-        run: find docs/cmdline-opts -name "_*.md" -print0 | xargs -0 -n1 .github/scripts/cleancmd.pl --no-header
-
-      - name: trim docs/ markdown _*.md files
-        run: git ls-files docs/*.md docs/internals/*.md | xargs -n1 .github/scripts/cleancmd.pl --no-header
+      - name: trim all *.md files in docs/
+        run: .github/scripts/cleancmd.pl $(find docs -name "*.md")
 
       - name: setup the custom wordlist
         run: grep -v '^#' .github/scripts/spellcheck.words >  wordlist.txt