GHA: spellcheck curl symbols better

author Daniel Stenberg <daniel@haxx.se>

Thu, 27 Feb 2025 10:17:42 +0000 (11:17 +0100)

committer Daniel Stenberg <daniel@haxx.se>

Thu, 27 Feb 2025 12:15:21 +0000 (13:15 +0100)
author Daniel Stenberg <daniel@haxx.se>
Thu, 27 Feb 2025 10:17:42 +0000 (11:17 +0100)
committer Daniel Stenberg <daniel@haxx.se>
Thu, 27 Feb 2025 12:15:21 +0000 (13:15 +0100)
diff --git a/.github/scripts/cleancmd.pl b/.github/scripts/cleancmd.pl

index 355a845e4c5e5627a3666e400fec08dabc292af2..283a9f4e934f4f8613ef718b9309db1113fe5d22 100755 (executable)
--- a/.github/scripts/cleancmd.pl
+++ b/.github/scripts/cleancmd.pl
@@ -3,55 +3,117 @@
  #
  # SPDX-License-Identifier: curl
  #
-# Input: a cmdline docs markdown, it gets modified *in place*
+# Input: cmdline docs markdown files, they get modified *in place*
  #
-# The main purpose is to strip off the leading meta-data part, but also to
-# clean up whatever else the spell checker might have a problem with that we
-# still deem is fine.
-
-my $header = 1;
-while(1) {
-    # set this if the markdown has no meta-data header to skip
-    if($ARGV[0] eq "--no-header") {
-        shift @ARGV;
-        $header = 0;
+# Strip off the leading meta-data/header part, remove all known curl symbols
+# and long command line options. Also clean up whatever else the spell checker
+# might have a problem with but that we still deem fine.
+#
+
+open(S, "<./docs/libcurl/symbols-in-versions")
+    || die "can't find symbols-in-versions";
+while(<S>) {
+    if(/^([^ ]*) /) {
+        push @asyms, $1;
+    }
+}
+close(S);
+
+# init the opts table with "special" options not easy to figure out
+my @aopts = (
+    '--ftp-ssl-reqd', # old alias
+    );
+
+open(O, "<./docs/options-in-versions")
+    || die "can't find options-in-versions";
+while(<O>) {
+    chomp;
+    if(/^([^ ]+)/) {
+        my $o = $1;
+        push @aopts, $o;
+        if($o =~ /^--no-(.*)/) {
+            # for the --no options, also make one without it
+            push @aopts, "--$1";
+        }
+        elsif($o =~ /^--disable-(.*)/) {
+            # for the --disable options, also add the plain and --no- variants
+            push @aopts, "--$1";
+            push @aopts, "--no-$1";
+        }
+    }
+}
+close(O);
+
+open(C, "<./.github/scripts/spellcheck.curl")
+    || die "can't find spellcheck.curl";
+while(<C>) {
+    if(/^\#/) {
+        next;
      }
-    else {
-        last;
+    chomp;
+    if(/^([^ ]+)/) {
+        push @asyms, $1;
      }
  }
+close(C);
  
-my $f = $ARGV[0];
+# longest symbols first
+my @syms = sort { length($b) <=> length($a) } @asyms;
  
-open(F, "<$f") or die;
+# longest cmdline options first
+my @opts = sort { length($b) <=> length($a) } @aopts;
  
-my $ignore = $header;
-my $sepcount = 0;
-my @out;
-while(<F>) {
-    if(/^---/ && $header) {
-        if(++$sepcount == 2) {
+sub process {
+    my ($f) = @_;
+
+    my $ignore = 0;
+    my $sepcount = 0;
+    my $out;
+    my $line = 0;
+    open(F, "<$f") or die;
+
+    while(<F>) {
+        $line++;
+        if(/^---/ && ($line == 1)) {
+            $ignore = 1;
+            next;
+        }
+        elsif(/^---/ && $ignore) {
              $ignore = 0;
+            next;
          }
-        next;
-    }
-    next if($ignore);
+        next if($ignore);
+
+        my $l = $_;
  
-    # strip out backticked words
-    $_ =~ s/`[^`]+`//g;
+        # strip out backticked words
+        $l =~ s/`[^`]+`//g;
  
-    # strip out all long command line options
-    $_ =~ s/--[a-z0-9-]+//g;
+        # **bold**
+        $l =~ s/\*\*(\S.*?)\*\*//g;
+        # *italics*
+        $l =~ s/\*(\S.*?)\*//g;
  
-    # strip out https URLs, we don't want them spellchecked
-    $_ =~ s!https://[a-z0-9\#_/.-]+!!gi;
+        # strip out https URLs, we don't want them spellchecked
+        $l =~ s!https://[a-z0-9\#_/.-]+!!gi;
  
-    push @out, $_;
+        $out .= $l;
+    }
+    close(F);
+
+    # cut out all known curl cmdline options
+    map { $out =~ s/$_//g; } (@opts);
+
+    # cut out all known curl symbols
+    map { $out =~ s/\b$_\b//g; } (@syms);
+
+    if(!$ignore) {
+        open(O, ">$f") or die;
+        print O $out;
+        close(O);
+    }
  }
-close(F);
  
-if(!$ignore) {
-    open(O, ">$f") or die;
-    print O @out;
-    close(O);
+for my $f (@ARGV) {
+    process($f);
  }
diff --git a/.github/scripts/cleanspell.pl b/.github/scripts/cleanspell.pl

deleted file mode 100755 (executable)

index bfa07dc..0000000
--- a/.github/scripts/cleanspell.pl
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env perl
-# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
-#
-# SPDX-License-Identifier: curl
-#
-# Given: a libcurl curldown man page
-# Outputs: the same file, minus the SYNOPSIS and the EXAMPLE sections
-#
-
-my $f = $ARGV[0];
-
-open(F, "<$f") or die;
-
-my @out;
-my $ignore = 0;
-while(<F>) {
-    if($_ =~ /^# (SYNOPSIS|EXAMPLE)/) {
-        $ignore = 1;
-    }
-    elsif($ignore && ($_ =~ /^# [A-Z]/)) {
-        $ignore = 0;
-    }
-    elsif(!$ignore) {
-        # **bold**
-        $_ =~ s/\*\*(\S.*?)\*\*//g;
-        # *italics*
-        $_ =~ s/\*(\S.*?)\*//g;
-
-        $_ =~ s/CURL(M|SH|U|H)code//g;
-        $_ =~ s/CURL_[A-Z0-9_]*//g;
-        $_ =~ s/CURLALTSVC_[A-Z0-9_]*//g;
-        $_ =~ s/CURLAUTH_[A-Z0-9_]*//g;
-        $_ =~ s/CURLE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFORM_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTP_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTPAUTH_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTPMETHOD_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTPSSL_[A-Z0-9_]*//g;
-        $_ =~ s/CURLGSSAPI_[A-Z0-9_]*//g;
-        $_ =~ s/CURLHEADER_[A-Z0-9_]*//g;
-        $_ =~ s/CURLINFO_[A-Z0-9_]*//g;
-        $_ =~ s/CURLM_[A-Z0-9_]*//g;
-        $_ =~ s/CURLMIMEOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLMOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPIPE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPROTO_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPROXY_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPX_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSHE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSHOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSSLOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSSH_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSSLBACKEND_[A-Z0-9_]*//g;
-        $_ =~ s/CURLU_[A-Z0-9_]*//g;
-        $_ =~ s/CURLUPART_[A-Z0-9_]*//g;
-        #$_ =~ s/\bCURLU\b//g; # stand-alone CURLU
-        $_ =~ s/CURLUE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLHE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLWS_[A-Z0-9_]*//g;
-        $_ =~ s/CURLKH[A-Z0-9_]*//g;
-        $_ =~ s/CURLUPART_[A-Z0-9_]*//g;
-        $_ =~ s/CURLUSESSL_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPAUSE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLHSTS_[A-Z0-9_]*//g;
-        $_ =~ s/curl_global_([a-z_]*)//g;
-        $_ =~ s/curl_(strequal|strnequal|formadd|waitfd|formget|getdate|formfree)//g;
-        $_ =~ s/curl_easy_([a-z]*)//g;
-        $_ =~ s/curl_multi_([a-z_]*)//g;
-        $_ =~ s/curl_mime_(subparts|addpart|filedata|data_cb)//g;
-        $_ =~ s/curl_ws_(send|recv|meta)//g;
-        $_ =~ s/curl_url_(dup)//g;
-        $_ =~ s/curl_pushheader_by(name|num)//g;
-        $_ =~ s/libcurl-(env|ws)//g;
-        $_ =~ s/libcurl\\-(env|ws)//g;
-        $_ =~ s/(^|\W)((tftp|https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&'()*+,;=\\]+)//gi;
-        push @out, $_;
-    }
-}
-close(F);
-
-open(O, ">$f") or die;
-for my $l (@out) {
-    print O $l;
-}
-close(O);
diff --git a/.github/scripts/spellcheck.curl b/.github/scripts/spellcheck.curl

new file mode 100644 (file)

index 0000000..4de9d86
--- /dev/null
+++ b/.github/scripts/spellcheck.curl
@@ -0,0 +1,151 @@
+# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
+#
+# SPDX-License-Identifier: curl
+#
+# common variable types + structs
+# callback typedefs
+# public function names
+# some man page names
+curl_fileinfo
+curl_forms
+curl_hstsentry
+curl_httppost
+curl_index
+curl_khkey
+curl_pushheaders
+curl_waitfd
+CURLcode
+CURLformoption
+CURLHcode
+CURLMcode
+CURLMsg
+CURLSHcode
+CURLUcode
+curl_calloc_callback
+curl_chunk_bgn_callback
+curl_chunk_end_callback
+curl_conv_callback
+curl_debug_callback
+curl_fnmatch_callback
+curl_formget_callback
+curl_free_callback
+curl_hstsread_callback
+curl_hstswrite_callback
+curl_ioctl_callback
+curl_malloc_callback
+curl_multi_timer_callback
+curl_opensocket_callback
+curl_prereq_callback
+curl_progress_callback
+curl_push_callback
+curl_read_callback
+curl_realloc_callback
+curl_resolver_start_callback
+curl_seek_callback
+curl_socket_callback
+curl_sockopt_callback
+curl_ssl_ctx_callback
+curl_strdup_callback
+curl_trailer_callback
+curl_write_callback
+curl_xferinfo_callback
+curl_strequal
+curl_strnequal
+curl_mime_init
+curl_mime_free
+curl_mime_addpart
+curl_mime_name
+curl_mime_filename
+curl_mime_type
+curl_mime_encoder
+curl_mime_data
+curl_mime_filedata
+curl_mime_data_cb
+curl_mime_subparts
+curl_mime_headers
+curl_formadd
+curl_formget
+curl_formfree
+curl_getdate
+curl_getenv
+curl_version
+curl_easy_escape
+curl_escape
+curl_easy_unescape
+curl_unescape
+curl_free
+curl_global_init
+curl_global_init_mem
+curl_global_cleanup
+curl_global_trace
+curl_global_sslset
+curl_slist_append
+curl_slist_free_all
+curl_getdate
+curl_share_init
+curl_share_setopt
+curl_share_cleanup
+curl_version_info
+curl_easy_strerror
+curl_share_strerror
+curl_easy_pause
+curl_easy_ssls_import
+curl_easy_ssls_export
+curl_easy_init
+curl_easy_setopt
+curl_easy_perform
+curl_easy_cleanup
+curl_easy_getinfo
+curl_easy_duphandle
+curl_easy_reset
+curl_easy_recv
+curl_easy_send
+curl_easy_upkeep
+curl_easy_header
+curl_easy_nextheader
+curl_mprintf
+curl_mfprintf
+curl_msprintf
+curl_msnprintf
+curl_mvprintf
+curl_mvfprintf
+curl_mvsprintf
+curl_mvsnprintf
+curl_maprintf
+curl_mvaprintf
+curl_multi_init
+curl_multi_add_handle
+curl_multi_remove_handle
+curl_multi_fdset
+curl_multi_waitfds
+curl_multi_wait
+curl_multi_poll
+curl_multi_wakeup
+curl_multi_perform
+curl_multi_cleanup
+curl_multi_info_read
+curl_multi_strerror
+curl_multi_socket
+curl_multi_socket_action
+curl_multi_socket_all
+curl_multi_timeout
+curl_multi_setopt
+curl_multi_assign
+curl_multi_get_handles
+curl_pushheader_bynum
+curl_pushheader_byname
+curl_multi_waitfds
+curl_easy_option_by_name
+curl_easy_option_by_id
+curl_easy_option_next
+curl_url
+curl_url_cleanup
+curl_url_dup
+curl_url_get
+curl_url_set
+curl_url_strerror
+curl_ws_recv
+curl_ws_send
+curl_ws_meta
+libcurl-env
+libcurl-ws
diff --git a/.github/workflows/checkdocs.yml b/.github/workflows/checkdocs.yml

index 753883e8eb76af03db55b383c5eddc50d4270467..dd52efab146539d8cdf01a168b7e535bda78ef55 100644 (file)
--- a/.github/workflows/checkdocs.yml
+++ b/.github/workflows/checkdocs.yml
@@ -107,20 +107,8 @@ jobs:
            persist-credentials: false
          name: checkout
  
-      - name: trim all man page *.md files
-        run: find docs -name "*.md" ! -name "_*" -print0 | xargs -0 -n1 .github/scripts/cleancmd.pl
-
-      - name: trim libcurl man page *.md files
-        run: find docs/libcurl \( -name "curl_*.md" -o -name "libcurl*.md" \) -print0 | xargs -0 -n1 .github/scripts/cleanspell.pl
-
-      - name: trim libcurl option man page *.md files
-        run: find docs/libcurl/opts -name "CURL*.md" -print0 | xargs -0 -n1 .github/scripts/cleanspell.pl
-
-      - name: trim cmdline docs markdown _*.md files
-        run: find docs/cmdline-opts -name "_*.md" -print0 | xargs -0 -n1 .github/scripts/cleancmd.pl --no-header
-
-      - name: trim docs/ markdown _*.md files
-        run: git ls-files docs/*.md docs/internals/*.md | xargs -n1 .github/scripts/cleancmd.pl --no-header
+      - name: trim all *.md files in docs/
+        run: .github/scripts/cleancmd.pl $(find docs -name "*.md")
  
        - name: setup the custom wordlist
          run: grep -v '^#' .github/scripts/spellcheck.words >  wordlist.txt
author	Daniel Stenberg <daniel@haxx.se>
	Thu, 27 Feb 2025 10:17:42 +0000 (11:17 +0100)
committer	Daniel Stenberg <daniel@haxx.se>
	Thu, 27 Feb 2025 12:15:21 +0000 (13:15 +0100)
.github/scripts/cleancmd.pl		patch \| blob \| blame \| history
.github/scripts/cleanspell.pl	[deleted file]	patch \| blob \| blame \| history
.github/scripts/spellcheck.curl	[new file with mode: 0644]	patch \| blob
.github/workflows/checkdocs.yml		patch \| blob \| blame \| history