From 7007f59caaf5e743b9e15e44126c621b8f589848 Mon Sep 17 00:00:00 2001
From: Daniel Stenberg <daniel@haxx.se>
Date: Thu, 27 Feb 2025 11:17:42 +0100
Subject: [PATCH] GHA: spellcheck curl symbols better

This now makes sure to trim off exact matches for curl symbols and long
curl command line options instead of using pattern matching as before.
This should catch typoed names (that still follow the pattern) better.

The cleanspell.pl script is no longer used. cleancmd.pl is used for all
markdown files.

Closes #16504
---
 .github/scripts/cleancmd.pl     | 134 ++++++++++++++++++++--------
 .github/scripts/cleanspell.pl   |  86 ------------------
 .github/scripts/spellcheck.curl | 151 ++++++++++++++++++++++++++++++++
 .github/workflows/checkdocs.yml |  16 +---
 4 files changed, 251 insertions(+), 136 deletions(-)
 delete mode 100755 .github/scripts/cleanspell.pl
 create mode 100644 .github/scripts/spellcheck.curl
diff --git a/.github/scripts/cleancmd.pl b/.github/scripts/cleancmd.pl
index 355a845e4c..283a9f4e93 100755
--- a/.github/scripts/cleancmd.pl
+++ b/.github/scripts/cleancmd.pl
@@ -3,55 +3,117 @@
 #
 # SPDX-License-Identifier: curl
 #
-# Input: a cmdline docs markdown, it gets modified *in place*
+# Input: cmdline docs markdown files, they get modified *in place*
 #
-# The main purpose is to strip off the leading meta-data part, but also to
-# clean up whatever else the spell checker might have a problem with that we
-# still deem is fine.
-
-my $header = 1;
-while(1) {
-    # set this if the markdown has no meta-data header to skip
-    if($ARGV[0] eq "--no-header") {
-        shift @ARGV;
-        $header = 0;
+# Strip off the leading meta-data/header part, remove all known curl symbols
+# and long command line options. Also clean up whatever else the spell checker
+# might have a problem with that we still deem is fine.
+#
+
+open(S, "<./docs/libcurl/symbols-in-versions")
+    || die "can't find symbols-in-versions";
+while(<S>) {
+    if(/^([^ ]*) /) {
+        push @asyms, $1;
+    }
+}
+close(S);
+
+# init the opts table with "special" options not easy to figure out
+my @aopts = (
+    '--ftp-ssl-reqd', # old alias
+    );
+
+open(O, "<./docs/options-in-versions")
+    || die "can't find options-in-versions";
+while(<O>) {
+    chomp;
+    if(/^([^ ]+)/) {
+        my $o = $1;
+        push @aopts, $o;
+        if($o =~ /^--no-(.*)/) {
+            # for the --no options, also make one without it
+            push @aopts, "--$1";
+        }
+        elsif($o =~ /^--disable-(.*)/) {
+            # for the --disable options, also make the special ones
+            push @aopts, "--$1";
+            push @aopts, "--no-$1";
+        }
+    }
+}
+close(O);
+
+open(C, "<./.github/scripts/spellcheck.curl")
+    || die "can't find spellcheck.curl";
+while(<C>) {
+    if(/^\#/) {
+        next;
     }
-    else {
-        last;
+    chomp;
+    if(/^([^ ]+)/) {
+        push @asyms, $1;
     }
 }
+close(C);
 
-my $f = $ARGV[0];
+# longest symbols first
+my @syms = sort { length($b) <=> length($a) } @asyms;
 
-open(F, "<$f") or die;
+# longest cmdline options first
+my @opts = sort { length($b) <=> length($a) } @aopts;
 
-my $ignore = $header;
-my $sepcount = 0;
-my @out;
-while(<F>) {
-    if(/^---/ && $header) {
-        if(++$sepcount == 2) {
+sub process {
+    my ($f) = @_;
+
+    my $ignore = 0;
+    my $sepcount = 0;
+    my $out;
+    my $line = 0;
+    open(F, "<$f") or die;
+
+    while(<F>) {
+        $line++;
+        if(/^---/ && ($line == 1)) {
+            $ignore = 1;
+            next;
+        }
+        elsif(/^---/ && $ignore) {
             $ignore = 0;
+            next;
         }
-        next;
-    }
-    next if($ignore);
+        next if($ignore);
+
+        my $l = $_;
 
-    # strip out backticked words
-    $_ =~ s/`[^`]+`//g;
+        # strip out backticked words
+        $l =~ s/`[^`]+`//g;
 
-    # strip out all long command line options
-    $_ =~ s/--[a-z0-9-]+//g;
+        # **bold**
+        $l =~ s/\*\*(\S.*?)\*\*//g;
+        # *italics*
+        $l =~ s/\*(\S.*?)\*//g;
 
-    # strip out https URLs, we don't want them spellchecked
-    $_ =~ s!https://[a-z0-9\#_/.-]+!!gi;
+        # strip out https URLs, we don't want them spellchecked
+        $l =~ s!https://[a-z0-9\#_/.-]+!!gi;
 
-    push @out, $_;
+        $out .= $l;
+    }
+    close(F);
+
+    # cut out all known curl cmdline options, matched literally (\Q..\E)
+    $out =~ s/\Q$_\E//g for @opts;
+
+    # cut out all known curl symbols, literal whole-word matches only
+    $out =~ s/\b\Q$_\E\b//g for @syms;
+
+    if(!$ignore) {
+        open(O, ">$f") or die;
+        print O $out;
+        close(O);
+    }
 }
-close(F);
 
-if(!$ignore) {
-    open(O, ">$f") or die;
-    print O @out;
-    close(O);
+for my $f (@ARGV) {
+    process($f);
 }
diff --git a/.github/scripts/cleanspell.pl b/.github/scripts/cleanspell.pl
deleted file mode 100755
index bfa07dc053..0000000000
--- a/.github/scripts/cleanspell.pl
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env perl
-# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
-#
-# SPDX-License-Identifier: curl
-#
-# Given: a libcurl curldown man page
-# Outputs: the same file, minus the SYNOPSIS and the EXAMPLE sections
-#
-
-my $f = $ARGV[0];
-
-open(F, "<$f") or die;
-
-my @out;
-my $ignore = 0;
-while(<F>) {
-    if($_ =~ /^# (SYNOPSIS|EXAMPLE)/) {
-        $ignore = 1;
-    }
-    elsif($ignore && ($_ =~ /^# [A-Z]/)) {
-        $ignore = 0;
-    }
-    elsif(!$ignore) {
-        # **bold**
-        $_ =~ s/\*\*(\S.*?)\*\*//g;
-        # *italics*
-        $_ =~ s/\*(\S.*?)\*//g;
-
-        $_ =~ s/CURL(M|SH|U|H)code//g;
-        $_ =~ s/CURL_[A-Z0-9_]*//g;
-        $_ =~ s/CURLALTSVC_[A-Z0-9_]*//g;
-        $_ =~ s/CURLAUTH_[A-Z0-9_]*//g;
-        $_ =~ s/CURLE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFORM_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTP_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTPAUTH_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTPMETHOD_[A-Z0-9_]*//g;
-        $_ =~ s/CURLFTPSSL_[A-Z0-9_]*//g;
-        $_ =~ s/CURLGSSAPI_[A-Z0-9_]*//g;
-        $_ =~ s/CURLHEADER_[A-Z0-9_]*//g;
-        $_ =~ s/CURLINFO_[A-Z0-9_]*//g;
-        $_ =~ s/CURLM_[A-Z0-9_]*//g;
-        $_ =~ s/CURLMIMEOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLMOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPIPE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPROTO_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPROXY_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPX_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSHE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSHOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSSLOPT_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSSH_[A-Z0-9_]*//g;
-        $_ =~ s/CURLSSLBACKEND_[A-Z0-9_]*//g;
-        $_ =~ s/CURLU_[A-Z0-9_]*//g;
-        $_ =~ s/CURLUPART_[A-Z0-9_]*//g;
-        #$_ =~ s/\bCURLU\b//g; # stand-alone CURLU
-        $_ =~ s/CURLUE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLHE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLWS_[A-Z0-9_]*//g;
-        $_ =~ s/CURLKH[A-Z0-9_]*//g;
-        $_ =~ s/CURLUPART_[A-Z0-9_]*//g;
-        $_ =~ s/CURLUSESSL_[A-Z0-9_]*//g;
-        $_ =~ s/CURLPAUSE_[A-Z0-9_]*//g;
-        $_ =~ s/CURLHSTS_[A-Z0-9_]*//g;
-        $_ =~ s/curl_global_([a-z_]*)//g;
-        $_ =~ s/curl_(strequal|strnequal|formadd|waitfd|formget|getdate|formfree)//g;
-        $_ =~ s/curl_easy_([a-z]*)//g;
-        $_ =~ s/curl_multi_([a-z_]*)//g;
-        $_ =~ s/curl_mime_(subparts|addpart|filedata|data_cb)//g;
-        $_ =~ s/curl_ws_(send|recv|meta)//g;
-        $_ =~ s/curl_url_(dup)//g;
-        $_ =~ s/curl_pushheader_by(name|num)//g;
-        $_ =~ s/libcurl-(env|ws)//g;
-        $_ =~ s/libcurl\\-(env|ws)//g;
-        $_ =~ s/(^|\W)((tftp|https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&'()*+,;=\\]+)//gi;
-        push @out, $_;
-    }
-}
-close(F);
-
-open(O, ">$f") or die;
-for my $l (@out) {
-    print O $l;
-}
-close(O);
diff --git a/.github/scripts/spellcheck.curl b/.github/scripts/spellcheck.curl
new file mode 100644
index 0000000000..4de9d86596
--- /dev/null
+++ b/.github/scripts/spellcheck.curl
@@ -0,0 +1,151 @@
+# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
+#
+# SPDX-License-Identifier: curl
+#
+# common variable types + structs
+# callback typedefs
+# public functions names
+# some man page names
+curl_fileinfo
+curl_forms
+curl_hstsentry
+curl_httppost
+curl_index
+curl_khkey
+curl_pushheaders
+curl_waitfd
+CURLcode
+CURLformoption
+CURLHcode
+CURLMcode
+CURLMsg
+CURLSHcode
+CURLUcode
+curl_calloc_callback
+curl_chunk_bgn_callback
+curl_chunk_end_callback
+curl_conv_callback
+curl_debug_callback
+curl_fnmatch_callback
+curl_formget_callback
+curl_free_callback
+curl_hstsread_callback
+curl_hstswrite_callback
+curl_ioctl_callback
+curl_malloc_callback
+curl_multi_timer_callback
+curl_opensocket_callback
+curl_prereq_callback
+curl_progress_callback
+curl_push_callback
+curl_read_callback
+curl_realloc_callback
+curl_resolver_start_callback
+curl_seek_callback
+curl_socket_callback
+curl_sockopt_callback
+curl_ssl_ctx_callback
+curl_strdup_callback
+curl_trailer_callback
+curl_write_callback
+curl_xferinfo_callback
+curl_strequal
+curl_strnequal
+curl_mime_init
+curl_mime_free
+curl_mime_addpart
+curl_mime_name
+curl_mime_filename
+curl_mime_type
+curl_mime_encoder
+curl_mime_data
+curl_mime_filedata
+curl_mime_data_cb
+curl_mime_subparts
+curl_mime_headers
+curl_formadd
+curl_formget
+curl_formfree
+curl_getdate
+curl_getenv
+curl_version
+curl_easy_escape
+curl_escape
+curl_easy_unescape
+curl_unescape
+curl_free
+curl_global_init
+curl_global_init_mem
+curl_global_cleanup
+curl_global_trace
+curl_global_sslset
+curl_slist_append
+curl_slist_free_all
+curl_getdate
+curl_share_init
+curl_share_setopt
+curl_share_cleanup
+curl_version_info
+curl_easy_strerror
+curl_share_strerror
+curl_easy_pause
+curl_easy_ssls_import
+curl_easy_ssls_export
+curl_easy_init
+curl_easy_setopt
+curl_easy_perform
+curl_easy_cleanup
+curl_easy_getinfo
+curl_easy_duphandle
+curl_easy_reset
+curl_easy_recv
+curl_easy_send
+curl_easy_upkeep
+curl_easy_header
+curl_easy_nextheader
+curl_mprintf
+curl_mfprintf
+curl_msprintf
+curl_msnprintf
+curl_mvprintf
+curl_mvfprintf
+curl_mvsprintf
+curl_mvsnprintf
+curl_maprintf
+curl_mvaprintf
+curl_multi_init
+curl_multi_add_handle
+curl_multi_remove_handle
+curl_multi_fdset
+curl_multi_waitfds
+curl_multi_wait
+curl_multi_poll
+curl_multi_wakeup
+curl_multi_perform
+curl_multi_cleanup
+curl_multi_info_read
+curl_multi_strerror
+curl_multi_socket
+curl_multi_socket_action
+curl_multi_socket_all
+curl_multi_timeout
+curl_multi_setopt
+curl_multi_assign
+curl_multi_get_handles
+curl_pushheader_bynum
+curl_pushheader_byname
+curl_multi_waitfds
+curl_easy_option_by_name
+curl_easy_option_by_id
+curl_easy_option_next
+curl_url
+curl_url_cleanup
+curl_url_dup
+curl_url_get
+curl_url_set
+curl_url_strerror
+curl_ws_recv
+curl_ws_send
+curl_ws_meta
+libcurl-env
+libcurl-ws
diff --git a/.github/workflows/checkdocs.yml b/.github/workflows/checkdocs.yml
index 753883e8eb..dd52efab14 100644
--- a/.github/workflows/checkdocs.yml
+++ b/.github/workflows/checkdocs.yml
@@ -107,20 +107,8 @@ jobs:
           persist-credentials: false
         name: checkout
 
-      - name: trim all man page *.md files
-        run: find docs -name "*.md" ! -name "_*" -print0 | xargs -0 -n1 .github/scripts/cleancmd.pl
-
-      - name: trim libcurl man page *.md files
-        run: find docs/libcurl \( -name "curl_*.md" -o -name "libcurl*.md" \) -print0 | xargs -0 -n1 .github/scripts/cleanspell.pl
-
-      - name: trim libcurl option man page *.md files
-        run: find docs/libcurl/opts -name "CURL*.md" -print0 | xargs -0 -n1 .github/scripts/cleanspell.pl
-
-      - name: trim cmdline docs markdown _*.md files
-        run: find docs/cmdline-opts -name "_*.md" -print0 | xargs -0 -n1 .github/scripts/cleancmd.pl --no-header
-
-      - name: trim docs/ markdown _*.md files
-        run: git ls-files docs/*.md docs/internals/*.md | xargs -n1 .github/scripts/cleancmd.pl --no-header
+      - name: trim all *.md files in docs/
+        run: find docs -name "*.md" -print0 | xargs -0 .github/scripts/cleancmd.pl
 
       - name: setup the custom wordlist
         run: grep -v '^#' .github/scripts/spellcheck.words >  wordlist.txt
-- 
2.47.3