]> git.ipfire.org Git - thirdparty/binutils-gdb.git/blob - gdb/contrib/words.sh
Automatic Copyright Year update after running gdb/copyright.py
[thirdparty/binutils-gdb.git] / gdb / contrib / words.sh
1 #!/bin/sh
2
3 # Copyright (C) 2019-2022 Free Software Foundation, Inc.
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 # This script intends to facilitate spell checking of source/doc files.
18 # It:
19 # - transforms the files into a list of lowercase words
20 # - prefixes each word with the frequency
21 # - filters out words within a frequency range
22 # - sorts the words, longest first
23 #
24 # If '-c' is passed as option, it operates on the C comments only, rather than
25 # on the entire file.
26 #
27 # For:
28 # ...
29 # $ files=$(find gdb -type f -name "*.c" -o -name "*.h")
30 # $ ./gdb/contrib/words.sh -c $files
31 # ...
32 # it generates a list of ~15000 words prefixed with frequency.
33 #
34 # This could be used to generate a dictionary that is kept as part of the
35 # sources, against which new code can be checked, generating a warning or
36 # error. The hope is that misspellings would trigger this frequently, and rare
37 # words rarely, otherwise the burden of updating the dictionary would be too
38 # much.
39 #
40 # And for:
41 # ...
42 # $ files=$(find gdb -type f -name "*.c" -o -name "*.h")
43 # $ ./gdb/contrib/words.sh -c -f 1 $files
44 # ...
45 # it generates a list of ~5000 words with frequency 1.
46 #
47 # This can be used to scan for misspellings manually.
48 #
49
# Frequency bounds for the final filter; 0 means "no bound".
minfreq=
maxfreq=
# Whether to operate on C comments only (-c) rather than whole files.
c=false

# Parse leading options; everything after them is the list of input files,
# left in "$@" for the pipeline below.
while [ $# -gt 0 ]; do
    case "$1" in
	-c)
	    c=true
	    shift
	    ;;
	--freq|-f)
	    # Guard against a missing argument: without this check,
	    # "shift 2" fails but does not shift, and the loop spins forever.
	    if [ $# -lt 2 ]; then
		echo "$0: option $1 requires an argument" >&2
		exit 1
	    fi
	    # Exact frequency: use the same value for both bounds.
	    minfreq=$2
	    maxfreq=$2
	    shift 2
	    ;;
	--min)
	    if [ $# -lt 2 ]; then
		echo "$0: option $1 requires an argument" >&2
		exit 1
	    fi
	    minfreq=$2
	    # Leave the other bound disabled unless the user set it.
	    if [ "$maxfreq" = "" ]; then
		maxfreq=0
	    fi
	    shift 2
	    ;;
	--max)
	    if [ $# -lt 2 ]; then
		echo "$0: option $1 requires an argument" >&2
		exit 1
	    fi
	    maxfreq=$2
	    if [ "$minfreq" = "" ]; then
		minfreq=0
	    fi
	    shift 2
	    ;;
	*)
	    # First non-option argument: stop, the rest are file names.
	    break
	    ;;
    esac
done

# Default: no frequency filtering at all.
if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
    minfreq=0
    maxfreq=0
fi
88
# Generate a temporary awk script that prints only the contents of
# C-style /* ... */ comments found in its input.  Check mktemp: if it
# fails, $awkfile would be empty and "cat > ..." would misbehave.
awkfile=$(mktemp) || { echo "$0: mktemp failed" >&2; exit 1; }
trap 'rm -f "$awkfile"' EXIT

# Quoted delimiter ('EOF'): the heredoc is taken literally, so awk's $0
# and the regexes need no shell escaping.
cat > "$awkfile" <<'EOF'
BEGIN {
    in_comment=0
}

# Buffer the current input line for editing.
// {
    line=$0
}

# Comment opener: keep only what follows the (last) "/*".
/\/\*/ {
    in_comment=1
    sub(/.*\/\*/, "", line)
}

# Comment closer: keep only what precedes the "*/", emit it, and move on.
# NOTE(review): a line with code between two comments ("*/ code /*") loses
# the code but also drops comment text after the first "*/" -- known
# limitation of this line-based approach.
/\*\// {
    sub(/\*\/.*/, "", line)
    in_comment=0
    print line
    next
}

# Lines wholly inside a multi-line comment are printed as-is.
// {
    if (in_comment) {
	print line
    }
}
EOF
119
# Byte-wise collation: makes sort/uniq output reproducible across locales.
export LC_ALL=C

# Pipeline stages:
#   1. input: comment contents only (-c) or the files verbatim;
#   2. sed: split on punctuation/digits/whitespace, one token per line;
#   3. tr: fold to lowercase;
#   4. sort | uniq -c: prefix each unique word with its frequency;
#   5. awk: keep words within [minfreq, maxfreq] (0 disables a bound);
#      the bounds are passed with -v rather than interpolated into the
#      program text, so an empty value cannot become an awk syntax error;
#   6. sort longest-line-first, then strip the length prefix again.
if $c; then
    awk \
	-f "$awkfile" \
	-- "$@"
else
    # '--' guards against file names starting with a dash.
    cat -- "$@"
fi \
    | sed \
	  -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
	  -e 's/\[/\n/g' \
	  -e 's/\]/\n/g' \
	  -e "s/'/\n/g" \
	  -e 's/[0-9][0-9]*/\n/g' \
	  -e 's/[ \t]*//g' \
    | tr '[:upper:]' '[:lower:]' \
    | sort \
    | uniq -c \
    | awk -v minfreq="$minfreq" -v maxfreq="$maxfreq" \
	  '{ if ((minfreq == 0 || minfreq <= $1) &&
		 (maxfreq == 0 || $1 <= maxfreq)) {
	       print $0
	     }
	   }' \
    | awk '{ print length($0) " " $0; }' \
    | sort -n -r \
    | cut -d ' ' -f 2-