From: Bruno Haible
Date: Fri, 4 Jul 2025 02:00:23 +0000 (+0200)
Subject: xgettext: Warn when a message contains an URL or email address.
X-Git-Tag: v0.26~22
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=0c0bd837fafbf5da77fc77be41c7fec4f52cef28;p=thirdparty%2Fgettext.git

xgettext: Warn when a message contains an URL or email address.

Reported by Arsen Arsenović at .

* gettext-tools/src/xg-check.c: Include c-strstr.h.
(SIZEOF): New macro.
(string_has_url, message_has_url, string_has_email, message_has_email,
url_check_message): New functions.
(xgettext_check_message_list): Invoke url_check_message.
* gettext-tools/tests/xgettext-20: New file.
* gettext-tools/tests/Makefile.am (TESTS): Add it.
* NEWS: Mention the change.
---

diff --git a/NEWS b/NEWS
index 3a0be9fd0..e10776206 100644
--- a/NEWS
+++ b/NEWS
@@ -30,6 +30,8 @@ Version 0.26 - July 2025
     omitting from msgid a placeholder that is used in msgid_plural.  But
     when a placeholder is used in both msgid and msgid_plural, its type
     must be the same in both.
+  - xgettext now suggests a refactoring when a translatable string
+    contains an URL or email address.
 
 # Improvements for translators:
 * msggrep:
diff --git a/gettext-tools/src/xg-check.c b/gettext-tools/src/xg-check.c
index 079027737..c8e73b19c 100644
--- a/gettext-tools/src/xg-check.c
+++ b/gettext-tools/src/xg-check.c
@@ -34,6 +34,7 @@
 #include "if-error.h"
 #include "sentence.h"
 #include "c-ctype.h"
+#include "c-strstr.h"
 #include "unictype.h"
 #include "unistr.h"
 #include "quote.h"
@@ -41,6 +42,8 @@
 
 #define _(str) gettext (str)
 
+#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
+
 
 /* Function that implements a single syntax check.
    MP is a message.
@@ -416,6 +419,179 @@ format_check_message (const message_ty *mp)
 }
 
 
+/* Determine whether a string (msgid or msgid_plural) contains a URL.
+   A URL is recognized as one of the scheme prefixes listed below that
+   starts at a word boundary and is followed by at least one character
+   other than whitespace, '<', '>', '"'.  A "mailto:" URL must additionally
+   contain a '@' after the prefix.  */
+static bool
+string_has_url (const char *string)
+{
+  /* Test for the common pattern of URLs that reside on the internet
+     (not "file:").  */
+  static const char *patterns[] =
+  {
+    "mailto:",
+    "http://", "https://",
+    "ftp://",
+    "irc://", "ircs://"
+  };
+  size_t i;
+
+  for (i = 0; i < SIZEOF (patterns); i++)
+    {
+      const char *pattern = patterns[i];
+      /* msgid and msgid_plural are typically entirely ASCII.  Therefore here
+         it's OK to use the <c-ctype.h> functions; no need for UTF-8 aware
+         <unictype.h> functions.  */
+      const char *string_tail;
+      for (string_tail = string;;)
+        {
+          const char *found = c_strstr (string_tail, pattern);
+          if (found == NULL)
+            break;
+          /* Test whether the pattern starts at a word boundary.  The
+             beginning of the whole string counts as a word boundary; the
+             position where this search iteration resumed does not.  */
+          if (found == string || !(c_isalnum (found[-1]) || found[-1] == '_'))
+            {
+              /* Find the end of the URL.  */
+              const char *found_end = found + strlen (pattern);
+              const char *p = found_end;
+              while (*p != '\0'
+                     && !(c_isspace (*p) || *p == '<' || *p == '>' || *p == '"'))
+                p++;
+              if (p > found_end)
+                {
+                  /* Here *p == '\0' or
+                     (c_isspace (*p) || *p == '<' || *p == '>' || *p == '"').
+                     This implies !(c_isalnum (*p) || *p == '_').  */
+                  /* In case of a "mailto" URL, test for a '@' in the part
+                     after the scheme prefix, that is, in the range from
+                     found_end (inclusive) to p (exclusive).  */
+                  if (!(i == 0) || memchr (found_end, '@', p - found_end) != NULL)
+                    {
+                      /* Yes, it looks like a URL.  */
+                      return true;
+                    }
+                }
+            }
+          string_tail = found + 1;
+        }
+    }
+
+  return false;
+}
+
+/* Determine whether a message contains a URL.  */
+static bool
+message_has_url (const message_ty *mp)
+{
+  return string_has_url (mp->msgid)
+         || (mp->msgid_plural != NULL && string_has_url (mp->msgid_plural));
+}
+
+
+/* Determine whether a string (msgid or msgid_plural) contains an
+   email address, that is, a LOCALPART@DOMAIN where both parts are
+   non-empty, neither part starts or ends with a dot, and DOMAIN contains
+   a dot and at least 2 characters follow its last dot.  */
+static bool
+string_has_email (const char *string)
+{
+  const char *string_tail;
+  for (string_tail = string;;)
+    {
+      /* An email address consists of LOCALPART@DOMAIN.  */
+      const char *at = strchr (string_tail, '@');
+      if (at == NULL)
+        break;
+      /* Find the start of the email address.  */
+      const char *start;
+      {
+        const char *p = at;
+        while (p > string)
+          {
+            char c = p[-1];
+            if (!(c_isalnum (c)
+                  || c == '!' || c == '#' || c == '$' || c == '%' || c == '&'
+                  || c == '\'' || c == '*' || c == '+' || c == '-' || c == '.'
+                  || c == '/' || c == '=' || c == '?' || c == '^' || c == '_'
+                  || c == '`' || c == '{' || c == '|' || c == '}' || c == '~'))
+              break;
+            /* Consecutive dots not allowed.  */
+            if (c == '.' && p[0] == '.')
+              break;
+            p--;
+          }
+        start = p;
+      }
+      if (start < at && start[0] != '.' && at[-1] != '.')
+        {
+          /* Find the end of the email address.  */
+          const char *end;
+          const char *last_dot_in_domain = NULL;
+          {
+            const char *p = at + 1;
+            while (*p != '\0')
+              {
+                char c = *p;
+                if (!(c_isalnum (c) || c == '-' || c == '.'))
+                  break;
+                /* Consecutive dots not allowed.  */
+                if (c == '.' && p[-1] == '.')
+                  break;
+                if (c == '.')
+                  last_dot_in_domain = p;
+                p++;
+              }
+            end = p;
+          }
+          if (at + 1 < end && at[1] != '.' && end[-1] != '.'
+              /* The domain should contain a dot.  */
+              && last_dot_in_domain != NULL
+              /* We can't enumerate all the possible top-level domains, but at
+                 least we know that they are all 2 or more characters long.  */
+              && end - (last_dot_in_domain + 1) >= 2)
+            {
+              /* Yes, it looks like an email address.  */
+              return true;
+            }
+        }
+      string_tail = at + 1;
+    }
+
+  return false;
+}
+
+/* Determine whether a message contains an email address.  */
+static bool
+message_has_email (const message_ty *mp)
+{
+  return string_has_email (mp->msgid)
+         || (mp->msgid_plural != NULL && string_has_email (mp->msgid_plural));
+}
+
+
+/* Perform the URL check on a non-obsolete message.
+   At most one warning is emitted per message: the URL warning takes
+   precedence over the email address warning.  */
+static void
+url_check_message (const message_ty *mp)
+{
+  if (message_has_url (mp))
+    if_error (IF_SEVERITY_WARNING,
+              mp->pos.file_name, mp->pos.line_number, (size_t)(-1), false,
+              _("Message contains an embedded URL.  Better move it out of the translatable string, see %s"),
+              "https://www.gnu.org/software/gettext/manual/html_node/No-embedded-URLs.html");
+  else if (message_has_email (mp))
+    if_error (IF_SEVERITY_WARNING,
+              mp->pos.file_name, mp->pos.line_number, (size_t)(-1), false,
+              _("Message contains an embedded email address.  Better move it out of the translatable string, see %s"),
+              "https://www.gnu.org/software/gettext/manual/html_node/No-embedded-URLs.html");
+}
+
+
 /* Perform all checks on a message list.
    Return the number of errors that were seen.  */
 int
@@ -429,7 +605,10 @@ xgettext_check_message_list (message_list_ty *mlp)
       message_ty *mp = mlp->item[j];
 
       if (!is_header (mp))
-        seen_errors += syntax_check_message (mp) + format_check_message (mp);
+        {
+          seen_errors += syntax_check_message (mp) + format_check_message (mp);
+          url_check_message (mp);
+        }
     }
 
   return seen_errors;
diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am
index 06967ae3c..71429d655 100644
--- a/gettext-tools/tests/Makefile.am
+++ b/gettext-tools/tests/Makefile.am
@@ -85,7 +85,7 @@ TESTS = gettext-1 gettext-2 \
 	xgettext-2 xgettext-3 xgettext-4 xgettext-5 xgettext-6 \
 	xgettext-7 xgettext-8 xgettext-9 xgettext-10 xgettext-11 xgettext-12 \
 	xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \
-	xgettext-18 xgettext-19 \
+	xgettext-18 xgettext-19 xgettext-20 \
 	xgettext-combine-1 xgettext-combine-2 xgettext-combine-3 \
 	xgettext-git-1 \
 	xgettext-appdata-1 xgettext-appdata-2 xgettext-appdata-3 \
diff --git a/gettext-tools/tests/xgettext-20 b/gettext-tools/tests/xgettext-20
new file mode 100755
index 000000000..683e462b7
--- /dev/null
+++ b/gettext-tools/tests/xgettext-20
@@ -0,0 +1,24 @@
+#!/bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+set -x
+# Test checking for URL.
+#
+
+cat <<\EOF > xg-test20.c
+  gettext ("Using the browser to open a mailto: URI");
+  gettext ("Report bugs to <mailto:bug-foobar@gnu.org>");
+  gettext ("Report bugs to: bug-foobar@gnu.org");
+  gettext ("Report bugs in the bug tracker at <https://savannah.gnu.org/projects/foobar>");
+EOF
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --omit-header --add-comments -d xg-test20.tmp xg-test20.c 2>xg-test20.err \
+  || Exit 1
+
+if grep "xg-test20.c:1:.*No-embedded-URLs.html" xg-test20.err; then
+  Exit 1
+fi
+
+grep "xg-test20.c:2:.*No-embedded-URLs.html" xg-test20.err || Exit 1
+grep "xg-test20.c:3:.*No-embedded-URLs.html" xg-test20.err || Exit 1
+grep "xg-test20.c:4:.*No-embedded-URLs.html" xg-test20.err || Exit 1