[thirdparty/git.git] / ci / check-directional-formatting.bash

#!/bin/bash

# This script verifies that the non-binary files tracked in the Git index do
# not contain any Unicode directional formatting: such formatting could be used
# to deceive reviewers into interpreting code differently from the compiler.
# This is intended to run on an Ubuntu agent in a GitHub workflow.
#
# To allow translated messages to introduce such directional formatting in the
# future, we exclude the `.po` files from this validation.
#
# Neither GNU grep nor `git grep` (not even with `-P`) handle `\u` as a way to
# specify UTF-8.
#
# To work around that, we use `printf` to produce the pattern as a byte
# sequence, and then feed that to `git grep` as a byte sequence (setting
# `LC_CTYPE` to make sure that the arguments are interpreted as intended).
#
# Note: we need to use Bash here because its `printf` interprets `\uNNNN` as
# UTF-8 code points, as desired. Running this script through Ubuntu's `dash`,
# for example, would use a `printf` that does not understand that syntax.

# U+202a..U+202e: LRE, RLE, PDF, LRO and RLO
# U+2066..U+2069: LRI, RLI, FSI and PDI
regex='(\u202a|\u202b|\u202c|\u202d|\u202e|\u2066|\u2067|\u2068|\u2069)'

# Search the files selected by the pathspecs below (binary-attributed files
# and `*.po` files are excluded) for any of the code points listed in `$regex`,
# printing the names of the offending files to stdout.
#
# Returns:
#   0   no file contains directional formatting
#   1   at least one file does (its name has been printed)
#   >1  the check could not be performed: either the pattern could not be
#       produced, or `git grep` itself failed; a diagnostic goes to stderr
#
# A plain `! git grep ...` would turn *every* non-zero status into success,
# including genuine failures of `git grep`, so the status is mapped explicitly.
check_directional_formatting () {
	local pattern status=0

	# The variable is deliberately used as the format string: `printf` has
	# to expand the `\uNNNN` escapes, and `$regex` is a constant of ours.
	pattern=$(LC_CTYPE=C.UTF-8 printf "$regex") || {
		echo "error: could not generate the search pattern" >&2
		return 2
	}

	# The expanded pattern consists of parentheses, bars and raw UTF-8
	# bytes only. A surviving backslash means that the escapes were not
	# expanded (e.g. because the requested locale is unavailable), in
	# which case the search below would be meaningless.
	case "$pattern" in
	*\\*)
		echo "error: printf did not expand the \\u escapes" >&2
		return 2
		;;
	esac

	LC_CTYPE=C git grep -El "$pattern" \
		-- ':(exclude,attr:binary)' ':(exclude)*.po' || status=$?

	case "$status" in
	0)
		# A match was found: the validation fails.
		return 1
		;;
	1)
		# Nothing matched: the validation passes.
		return 0
		;;
	*)
		echo "error: git grep failed with exit status $status" >&2
		return "$status"
		;;
	esac
}

# This must remain the last command: its status is the script's exit status.
check_directional_formatting
Commit	Line	Data
0e7696c6 JS	1	#!/bin/bash
	2
	3	# This script verifies that the non-binary files tracked in the Git index do
	4	# not contain any Unicode directional formatting: such formatting could be used
	5	# to deceive reviewers into interpreting code differently from the compiler.
	6	# This is intended to run on an Ubuntu agent in a GitHub workflow.
	7	#
	8	# To allow translated messages to introduce such directional formatting in the
	9	# future, we exclude the `.po` files from this validation.
	10	#
	11	# Neither GNU grep nor `git grep` (not even with `-P`) handle `\u` as a way to
	12	# specify UTF-8.
	13	#
	14	# To work around that, we use `printf` to produce the pattern as a byte
	15	# sequence, and then feed that to `git grep` as a byte sequence (setting
	16	# `LC_CTYPE` to make sure that the arguments are interpreted as intended).
	17	#
	18	# Note: we need to use Bash here because its `printf` interprets `\uNNNN` as
	19	# UTF-8 code points, as desired. Running this script through Ubuntu's `dash`,
	20	# for example, would use a `printf` that does not understand that syntax.
	21
	22	# U+202a..U+202e: LRE, RLE, PDF, LRO and RLO
	23	# U+2066..U+2069: LRI, RLI, FSI and PDI
	24	regex='(\u202a|\u202b|\u202c|\u202d|\u202e|\u2066|\u2067|\u2068|\u2069)'
	25
	26	! LC_CTYPE=C git grep -El "$(LC_CTYPE=C.UTF-8 printf "$regex")" \
	27	-- ':(exclude,attr:binary)' ':(exclude)*.po'