]> git.ipfire.org Git - thirdparty/git.git/blame - t/lib-unicode-nfc-nfd.sh
The third batch
[thirdparty/git.git] / t / lib-unicode-nfc-nfd.sh
CommitLineData
00991e10
JH
1# Help detect how Unicode NFC and NFD are handled on the filesystem.
2
# A simple character that has an NFD form.
#
# NFC:       U+00e9 LATIN SMALL LETTER E WITH ACUTE
# UTF8(NFC): \xc3 \xa9
#
# NFD:       U+0065 LATIN SMALL LETTER E
#            U+0301 COMBINING ACUTE ACCENT
# UTF8(NFD): \x65 + \xcc \x81
#
# The bytes are written as octal escapes because POSIX printf only
# defines "\ddd" octal escapes in the format string; "\xHH" is an
# extension that some shells (such as dash) print literally.
#
utf8_nfc=$(printf '\303\251')
utf8_nfd=$(printf '\145\314\201')
14
# Is the OS or the filesystem "Unicode composition sensitive"?
#
# In other words: can two entries whose names differ only in being
# spelled in NFC versus NFD exist side by side, or does the OS/FS
# treat the two spellings as the same name?
#
# The answer may be independent of the filesystem type, because the
# aliasing can be done by the OS at a layer above the FS.  For
# example, testing on MacOS shows a collision on APFS, HFS+, and
# FAT32 alike.
#
# This prereq only tells us whether the two spellings "collide"; it
# says nothing about how the pathname is actually spelled on disk.
# The on-disk spelling is examined by a later prereq.
#
# The prereq is satisfied when both mkdir calls succeed.
#
test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '
	mkdir "trial_${utf8_nfc}" &&
	mkdir "trial_${utf8_nfd}"
'
34
# Is the spelling of an NFC pathname preserved on disk?
#
# On MacOS, HFS+ and FAT32 convert NFC paths into NFD, whereas APFS
# keeps NFC paths as they were given.  As established above, this
# is independent of "composition sensitivity".
#
# The directory listing is hex-dumped and searched for the bytes of
# "c_" followed by the NFC encoding (c3 a9).
#
test_lazy_prereq UNICODE_NFC_PRESERVED '
	mkdir "c_${utf8_nfc}" &&
	ls | test-tool hexdump >dump &&
	grep "63 5f c3 a9" dump
'
46
# Is the spelling of an NFD pathname preserved on disk?
#
# The directory listing is hex-dumped and searched for the bytes of
# "d_" followed by the NFD encoding (65 cc 81).
#
test_lazy_prereq UNICODE_NFD_PRESERVED '
	mkdir "d_${utf8_nfd}" &&
	ls | test-tool hexdump >dump &&
	grep "64 5f 65 cc 81" dump
'
54
# The following _DOUBLE_ forms are more for my curiosity,
# but there may be quirks lurking when there are multiple
# combining characters in non-canonical order.

# Unicode also allows multiple combining characters
# that can be decomposed in pieces.
#
# NFC:        U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
# UTF8(NFC):  \xe1 \xbd \xa7
#
# NFD1:       U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA
#             U+0342 COMBINING GREEK PERISPOMENI
# UTF8(NFD1): \xe1 \xbd \xa1 + \xcd \x82
#
# But U+1f61 decomposes into
# NFD2:       U+03c9 GREEK SMALL LETTER OMEGA
#             U+0314 COMBINING REVERSED COMMA ABOVE
# UTF8(NFD2): \xcf \x89 + \xcc \x94
#
# Yielding:   \xcf \x89 + \xcc \x94 + \xcd \x82
#
# Note that I've used the canonical ordering of the
# combining characters.  It is also possible to
# swap them.  My testing shows that that non-standard
# ordering also causes a collision in mkdir.  However,
# the resulting names don't draw correctly on the
# terminal (implying that the on-disk format also has
# them out of order).
#
# The bytes are written as octal escapes because POSIX printf only
# defines "\ddd" octal escapes in the format string; "\xHH" is an
# extension that some shells (such as dash) print literally.
#
greek_nfc=$(printf '\341\275\247')
greek_nfd1=$(printf '\341\275\241\315\202')
greek_nfd2=$(printf '\317\211\314\224\315\202')
87
# See if a double decomposition also collides.
#
# The prereq is satisfied when both the NFC name and the fully
# decomposed (NFD2) name can be created next to each other.
#
test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '
	mkdir "trial_${greek_nfc}" &&
	mkdir "trial_${greek_nfd2}"
'
94
# See if the NFC spelling appears on the disk.
#
# The directory listing is hex-dumped and searched for the bytes of
# "c_" followed by the NFC encoding (e1 bd a7).
#
test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '
	mkdir "c_${greek_nfc}" &&
	ls | test-tool hexdump >dump &&
	grep "63 5f e1 bd a7" dump
'
102
# See if the NFD spelling appears on the disk.
#
# The directory listing is hex-dumped and searched for the bytes of
# "d_" followed by the fully decomposed encoding (cf 89 cc 94 cd 82).
#
test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '
	mkdir "d_${greek_nfd2}" &&
	ls | test-tool hexdump >dump &&
	grep "64 5f cf 89 cc 94 cd 82" dump
'
110
# Debugging aid: report the outcome of each prereq defined above.
#
# This was useful for understanding the various (OS, FS) quirks
# with respect to Unicode composition/decomposition, for example
# when comparing (macOS, APFS), (macOS, HFS) and (macOS, FAT32).
#
# The output is rather noisy, so nothing is printed unless
# $unicode_debug is exactly "true".
#
case "$unicode_debug" in
true)
	if test_have_prereq UNICODE_COMPOSITION_SENSITIVE
	then
		printf '%s\n' "NFC and NFD are distinct on this OS/filesystem."
	else
		printf '%s\n' "NFC and NFD are aliases on this OS/filesystem."
	fi

	if test_have_prereq UNICODE_NFC_PRESERVED
	then
		printf '%s\n' "NFC maintains original spelling."
	else
		printf '%s\n' "NFC is modified."
	fi

	if test_have_prereq UNICODE_NFD_PRESERVED
	then
		printf '%s\n' "NFD maintains original spelling."
	else
		printf '%s\n' "NFD is modified."
	fi

	if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE
	then
		printf '%s\n' "DOUBLE NFC and NFD are distinct on this OS/filesystem."
	else
		printf '%s\n' "DOUBLE NFC and NFD are aliases on this OS/filesystem."
	fi

	if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED
	then
		printf '%s\n' "Double NFC maintains original spelling."
	else
		printf '%s\n' "Double NFC is modified."
	fi

	if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED
	then
		printf '%s\n' "Double NFD maintains original spelling."
	else
		printf '%s\n' "Double NFD is modified."
	fi
	;;
esac