]>
Commit | Line | Data |
---|---|---|
00991e10 JH |
1 | # Help detect how Unicode NFC and NFD are handled on the filesystem. |
2 | ||
3 | # A simple character that has a NFD form. | |
4 | # | |
5 | # NFC: U+00e9 LATIN SMALL LETTER E WITH ACUTE | |
6 | # UTF8(NFC): \xc3 \xa9 | |
7 | # | |
8 | # NFD: U+0065 LATIN SMALL LETTER E | |
9 | # U+0301 COMBINING ACUTE ACCENT | |
10 | # UTF8(NFD): \x65 + \xcc \x81 | |
11 | # Spelled with octal escapes: POSIX printf(1) does not define "\xHH" (dash lacks it). | |
12 | utf8_nfc=$(printf "\303\251") | |
13 | utf8_nfd=$(printf "\145\314\201") | |
14 | ||
15 | # Is the OS or the filesystem "Unicode composition sensitive"? | |
16 | # | |
17 | # That is, does the OS or the filesystem allow files to exist with | |
18 | # both the NFC and NFD spellings? Or, does the OS/FS lie to us and | |
19 | # tell us that the NFC and NFD forms are equivalent? | |
20 | # | |
21 | # This is or may be independent of what type of filesystem we have, | |
22 | # since it might be handled by the OS at a layer above the FS. | |
23 | # Testing on MacOS shows that APFS, HFS+, and FAT32 all report a | |
24 | # collision, for example. | |
25 | # | |
26 | # This does not tell us how the Unicode pathname will be spelled | |
27 | # on disk, but rather only that the two spellings "collide". We | |
28 | # will examine the actual on-disk spelling in a later prereq. | |
29 | # The probe succeeds only if both mkdir calls succeed (i.e. there is no collision). | |
30 | test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE ' | |
31 | mkdir trial_${utf8_nfc} && | |
32 | mkdir trial_${utf8_nfd} | |
33 | ' | |
34 | ||
35 | # Is the spelling of an NFC pathname preserved on disk? | |
36 | # | |
37 | # On MacOS with HFS+ and FAT32, NFC paths are converted into NFD | |
38 | # and on APFS, NFC paths are preserved. As we have established | |
39 | # above, this is independent of "composition sensitivity". | |
40 | # The probe greps a hex dump of the `ls` output for "c_" (63 5f) followed by the NFC bytes (c3 a9). | |
41 | test_lazy_prereq UNICODE_NFC_PRESERVED ' | |
42 | mkdir c_${utf8_nfc} && | |
43 | ls | test-tool hexdump >dump && | |
44 | grep "63 5f c3 a9" dump | |
45 | ' | |
46 | ||
47 | # Is the spelling of an NFD pathname preserved on disk? | |
48 | # The probe greps a hex dump of the `ls` output for "d_" (64 5f) followed by the NFD bytes (65 cc 81). | |
49 | test_lazy_prereq UNICODE_NFD_PRESERVED ' | |
50 | mkdir d_${utf8_nfd} && | |
51 | ls | test-tool hexdump >dump && | |
52 | grep "64 5f 65 cc 81" dump | |
53 | ' | |
54 | ||
55 | # The following _DOUBLE_ forms are more for my curiosity, | |
56 | # but there may be quirks lurking when there are multiple | |
57 | # combining characters in non-canonical order. | |
58 | ||
59 | # Unicode also allows multiple combining characters | |
60 | # that can be decomposed in pieces. | |
61 | # | |
62 | # NFC: U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI | |
63 | # UTF8(NFC): \xe1 \xbd \xa7 | |
64 | # | |
65 | # NFD1: U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA | |
66 | # U+0342 COMBINING GREEK PERISPOMENI | |
67 | # UTF8(NFD1): \xe1 \xbd \xa1 + \xcd \x82 | |
68 | # | |
69 | # But U+1f61 decomposes into | |
70 | # NFD2: U+03c9 GREEK SMALL LETTER OMEGA | |
71 | # U+0314 COMBINING REVERSED COMMA ABOVE | |
72 | # UTF8(NFD2): \xcf \x89 + \xcc \x94 | |
73 | # | |
74 | # Yielding: \xcf \x89 + \xcc \x94 + \xcd \x82 | |
75 | # | |
76 | # Note that I've used the canonical ordering of the | |
77 | # combining characters. It is also possible to | |
78 | # swap them. My testing shows that that non-standard | |
79 | # ordering also causes a collision in mkdir. However, | |
80 | # the resulting names don't draw correctly on the | |
81 | # terminal (implying that the on-disk format also has | |
82 | # them out of order). | |
83 | # Spelled with octal escapes: POSIX printf(1) does not define "\xHH" (dash lacks it). | |
84 | greek_nfc=$(printf "\341\275\247") | |
85 | greek_nfd1=$(printf "\341\275\241\315\202") | |
86 | greek_nfd2=$(printf "\317\211\314\224\315\202") | |
87 | ||
88 | # See if a double decomposition also collides. | |
89 | # Compares the NFC form against the fully decomposed form (greek_nfd2); succeeds only if both mkdir calls succeed. | |
90 | test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE ' | |
91 | mkdir trial_${greek_nfc} && | |
92 | mkdir trial_${greek_nfd2} | |
93 | ' | |
94 | ||
95 | # See if the NFC spelling appears on the disk. | |
96 | # The probe greps a hex dump of the `ls` output for "c_" (63 5f) followed by the NFC bytes (e1 bd a7). | |
97 | test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED ' | |
98 | mkdir c_${greek_nfc} && | |
99 | ls | test-tool hexdump >dump && | |
100 | grep "63 5f e1 bd a7" dump | |
101 | ' | |
102 | ||
103 | # See if the NFD spelling appears on the disk. | |
104 | # The probe greps a hex dump of the `ls` output for "d_" (64 5f) followed by the fully decomposed bytes (cf 89 cc 94 cd 82). | |
105 | test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED ' | |
106 | mkdir d_${greek_nfd2} && | |
107 | ls | test-tool hexdump >dump && | |
108 | grep "64 5f cf 89 cc 94 cd 82" dump | |
109 | ' | |
110 | ||
111 | # The following is for debugging. I found it useful when | |
112 | # trying to understand the various (OS, FS) quirks WRT | |
113 | # Unicode and how composition/decomposition is handled. | |
114 | # For example, when trying to understand how (macOS, APFS) | |
115 | # and (macOS, HFS) and (macOS, FAT32) compare. | |
116 | # | |
117 | # It is rather noisy, so it is disabled by default. | |
118 | # Set unicode_debug=true before this point to enable it; one line is echoed per prereq queried below. | |
119 | if test "$unicode_debug" = "true" | |
120 | then | |
121 | if test_have_prereq UNICODE_COMPOSITION_SENSITIVE | |
122 | then | |
123 | echo NFC and NFD are distinct on this OS/filesystem. | |
124 | else | |
125 | echo NFC and NFD are aliases on this OS/filesystem. | |
126 | fi | |
127 | ||
128 | if test_have_prereq UNICODE_NFC_PRESERVED | |
129 | then | |
130 | echo NFC maintains original spelling. | |
131 | else | |
132 | echo NFC is modified. | |
133 | fi | |
134 | ||
135 | if test_have_prereq UNICODE_NFD_PRESERVED | |
136 | then | |
137 | echo NFD maintains original spelling. | |
138 | else | |
139 | echo NFD is modified. | |
140 | fi | |
141 | ||
142 | if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE | |
143 | then | |
144 | echo DOUBLE NFC and NFD are distinct on this OS/filesystem. | |
145 | else | |
146 | echo DOUBLE NFC and NFD are aliases on this OS/filesystem. | |
147 | fi | |
148 | ||
149 | if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED | |
150 | then | |
151 | echo Double NFC maintains original spelling. | |
152 | else | |
153 | echo Double NFC is modified. | |
154 | fi | |
155 | ||
156 | if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED | |
157 | then | |
158 | echo Double NFD maintains original spelling. | |
159 | else | |
160 | echo Double NFD is modified. | |
161 | fi | |
162 | fi |