]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
08bcebf3 | 2 | |
11c3a366 TA |
3 | #include <errno.h> |
4 | #include <stddef.h> | |
08bcebf3 | 5 | |
11c3a366 | 6 | #include "macro.h" |
07630cea | 7 | #include "string-util.h" |
08bcebf3 LP |
8 | #include "xml.h" |
9 | ||
/* Parser states that xml_tokenize() keeps in its opaque state pointer between calls. */
enum {
        STATE_NULL      = 0, /* nothing parsed yet; xml_tokenize() resets the line counter and moves on to STATE_TEXT */
        STATE_TEXT      = 1, /* outside of any tag: text, comments, processing instructions, DTD, or a tag start */
        STATE_TAG       = 2, /* inside a tag, right after its name or after an attribute value */
        STATE_ATTRIBUTE = 3, /* right after an attribute name, where an optional "=value" may follow */
};
16 | ||
bcf3295d LP |
/* Adds the number of '\n' bytes found within the first n bytes of s to *line.
 * A NULL line pointer means the caller does not track line numbers; nothing is done then. */
static void inc_lines(unsigned *line, const char *s, size_t n) {
        if (!line)
                return;

        while (n > 0) {
                const char *newline = memchr(s, '\n', n);

                if (!newline)
                        break;

                (*line)++;

                /* Resume the scan right behind the newline that was just counted. */
                n -= (size_t) (newline - s) + 1;
                s = newline + 1;
        }
}
35 | ||
08bcebf3 LP |
36 | /* We don't actually do real XML here. We only read a simplistic |
37 | * subset, that is a bit less strict than XML and lacks all the more | |
38 | * complex features, like entities, or namespaces. However, we do | |
39 | * support some HTML5-like simplifications */ | |
40 | ||
bcf3295d | 41 | int xml_tokenize(const char **p, char **name, void **state, unsigned *line) { |
08bcebf3 LP |
42 | const char *c, *e, *b; |
43 | char *ret; | |
44 | int t; | |
45 | ||
46 | assert(p); | |
47 | assert(*p); | |
48 | assert(name); | |
49 | assert(state); | |
50 | ||
51 | t = PTR_TO_INT(*state); | |
52 | c = *p; | |
53 | ||
bcf3295d LP |
54 | if (t == STATE_NULL) { |
55 | if (line) | |
56 | *line = 1; | |
57 | t = STATE_TEXT; | |
58 | } | |
59 | ||
08bcebf3 LP |
60 | for (;;) { |
61 | if (*c == 0) | |
62 | return XML_END; | |
63 | ||
64 | switch (t) { | |
65 | ||
66 | case STATE_TEXT: { | |
67 | int x; | |
68 | ||
69 | e = strchrnul(c, '<'); | |
70 | if (e > c) { | |
71 | /* More text... */ | |
72 | ret = strndup(c, e - c); | |
73 | if (!ret) | |
74 | return -ENOMEM; | |
75 | ||
bcf3295d LP |
76 | inc_lines(line, c, e - c); |
77 | ||
08bcebf3 LP |
78 | *name = ret; |
79 | *p = e; | |
80 | *state = INT_TO_PTR(STATE_TEXT); | |
81 | ||
82 | return XML_TEXT; | |
83 | } | |
84 | ||
85 | assert(*e == '<'); | |
86 | b = c + 1; | |
87 | ||
88 | if (startswith(b, "!--")) { | |
89 | /* A comment */ | |
90 | e = strstr(b + 3, "-->"); | |
91 | if (!e) | |
92 | return -EINVAL; | |
93 | ||
bcf3295d LP |
94 | inc_lines(line, b, e + 3 - b); |
95 | ||
08bcebf3 LP |
96 | c = e + 3; |
97 | continue; | |
98 | } | |
99 | ||
100 | if (*b == '?') { | |
101 | /* Processing instruction */ | |
102 | ||
103 | e = strstr(b + 1, "?>"); | |
104 | if (!e) | |
105 | return -EINVAL; | |
106 | ||
bcf3295d LP |
107 | inc_lines(line, b, e + 2 - b); |
108 | ||
08bcebf3 LP |
109 | c = e + 2; |
110 | continue; | |
111 | } | |
112 | ||
113 | if (*b == '!') { | |
114 | /* DTD */ | |
115 | ||
116 | e = strchr(b + 1, '>'); | |
117 | if (!e) | |
118 | return -EINVAL; | |
119 | ||
bcf3295d LP |
120 | inc_lines(line, b, e + 1 - b); |
121 | ||
08bcebf3 LP |
122 | c = e + 1; |
123 | continue; | |
124 | } | |
125 | ||
126 | if (*b == '/') { | |
127 | /* A closing tag */ | |
128 | x = XML_TAG_CLOSE; | |
129 | b++; | |
130 | } else | |
131 | x = XML_TAG_OPEN; | |
132 | ||
133 | e = strpbrk(b, WHITESPACE "/>"); | |
134 | if (!e) | |
135 | return -EINVAL; | |
136 | ||
137 | ret = strndup(b, e - b); | |
138 | if (!ret) | |
139 | return -ENOMEM; | |
140 | ||
141 | *name = ret; | |
142 | *p = e; | |
143 | *state = INT_TO_PTR(STATE_TAG); | |
144 | ||
145 | return x; | |
146 | } | |
147 | ||
148 | case STATE_TAG: | |
149 | ||
150 | b = c + strspn(c, WHITESPACE); | |
151 | if (*b == 0) | |
152 | return -EINVAL; | |
153 | ||
bcf3295d LP |
154 | inc_lines(line, c, b - c); |
155 | ||
08bcebf3 LP |
156 | e = b + strcspn(b, WHITESPACE "=/>"); |
157 | if (e > b) { | |
158 | /* An attribute */ | |
159 | ||
160 | ret = strndup(b, e - b); | |
161 | if (!ret) | |
162 | return -ENOMEM; | |
163 | ||
164 | *name = ret; | |
165 | *p = e; | |
166 | *state = INT_TO_PTR(STATE_ATTRIBUTE); | |
167 | ||
168 | return XML_ATTRIBUTE_NAME; | |
169 | } | |
170 | ||
171 | if (startswith(b, "/>")) { | |
172 | /* An empty tag */ | |
173 | ||
174 | *name = NULL; /* For empty tags we return a NULL name, the caller must be prepared for that */ | |
175 | *p = b + 2; | |
176 | *state = INT_TO_PTR(STATE_TEXT); | |
177 | ||
178 | return XML_TAG_CLOSE_EMPTY; | |
179 | } | |
180 | ||
181 | if (*b != '>') | |
182 | return -EINVAL; | |
183 | ||
184 | c = b + 1; | |
185 | t = STATE_TEXT; | |
186 | continue; | |
187 | ||
188 | case STATE_ATTRIBUTE: | |
189 | ||
190 | if (*c == '=') { | |
191 | c++; | |
192 | ||
e768a4f0 | 193 | if (IN_SET(*c, '\'', '"')) { |
08bcebf3 LP |
194 | /* Tag with a quoted value */ |
195 | ||
196 | e = strchr(c+1, *c); | |
197 | if (!e) | |
198 | return -EINVAL; | |
199 | ||
bcf3295d LP |
200 | inc_lines(line, c, e - c); |
201 | ||
08bcebf3 LP |
202 | ret = strndup(c+1, e - c - 1); |
203 | if (!ret) | |
204 | return -ENOMEM; | |
205 | ||
206 | *name = ret; | |
207 | *p = e + 1; | |
208 | *state = INT_TO_PTR(STATE_TAG); | |
209 | ||
210 | return XML_ATTRIBUTE_VALUE; | |
211 | ||
212 | } | |
213 | ||
214 | /* Tag with a value without quotes */ | |
215 | ||
216 | b = strpbrk(c, WHITESPACE ">"); | |
217 | if (!b) | |
218 | b = c; | |
219 | ||
220 | ret = strndup(c, b - c); | |
221 | if (!ret) | |
222 | return -ENOMEM; | |
223 | ||
224 | *name = ret; | |
225 | *p = b; | |
226 | *state = INT_TO_PTR(STATE_TAG); | |
227 | return XML_ATTRIBUTE_VALUE; | |
228 | } | |
229 | ||
230 | t = STATE_TAG; | |
231 | continue; | |
232 | } | |
233 | ||
234 | } | |
235 | ||
04499a70 | 236 | assert_not_reached(); |
08bcebf3 | 237 | } |