]>
Commit | Line | Data |
---|---|---|
87d2ae2a CR |
/* cut,lcut - extract specified fields from a line and assign them to an array
   or print them to the standard output */
96a2ec12 CR |
3 | |
4 | /* | |
b3038907 | 5 | Copyright (C) 2020,2022,2023 Free Software Foundation, Inc. |
96a2ec12 CR |
6 | |
7 | Bash is free software: you can redistribute it and/or modify | |
8 | it under the terms of the GNU General Public License as published by | |
9 | the Free Software Foundation, either version 3 of the License, or | |
10 | (at your option) any later version. | |
11 | ||
12 | Bash is distributed in the hope that it will be useful, | |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | GNU General Public License for more details. | |
16 | ||
17 | You should have received a copy of the GNU General Public License | |
18 | along with Bash. If not, see <http://www.gnu.org/licenses/>. | |
19 | */ | |
20 | ||
21 | /* See Makefile for compilation details. */ | |
22 | ||
23 | #include <config.h> | |
24 | ||
25 | #if defined (HAVE_UNISTD_H) | |
26 | # include <unistd.h> | |
27 | #endif | |
28 | #include "bashansi.h" | |
29 | #include <stdio.h> | |
30 | #include <fcntl.h> | |
31 | #include <errno.h> | |
32 | ||
33 | #include "loadables.h" | |
34 | #include "shmbutil.h" | |
35 | ||
36 | #define CUT_ARRAY_DEFAULT "CUTFIELDS" | |
37 | ||
38 | #define NOPOS -2 /* sentinel for unset startpos/endpos */ | |
39 | ||
40 | #define BOL 0 | |
41 | #define EOL INT_MAX | |
42 | #define NORANGE -1 /* just a position, no range */ | |
43 | ||
44 | #define BFLAG (1 << 0) | |
45 | #define CFLAG (1 << 1) | |
46 | #define DFLAG (1 << 2) | |
47 | #define FFLAG (1 << 3) | |
48 | #define SFLAG (1 << 4) | |
49 | ||
/* One element of a parsed LIST: either a single position or a range. */
struct cutpos
{
  int startpos;         /* zero-based; the 1-based input is corrected in getlist() */
  int endpos;           /* zero-based end of the range, or NORANGE / EOL */
};
54 | ||
/* Everything the per-line cutting functions need: the option bits, the
   field delimiter, and the sorted list of positions to select. */
struct cutop
{
  int flags;                    /* bitwise OR of BFLAG, CFLAG, DFLAG, FFLAG, SFLAG */
  int delim;                    /* field delimiter character */
  int npos;                     /* number of entries in poslist */
  struct cutpos *poslist;       /* positions/ranges produced by getlist() */
};
62 | ||
63 | static int | |
81e3a4fb | 64 | poscmp (const void *a, const void *b) |
96a2ec12 CR |
65 | { |
66 | struct cutpos *p1, *p2; | |
67 | ||
68 | p1 = (struct cutpos *)a; | |
69 | p2 = (struct cutpos *)b; | |
70 | return (p1->startpos - p2->startpos); | |
71 | } | |
72 | ||
73 | static int | |
81e3a4fb | 74 | getlist (char *arg, struct cutpos **opp) |
96a2ec12 CR |
75 | { |
76 | char *ntok, *ltok, *larg; | |
77 | int s, e; | |
78 | intmax_t num; | |
79 | struct cutpos *poslist; | |
80 | int npos, nsize; | |
81 | ||
82 | poslist = 0; | |
83 | nsize = npos = 0; | |
84 | s = e = 0; | |
85 | larg = arg; | |
86 | while (ltok = strsep (&larg, ",")) | |
87 | { | |
88 | if (*ltok == 0) | |
89 | continue; | |
90 | ||
91 | ntok = strsep (<ok, "-"); | |
92 | if (*ntok == 0) | |
93 | s = BOL; | |
94 | else | |
95 | { | |
b3038907 | 96 | if (valid_number (ntok, &num) == 0 || (int)num != num || num <= 0) |
96a2ec12 CR |
97 | { |
98 | builtin_error ("%s: invalid list value", ntok); | |
99 | *opp = poslist; | |
100 | return -1; | |
101 | } | |
102 | s = num; | |
103 | s--; /* fields are 1-based */ | |
104 | } | |
105 | if (ltok == 0) | |
106 | e = NORANGE; | |
107 | else if (*ltok == 0) | |
108 | e = EOL; | |
109 | else | |
110 | { | |
b3038907 | 111 | if (valid_number (ltok, &num) == 0 || (int)num != num || num <= 0) |
96a2ec12 CR |
112 | { |
113 | builtin_error ("%s: invalid list value", ltok); | |
114 | *opp = poslist; | |
115 | return -1; | |
116 | } | |
117 | e = num; | |
118 | e--; | |
119 | if (e == s) | |
120 | e = NORANGE; | |
121 | } | |
122 | ||
123 | if (npos == nsize) | |
124 | { | |
125 | nsize += 4; | |
126 | poslist = (struct cutpos *)xrealloc (poslist, nsize * sizeof (struct cutpos)); | |
127 | } | |
128 | poslist[npos].startpos = s; | |
129 | poslist[npos].endpos = e; | |
130 | npos++; | |
131 | } | |
132 | if (npos == 0) | |
133 | { | |
134 | builtin_error ("missing list of positions"); | |
135 | *opp = poslist; | |
136 | return -1; | |
137 | } | |
138 | ||
139 | qsort (poslist, npos, sizeof(poslist[0]), poscmp); | |
140 | *opp = poslist; | |
141 | ||
142 | return npos; | |
143 | } | |
144 | ||
145 | static int | |
81e3a4fb | 146 | cutbytes (SHELL_VAR *v, char *line, struct cutop *ops) |
96a2ec12 CR |
147 | { |
148 | arrayind_t ind; | |
149 | char *buf, *bmap; | |
150 | size_t llen; | |
151 | int i, b, n, s, e; | |
152 | ||
153 | llen = strlen (line); | |
154 | buf = xmalloc (llen + 1); | |
155 | bmap = xmalloc (llen + 1); | |
156 | memset (bmap, 0, llen); | |
157 | ||
158 | for (n = 0; n < ops->npos; n++) | |
159 | { | |
160 | s = ops->poslist[n].startpos; /* no translation needed yet */ | |
161 | e = ops->poslist[n].endpos; | |
162 | if (e == NORANGE) | |
163 | e = s; | |
164 | else if (e == EOL || e >= llen) | |
165 | e = llen - 1; | |
166 | /* even if a column is specified multiple times, it will only be printed | |
167 | once */ | |
168 | for (i = s; i <= e; i++) | |
169 | bmap[i] = 1; | |
170 | } | |
171 | ||
172 | b = 0; | |
173 | for (i = 0; i < llen; i++) | |
174 | if (bmap[i]) | |
175 | buf[b++] = line[i]; | |
176 | buf[b] = 0; | |
177 | ||
178 | if (v) | |
179 | { | |
180 | ind = 0; | |
181 | bind_array_element (v, ind, buf, 0); | |
182 | ind++; | |
183 | } | |
184 | else | |
185 | printf ("%s\n", buf); | |
186 | ||
187 | free (buf); | |
188 | free (bmap); | |
189 | ||
190 | return ind; | |
191 | } | |
192 | ||
193 | static int | |
81e3a4fb | 194 | cutchars (SHELL_VAR *v, char *line, struct cutop *ops) |
96a2ec12 CR |
195 | { |
196 | arrayind_t ind; | |
197 | char *buf, *bmap; | |
198 | wchar_t *wbuf, *wb2; | |
199 | size_t llen, wlen; | |
200 | int i, b, n, s, e; | |
201 | ||
202 | if (MB_CUR_MAX == 1) | |
203 | return (cutbytes (v, line, ops)); | |
204 | if (locale_utf8locale && utf8_mbsmbchar (line) == 0) | |
205 | return (cutbytes (v, line, ops)); | |
206 | ||
207 | llen = strlen (line); | |
208 | wbuf = (wchar_t *)xmalloc ((llen + 1) * sizeof (wchar_t)); | |
209 | ||
210 | wlen = mbstowcs (wbuf, line, llen); | |
211 | if (MB_INVALIDCH (wlen)) | |
212 | { | |
213 | free (wbuf); | |
214 | return (cutbytes (v, line, ops)); | |
215 | } | |
216 | ||
217 | bmap = xmalloc (llen + 1); | |
218 | memset (bmap, 0, llen); | |
219 | ||
220 | for (n = 0; n < ops->npos; n++) | |
221 | { | |
222 | s = ops->poslist[n].startpos; /* no translation needed yet */ | |
223 | e = ops->poslist[n].endpos; | |
224 | if (e == NORANGE) | |
225 | e = s; | |
226 | else if (e == EOL || e >= wlen) | |
227 | e = wlen - 1; | |
228 | /* even if a column is specified multiple times, it will only be printed | |
229 | once */ | |
230 | for (i = s; i <= e; i++) | |
231 | bmap[i] = 1; | |
232 | } | |
233 | ||
234 | wb2 = (wchar_t *)xmalloc ((wlen + 1) * sizeof (wchar_t)); | |
235 | b = 0; | |
236 | for (i = 0; i < wlen; i++) | |
237 | if (bmap[i]) | |
238 | wb2[b++] = wbuf[i]; | |
239 | wb2[b] = 0; | |
240 | ||
241 | free (wbuf); | |
242 | ||
243 | buf = bmap; | |
244 | n = wcstombs (buf, wb2, llen); | |
245 | ||
246 | if (v) | |
247 | { | |
248 | ind = 0; | |
249 | bind_array_element (v, ind, buf, 0); | |
250 | ind++; | |
251 | } | |
252 | else | |
253 | printf ("%s\n", buf); | |
254 | ||
255 | free (buf); | |
256 | free (wb2); | |
257 | ||
258 | return ind; | |
259 | } | |
260 | ||
261 | /* The basic strategy is to cut the line into fields using strsep, populate | |
262 | an array of fields from 0..nf, then select those fields using the same | |
263 | bitmap approach as cut{bytes,chars} and assign them to the array variable | |
264 | V or print them on stdout. This function obeys SFLAG. */ | |
265 | static int | |
81e3a4fb | 266 | cutfields (SHELL_VAR *v, char *line, struct cutop *ops) |
96a2ec12 CR |
267 | { |
268 | arrayind_t ind; | |
269 | char *buf, *bmap, *field, **fields, delim[2]; | |
270 | size_t llen, fsize; | |
271 | int i, b, n, s, e, nf; | |
272 | ||
273 | ind = 0; | |
274 | ||
275 | delim[0] = ops->delim; | |
276 | delim[1] = '\0'; | |
277 | ||
278 | fields = 0; | |
279 | nf = 0; | |
280 | fsize = 0; | |
281 | ||
282 | field = buf = line; | |
283 | do | |
284 | { | |
285 | field = strsep (&buf, delim); /* destructive */ | |
286 | if (nf == fsize) | |
287 | { | |
288 | fsize += 8; | |
289 | fields = xrealloc (fields, fsize * sizeof (char *)); | |
290 | } | |
291 | fields[nf] = field; | |
292 | if (field) | |
293 | nf++; | |
294 | } | |
295 | while (field); | |
296 | ||
297 | if (nf == 1) | |
298 | { | |
299 | free (fields); | |
300 | if (ops->flags & SFLAG) | |
301 | return ind; | |
302 | if (v) | |
303 | { | |
304 | bind_array_element (v, ind, line, 0); | |
305 | ind++; | |
306 | } | |
307 | else | |
308 | printf ("%s\n", line); | |
309 | return ind; | |
310 | } | |
311 | ||
312 | bmap = xmalloc (nf + 1); | |
313 | memset (bmap, 0, nf); | |
314 | ||
315 | for (n = 0; n < ops->npos; n++) | |
316 | { | |
317 | s = ops->poslist[n].startpos; /* no translation needed yet */ | |
318 | e = ops->poslist[n].endpos; | |
319 | if (e == NORANGE) | |
320 | e = s; | |
321 | else if (e == EOL || e >= nf) | |
322 | e = nf - 1; | |
323 | /* even if a column is specified multiple times, it will only be printed | |
324 | once */ | |
325 | for (i = s; i <= e; i++) | |
326 | bmap[i] = 1; | |
327 | } | |
328 | ||
329 | for (i = 1, b = 0; b < nf; b++) | |
330 | { | |
331 | if (bmap[b] == 0) | |
332 | continue; | |
333 | if (v) | |
334 | { | |
335 | bind_array_element (v, ind, fields[b], 0); | |
336 | ind++; | |
337 | } | |
338 | else | |
339 | { | |
340 | if (i == 0) | |
341 | putchar (ops->delim); | |
342 | printf ("%s", fields[b]); | |
343 | } | |
344 | i = 0; | |
345 | } | |
346 | if (v == 0) | |
347 | putchar ('\n'); | |
348 | ||
349 | return nf; | |
350 | } | |
351 | ||
352 | static int | |
81e3a4fb | 353 | cutline (SHELL_VAR *v, char *line, struct cutop *ops) |
96a2ec12 CR |
354 | { |
355 | int rval; | |
356 | ||
357 | if (ops->flags & BFLAG) | |
358 | rval = cutbytes (v, line, ops); | |
359 | else if (ops->flags & CFLAG) | |
360 | rval = cutchars (v, line, ops); | |
361 | else | |
362 | rval = cutfields (v, line, ops); | |
363 | ||
364 | return (rval >= 0 ? EXECUTION_SUCCESS : EXECUTION_FAILURE); | |
365 | } | |
366 | ||
367 | static int | |
81e3a4fb | 368 | cutfile (SHELL_VAR *v, WORD_LIST *list, struct cutop *ops) |
96a2ec12 CR |
369 | { |
370 | int fd, unbuffered_read; | |
371 | char *line, *b; | |
372 | size_t llen; | |
373 | WORD_LIST *l; | |
374 | ssize_t n; | |
375 | ||
376 | line = 0; | |
377 | llen = 0; | |
378 | ||
379 | l = list; | |
380 | do | |
381 | { | |
382 | /* for each file */ | |
383 | if (l == 0 || (l->word->word[0] == '-' && l->word->word[1] == '\0')) | |
384 | fd = 0; | |
385 | else | |
386 | fd = open (l->word->word, O_RDONLY); | |
387 | if (fd < 0) | |
388 | { | |
389 | file_error (l->word->word); | |
390 | return (EXECUTION_FAILURE); | |
391 | } | |
392 | ||
393 | #ifndef __CYGWIN__ | |
394 | unbuffered_read = (lseek (fd, 0L, SEEK_CUR) < 0) && (errno == ESPIPE); | |
395 | #else | |
396 | unbuffered_read = 1; | |
397 | #endif | |
398 | ||
399 | while ((n = zgetline (fd, &line, &llen, '\n', unbuffered_read)) != -1) | |
a30f513f CR |
400 | { |
401 | QUIT; | |
b9ed20ac CR |
402 | if (line[n] == '\n') |
403 | line[n] = '\0'; /* cutline expects no newline terminator */ | |
404 | cutline (v, line, ops); /* can modify line */ | |
a30f513f | 405 | } |
96a2ec12 CR |
406 | if (fd > 0) |
407 | close (fd); | |
408 | ||
a30f513f | 409 | QUIT; |
96a2ec12 CR |
410 | if (l) |
411 | l = l->next; | |
412 | } | |
413 | while (l); | |
414 | ||
415 | free (line); | |
416 | return EXECUTION_SUCCESS; | |
417 | } | |
418 | ||
419 | #define OPTSET(x) ((cutflags & (x)) ? 1 : 0) | |
420 | ||
421 | static int | |
81e3a4fb | 422 | cut_internal (int which, WORD_LIST *list) |
96a2ec12 CR |
423 | { |
424 | int opt, rval, cutflags, delim, npos; | |
425 | char *array_name, *cutstring, *list_arg; | |
426 | SHELL_VAR *v; | |
427 | struct cutop op; | |
428 | struct cutpos *poslist; | |
429 | ||
430 | v = 0; | |
431 | rval = EXECUTION_SUCCESS; | |
432 | ||
433 | cutflags = 0; | |
434 | array_name = 0; | |
435 | list_arg = 0; | |
436 | delim = '\t'; | |
437 | ||
438 | reset_internal_getopt (); | |
439 | while ((opt = internal_getopt (list, "a:b:c:d:f:sn")) != -1) | |
440 | { | |
441 | switch (opt) | |
442 | { | |
443 | case 'a': | |
444 | array_name = list_optarg; | |
445 | break; | |
446 | case 'b': | |
447 | cutflags |= BFLAG; | |
448 | list_arg = list_optarg; | |
449 | break; | |
450 | case 'c': | |
451 | cutflags |= CFLAG; | |
452 | list_arg = list_optarg; | |
453 | break; | |
454 | case 'd': | |
455 | cutflags |= DFLAG; | |
456 | delim = list_optarg[0]; | |
457 | if (delim == 0 || list_optarg[1]) | |
458 | { | |
459 | builtin_error ("delimiter must be a single non-null character"); | |
460 | return (EX_USAGE); | |
461 | } | |
462 | break; | |
463 | case 'f': | |
464 | cutflags |= FFLAG; | |
465 | list_arg = list_optarg; | |
466 | break; | |
467 | case 'n': | |
468 | break; | |
469 | case 's': | |
470 | cutflags |= SFLAG; | |
471 | break; | |
472 | CASE_HELPOPT; | |
473 | default: | |
474 | builtin_usage (); | |
475 | return (EX_USAGE); | |
476 | } | |
477 | } | |
478 | list = loptend; | |
479 | ||
480 | if (array_name && (legal_identifier (array_name) == 0)) | |
481 | { | |
482 | sh_invalidid (array_name); | |
483 | return (EXECUTION_FAILURE); | |
484 | } | |
485 | ||
486 | if (list == 0 && which == 0) | |
487 | { | |
488 | builtin_error ("string argument required"); | |
489 | return (EX_USAGE); | |
490 | } | |
491 | ||
492 | /* options are mutually exclusive and one is required */ | |
493 | if ((OPTSET (BFLAG) + OPTSET (CFLAG) + OPTSET (FFLAG)) != 1) | |
494 | { | |
495 | builtin_usage (); | |
496 | return (EX_USAGE); | |
497 | } | |
498 | ||
499 | if ((npos = getlist (list_arg, &poslist)) < 0) | |
500 | { | |
501 | free (poslist); | |
502 | return (EXECUTION_FAILURE); | |
503 | } | |
504 | ||
505 | if (array_name) | |
f4683835 CR |
506 | { |
507 | v = builtin_find_indexed_array (array_name, 1); | |
508 | if (v == 0) | |
96a2ec12 | 509 | { |
f4683835 | 510 | free (poslist); |
96a2ec12 CR |
511 | return (EXECUTION_FAILURE); |
512 | } | |
96a2ec12 CR |
513 | } |
514 | ||
515 | op.flags = cutflags; | |
516 | op.delim = delim; | |
517 | op.npos = npos; | |
518 | op.poslist = poslist; | |
519 | ||
520 | /* we implement cut as a builtin with a cutfile() function that opens each | |
521 | filename in LIST as a filename (or `-' for stdin) and runs cutline on | |
522 | every line in the file. */ | |
523 | if (which == 0) | |
524 | { | |
525 | cutstring = list->word->word; | |
526 | if (cutstring == 0 || *cutstring == 0) | |
527 | { | |
528 | free (poslist); | |
529 | return (EXECUTION_SUCCESS); | |
530 | } | |
531 | rval = cutline (v, cutstring, &op); | |
532 | } | |
533 | else | |
534 | rval = cutfile (v, list, &op); | |
535 | ||
f4683835 | 536 | free (poslist); |
96a2ec12 CR |
537 | return (rval); |
538 | } | |
539 | ||
540 | int | |
81e3a4fb | 541 | lcut_builtin (WORD_LIST *list) |
96a2ec12 CR |
542 | { |
543 | return (cut_internal (0, list)); | |
544 | } | |
545 | ||
546 | int | |
81e3a4fb | 547 | cut_builtin (WORD_LIST *list) |
96a2ec12 CR |
548 | { |
549 | return (cut_internal (1, list)); | |
550 | } | |
551 | ||
96a2ec12 CR |
/* Long documentation for `lcut', one string per output line; the list is
   terminated by a null pointer. */
char *lcut_doc[] = {
  /* summary */
  "Extract selected fields from a string.",
  "",
  /* what is selected and where it goes */
  "Select portions of LINE (as specified by LIST) and assign them to",
  "elements of the indexed array ARRAY starting at index 0, or write",
  "them to the standard output if -a is not specified.",
  "",
  /* LIST syntax */
  "Items specified by LIST are either column positions or fields delimited",
  "by a special character, and are described more completely in cut(1).",
  "",
  /* selection modes */
  "Columns correspond to bytes (-b), characters (-c), or fields (-f). The",
  "field delimiter is specified by -d (default TAB). Column numbering",
  "starts at 1.",
  NULL
};
567 | ||
568 | struct builtin lcut_struct = { | |
569 | "lcut", /* builtin name */ | |
570 | lcut_builtin, /* function implementing the builtin */ | |
571 | BUILTIN_ENABLED, /* initial flags for builtin */ | |
572 | lcut_doc, /* array of long documentation strings. */ | |
573 | "lcut [-a ARRAY] [-b LIST] [-c LIST] [-f LIST] [-d CHAR] [-sn] line", /* usage synopsis; becomes short_doc */ | |
574 | 0 /* reserved for internal use */ | |
575 | }; | |
576 | ||
/* Long documentation for `cut', one string per output line; the list is
   terminated by a null pointer. */
char *cut_doc[] = {
  /* summary */
  "Extract selected fields from each line of a file.",
  "",
  /* what is selected and where the input comes from */
  "Select portions of each line (as specified by LIST) from each FILE",
  "and write them to the standard output. cut reads from the standard",
  "input if no FILE arguments are specified or if a FILE argument is a",
  "single hyphen.",
  "",
  /* LIST syntax */
  "Items specified by LIST are either column positions or fields delimited",
  "by a special character, and are described more completely in cut(1).",
  "",
  /* selection modes */
  "Columns correspond to bytes (-b), characters (-c), or fields (-f). The",
  "field delimiter is specified by -d (default TAB). Column numbering",
  "starts at 1.",
  NULL
};
593 | ||
594 | struct builtin cut_struct = { | |
595 | "cut", /* builtin name */ | |
596 | cut_builtin, /* function implementing the builtin */ | |
597 | BUILTIN_ENABLED, /* initial flags for builtin */ | |
598 | cut_doc, /* array of long documentation strings. */ | |
599 | "cut [-a ARRAY] [-b LIST] [-c LIST] [-f LIST] [-d CHAR] [-sn] [file ...]", /* usage synopsis; becomes short_doc */ | |
600 | 0 /* reserved for internal use */ | |
601 | }; |