]>
Commit | Line | Data |
---|---|---|
8868edaf CR |
1 | /* cut,lcut - extract specified fields from a line and assign them to an array |
2 | or print them to the standard output */ | |
3 | ||
4 | /* | |
5 | Copyright (C) 2020 Free Software Foundation, Inc. | |
6 | ||
7 | Bash is free software: you can redistribute it and/or modify | |
8 | it under the terms of the GNU General Public License as published by | |
9 | the Free Software Foundation, either version 3 of the License, or | |
10 | (at your option) any later version. | |
11 | ||
12 | Bash is distributed in the hope that it will be useful, | |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | GNU General Public License for more details. | |
16 | ||
17 | You should have received a copy of the GNU General Public License | |
18 | along with Bash. If not, see <http://www.gnu.org/licenses/>. | |
19 | */ | |
20 | ||
21 | /* See Makefile for compilation details. */ | |
22 | ||
23 | #include <config.h> | |
24 | ||
25 | #if defined (HAVE_UNISTD_H) | |
26 | # include <unistd.h> | |
27 | #endif | |
28 | #include "bashansi.h" | |
29 | #include <stdio.h> | |
30 | #include <fcntl.h> | |
31 | #include <errno.h> | |
32 | ||
33 | #include "loadables.h" | |
34 | #include "shmbutil.h" | |
35 | ||
#define CUT_ARRAY_DEFAULT	"CUTFIELDS"

/* Special values that may appear in a struct cutpos. */
#define NOPOS	(-2)		/* sentinel for unset startpos/endpos */
#define NORANGE	(-1)		/* just a position, no range */
#define BOL	0		/* range is open at the start of the line */
#define EOL	INT_MAX		/* range is open at the end of the line */

/* Option bits kept in the flags word; each is set by the option of the
   same letter (-b, -c, -d, -f, -s). */
#define BFLAG	0x01
#define CFLAG	0x02
#define DFLAG	0x04
#define FFLAG	0x08
#define SFLAG	0x10
49 | ||
/* One entry of a parsed position list: a single position or a range. */
struct cutpos
{
  int startpos;		/* zero-based, correction done in getlist() */
  int endpos;		/* zero-based end of the range, or NORANGE / EOL */
};
54 | ||
/* Everything the per-line cutting functions need to know about one
   invocation of the builtin. */
struct cutop
{
  int flags;			/* BFLAG/CFLAG/DFLAG/FFLAG/SFLAG bits */
  int delim;			/* field delimiter character */
  int npos;			/* number of entries in poslist */
  struct cutpos *poslist;	/* positions and ranges to select */
};
62 | ||
63 | static int | |
64 | poscmp (a, b) | |
65 | void *a, *b; | |
66 | { | |
67 | struct cutpos *p1, *p2; | |
68 | ||
69 | p1 = (struct cutpos *)a; | |
70 | p2 = (struct cutpos *)b; | |
71 | return (p1->startpos - p2->startpos); | |
72 | } | |
73 | ||
74 | static int | |
75 | getlist (arg, opp) | |
76 | char *arg; | |
77 | struct cutpos **opp; | |
78 | { | |
79 | char *ntok, *ltok, *larg; | |
80 | int s, e; | |
81 | intmax_t num; | |
82 | struct cutpos *poslist; | |
83 | int npos, nsize; | |
84 | ||
85 | poslist = 0; | |
86 | nsize = npos = 0; | |
87 | s = e = 0; | |
88 | larg = arg; | |
89 | while (ltok = strsep (&larg, ",")) | |
90 | { | |
91 | if (*ltok == 0) | |
92 | continue; | |
93 | ||
94 | ntok = strsep (<ok, "-"); | |
95 | if (*ntok == 0) | |
96 | s = BOL; | |
97 | else | |
98 | { | |
99 | if (legal_number (ntok, &num) == 0 || (int)num != num || num <= 0) | |
100 | { | |
101 | builtin_error ("%s: invalid list value", ntok); | |
102 | *opp = poslist; | |
103 | return -1; | |
104 | } | |
105 | s = num; | |
106 | s--; /* fields are 1-based */ | |
107 | } | |
108 | if (ltok == 0) | |
109 | e = NORANGE; | |
110 | else if (*ltok == 0) | |
111 | e = EOL; | |
112 | else | |
113 | { | |
114 | if (legal_number (ltok, &num) == 0 || (int)num != num || num <= 0) | |
115 | { | |
116 | builtin_error ("%s: invalid list value", ltok); | |
117 | *opp = poslist; | |
118 | return -1; | |
119 | } | |
120 | e = num; | |
121 | e--; | |
122 | if (e == s) | |
123 | e = NORANGE; | |
124 | } | |
125 | ||
126 | if (npos == nsize) | |
127 | { | |
128 | nsize += 4; | |
129 | poslist = (struct cutpos *)xrealloc (poslist, nsize * sizeof (struct cutpos)); | |
130 | } | |
131 | poslist[npos].startpos = s; | |
132 | poslist[npos].endpos = e; | |
133 | npos++; | |
134 | } | |
135 | if (npos == 0) | |
136 | { | |
137 | builtin_error ("missing list of positions"); | |
138 | *opp = poslist; | |
139 | return -1; | |
140 | } | |
141 | ||
142 | qsort (poslist, npos, sizeof(poslist[0]), poscmp); | |
143 | *opp = poslist; | |
144 | ||
145 | return npos; | |
146 | } | |
147 | ||
148 | static int | |
149 | cutbytes (v, line, ops) | |
150 | SHELL_VAR *v; | |
151 | char *line; | |
152 | struct cutop *ops; | |
153 | { | |
154 | arrayind_t ind; | |
155 | char *buf, *bmap; | |
156 | size_t llen; | |
157 | int i, b, n, s, e; | |
158 | ||
159 | llen = strlen (line); | |
160 | buf = xmalloc (llen + 1); | |
161 | bmap = xmalloc (llen + 1); | |
162 | memset (bmap, 0, llen); | |
163 | ||
164 | for (n = 0; n < ops->npos; n++) | |
165 | { | |
166 | s = ops->poslist[n].startpos; /* no translation needed yet */ | |
167 | e = ops->poslist[n].endpos; | |
168 | if (e == NORANGE) | |
169 | e = s; | |
170 | else if (e == EOL || e >= llen) | |
171 | e = llen - 1; | |
172 | /* even if a column is specified multiple times, it will only be printed | |
173 | once */ | |
174 | for (i = s; i <= e; i++) | |
175 | bmap[i] = 1; | |
176 | } | |
177 | ||
178 | b = 0; | |
179 | for (i = 0; i < llen; i++) | |
180 | if (bmap[i]) | |
181 | buf[b++] = line[i]; | |
182 | buf[b] = 0; | |
183 | ||
184 | if (v) | |
185 | { | |
186 | ind = 0; | |
187 | bind_array_element (v, ind, buf, 0); | |
188 | ind++; | |
189 | } | |
190 | else | |
191 | printf ("%s\n", buf); | |
192 | ||
193 | free (buf); | |
194 | free (bmap); | |
195 | ||
196 | return ind; | |
197 | } | |
198 | ||
199 | static int | |
200 | cutchars (v, line, ops) | |
201 | SHELL_VAR *v; | |
202 | char *line; | |
203 | struct cutop *ops; | |
204 | { | |
205 | arrayind_t ind; | |
206 | char *buf, *bmap; | |
207 | wchar_t *wbuf, *wb2; | |
208 | size_t llen, wlen; | |
209 | int i, b, n, s, e; | |
210 | ||
211 | if (MB_CUR_MAX == 1) | |
212 | return (cutbytes (v, line, ops)); | |
213 | if (locale_utf8locale && utf8_mbsmbchar (line) == 0) | |
214 | return (cutbytes (v, line, ops)); | |
215 | ||
216 | llen = strlen (line); | |
217 | wbuf = (wchar_t *)xmalloc ((llen + 1) * sizeof (wchar_t)); | |
218 | ||
219 | wlen = mbstowcs (wbuf, line, llen); | |
220 | if (MB_INVALIDCH (wlen)) | |
221 | { | |
222 | free (wbuf); | |
223 | return (cutbytes (v, line, ops)); | |
224 | } | |
225 | ||
226 | bmap = xmalloc (llen + 1); | |
227 | memset (bmap, 0, llen); | |
228 | ||
229 | for (n = 0; n < ops->npos; n++) | |
230 | { | |
231 | s = ops->poslist[n].startpos; /* no translation needed yet */ | |
232 | e = ops->poslist[n].endpos; | |
233 | if (e == NORANGE) | |
234 | e = s; | |
235 | else if (e == EOL || e >= wlen) | |
236 | e = wlen - 1; | |
237 | /* even if a column is specified multiple times, it will only be printed | |
238 | once */ | |
239 | for (i = s; i <= e; i++) | |
240 | bmap[i] = 1; | |
241 | } | |
242 | ||
243 | wb2 = (wchar_t *)xmalloc ((wlen + 1) * sizeof (wchar_t)); | |
244 | b = 0; | |
245 | for (i = 0; i < wlen; i++) | |
246 | if (bmap[i]) | |
247 | wb2[b++] = wbuf[i]; | |
248 | wb2[b] = 0; | |
249 | ||
250 | free (wbuf); | |
251 | ||
252 | buf = bmap; | |
253 | n = wcstombs (buf, wb2, llen); | |
254 | ||
255 | if (v) | |
256 | { | |
257 | ind = 0; | |
258 | bind_array_element (v, ind, buf, 0); | |
259 | ind++; | |
260 | } | |
261 | else | |
262 | printf ("%s\n", buf); | |
263 | ||
264 | free (buf); | |
265 | free (wb2); | |
266 | ||
267 | return ind; | |
268 | } | |
269 | ||
270 | /* The basic strategy is to cut the line into fields using strsep, populate | |
271 | an array of fields from 0..nf, then select those fields using the same | |
272 | bitmap approach as cut{bytes,chars} and assign them to the array variable | |
273 | V or print them on stdout. This function obeys SFLAG. */ | |
274 | static int | |
275 | cutfields (v, line, ops) | |
276 | SHELL_VAR *v; | |
277 | char *line; | |
278 | struct cutop *ops; | |
279 | { | |
280 | arrayind_t ind; | |
281 | char *buf, *bmap, *field, **fields, delim[2]; | |
282 | size_t llen, fsize; | |
283 | int i, b, n, s, e, nf; | |
284 | ||
285 | ind = 0; | |
286 | ||
287 | delim[0] = ops->delim; | |
288 | delim[1] = '\0'; | |
289 | ||
290 | fields = 0; | |
291 | nf = 0; | |
292 | fsize = 0; | |
293 | ||
294 | field = buf = line; | |
295 | do | |
296 | { | |
297 | field = strsep (&buf, delim); /* destructive */ | |
298 | if (nf == fsize) | |
299 | { | |
300 | fsize += 8; | |
301 | fields = xrealloc (fields, fsize * sizeof (char *)); | |
302 | } | |
303 | fields[nf] = field; | |
304 | if (field) | |
305 | nf++; | |
306 | } | |
307 | while (field); | |
308 | ||
309 | if (nf == 1) | |
310 | { | |
311 | free (fields); | |
312 | if (ops->flags & SFLAG) | |
313 | return ind; | |
314 | if (v) | |
315 | { | |
316 | bind_array_element (v, ind, line, 0); | |
317 | ind++; | |
318 | } | |
319 | else | |
320 | printf ("%s\n", line); | |
321 | return ind; | |
322 | } | |
323 | ||
324 | bmap = xmalloc (nf + 1); | |
325 | memset (bmap, 0, nf); | |
326 | ||
327 | for (n = 0; n < ops->npos; n++) | |
328 | { | |
329 | s = ops->poslist[n].startpos; /* no translation needed yet */ | |
330 | e = ops->poslist[n].endpos; | |
331 | if (e == NORANGE) | |
332 | e = s; | |
333 | else if (e == EOL || e >= nf) | |
334 | e = nf - 1; | |
335 | /* even if a column is specified multiple times, it will only be printed | |
336 | once */ | |
337 | for (i = s; i <= e; i++) | |
338 | bmap[i] = 1; | |
339 | } | |
340 | ||
341 | for (i = 1, b = 0; b < nf; b++) | |
342 | { | |
343 | if (bmap[b] == 0) | |
344 | continue; | |
345 | if (v) | |
346 | { | |
347 | bind_array_element (v, ind, fields[b], 0); | |
348 | ind++; | |
349 | } | |
350 | else | |
351 | { | |
352 | if (i == 0) | |
353 | putchar (ops->delim); | |
354 | printf ("%s", fields[b]); | |
355 | } | |
356 | i = 0; | |
357 | } | |
358 | if (v == 0) | |
359 | putchar ('\n'); | |
360 | ||
361 | return nf; | |
362 | } | |
363 | ||
364 | static int | |
365 | cutline (v, line, ops) | |
366 | SHELL_VAR *v; | |
367 | char *line; | |
368 | struct cutop *ops; | |
369 | { | |
370 | int rval; | |
371 | ||
372 | if (ops->flags & BFLAG) | |
373 | rval = cutbytes (v, line, ops); | |
374 | else if (ops->flags & CFLAG) | |
375 | rval = cutchars (v, line, ops); | |
376 | else | |
377 | rval = cutfields (v, line, ops); | |
378 | ||
379 | return (rval >= 0 ? EXECUTION_SUCCESS : EXECUTION_FAILURE); | |
380 | } | |
381 | ||
382 | static int | |
383 | cutfile (v, list, ops) | |
384 | SHELL_VAR *v; | |
385 | WORD_LIST *list; | |
386 | struct cutop *ops; | |
387 | { | |
388 | int fd, unbuffered_read; | |
389 | char *line, *b; | |
390 | size_t llen; | |
391 | WORD_LIST *l; | |
392 | ssize_t n; | |
393 | ||
394 | line = 0; | |
395 | llen = 0; | |
396 | ||
397 | l = list; | |
398 | do | |
399 | { | |
400 | /* for each file */ | |
401 | if (l == 0 || (l->word->word[0] == '-' && l->word->word[1] == '\0')) | |
402 | fd = 0; | |
403 | else | |
404 | fd = open (l->word->word, O_RDONLY); | |
405 | if (fd < 0) | |
406 | { | |
407 | file_error (l->word->word); | |
408 | return (EXECUTION_FAILURE); | |
409 | } | |
410 | ||
411 | #ifndef __CYGWIN__ | |
412 | unbuffered_read = (lseek (fd, 0L, SEEK_CUR) < 0) && (errno == ESPIPE); | |
413 | #else | |
414 | unbuffered_read = 1; | |
415 | #endif | |
416 | ||
417 | while ((n = zgetline (fd, &line, &llen, '\n', unbuffered_read)) != -1) | |
74091dd4 CR |
418 | { |
419 | QUIT; | |
420 | if (line[n] == '\n') | |
421 | line[n] = '\0'; /* cutline expects no newline terminator */ | |
422 | cutline (v, line, ops); /* can modify line */ | |
423 | } | |
8868edaf CR |
424 | if (fd > 0) |
425 | close (fd); | |
426 | ||
74091dd4 | 427 | QUIT; |
8868edaf CR |
428 | if (l) |
429 | l = l->next; | |
430 | } | |
431 | while (l); | |
432 | ||
433 | free (line); | |
434 | return EXECUTION_SUCCESS; | |
435 | } | |
436 | ||
437 | #define OPTSET(x) ((cutflags & (x)) ? 1 : 0) | |
438 | ||
439 | static int | |
440 | cut_internal (which, list) | |
441 | int which; /* not used yet */ | |
442 | WORD_LIST *list; | |
443 | { | |
444 | int opt, rval, cutflags, delim, npos; | |
445 | char *array_name, *cutstring, *list_arg; | |
446 | SHELL_VAR *v; | |
447 | struct cutop op; | |
448 | struct cutpos *poslist; | |
449 | ||
450 | v = 0; | |
451 | rval = EXECUTION_SUCCESS; | |
452 | ||
453 | cutflags = 0; | |
454 | array_name = 0; | |
455 | list_arg = 0; | |
456 | delim = '\t'; | |
457 | ||
458 | reset_internal_getopt (); | |
459 | while ((opt = internal_getopt (list, "a:b:c:d:f:sn")) != -1) | |
460 | { | |
461 | switch (opt) | |
462 | { | |
463 | case 'a': | |
464 | array_name = list_optarg; | |
465 | break; | |
466 | case 'b': | |
467 | cutflags |= BFLAG; | |
468 | list_arg = list_optarg; | |
469 | break; | |
470 | case 'c': | |
471 | cutflags |= CFLAG; | |
472 | list_arg = list_optarg; | |
473 | break; | |
474 | case 'd': | |
475 | cutflags |= DFLAG; | |
476 | delim = list_optarg[0]; | |
477 | if (delim == 0 || list_optarg[1]) | |
478 | { | |
479 | builtin_error ("delimiter must be a single non-null character"); | |
480 | return (EX_USAGE); | |
481 | } | |
482 | break; | |
483 | case 'f': | |
484 | cutflags |= FFLAG; | |
485 | list_arg = list_optarg; | |
486 | break; | |
487 | case 'n': | |
488 | break; | |
489 | case 's': | |
490 | cutflags |= SFLAG; | |
491 | break; | |
492 | CASE_HELPOPT; | |
493 | default: | |
494 | builtin_usage (); | |
495 | return (EX_USAGE); | |
496 | } | |
497 | } | |
498 | list = loptend; | |
499 | ||
500 | if (array_name && (legal_identifier (array_name) == 0)) | |
501 | { | |
502 | sh_invalidid (array_name); | |
503 | return (EXECUTION_FAILURE); | |
504 | } | |
505 | ||
506 | if (list == 0 && which == 0) | |
507 | { | |
508 | builtin_error ("string argument required"); | |
509 | return (EX_USAGE); | |
510 | } | |
511 | ||
512 | /* options are mutually exclusive and one is required */ | |
513 | if ((OPTSET (BFLAG) + OPTSET (CFLAG) + OPTSET (FFLAG)) != 1) | |
514 | { | |
515 | builtin_usage (); | |
516 | return (EX_USAGE); | |
517 | } | |
518 | ||
519 | if ((npos = getlist (list_arg, &poslist)) < 0) | |
520 | { | |
521 | free (poslist); | |
522 | return (EXECUTION_FAILURE); | |
523 | } | |
524 | ||
525 | if (array_name) | |
526 | { | |
527 | v = find_or_make_array_variable (array_name, 1); | |
528 | if (v == 0 || readonly_p (v) || noassign_p (v)) | |
529 | { | |
530 | if (v && readonly_p (v)) | |
531 | err_readonly (array_name); | |
532 | return (EXECUTION_FAILURE); | |
533 | } | |
534 | else if (array_p (v) == 0) | |
535 | { | |
536 | builtin_error ("%s: not an indexed array", array_name); | |
537 | return (EXECUTION_FAILURE); | |
538 | } | |
539 | if (invisible_p (v)) | |
540 | VUNSETATTR (v, att_invisible); | |
541 | array_flush (array_cell (v)); | |
542 | } | |
543 | ||
544 | op.flags = cutflags; | |
545 | op.delim = delim; | |
546 | op.npos = npos; | |
547 | op.poslist = poslist; | |
548 | ||
549 | /* we implement cut as a builtin with a cutfile() function that opens each | |
550 | filename in LIST as a filename (or `-' for stdin) and runs cutline on | |
551 | every line in the file. */ | |
552 | if (which == 0) | |
553 | { | |
554 | cutstring = list->word->word; | |
555 | if (cutstring == 0 || *cutstring == 0) | |
556 | { | |
557 | free (poslist); | |
558 | return (EXECUTION_SUCCESS); | |
559 | } | |
560 | rval = cutline (v, cutstring, &op); | |
561 | } | |
562 | else | |
563 | rval = cutfile (v, list, &op); | |
564 | ||
565 | return (rval); | |
566 | } | |
567 | ||
568 | int | |
569 | lcut_builtin (list) | |
570 | WORD_LIST *list; | |
571 | { | |
572 | return (cut_internal (0, list)); | |
573 | } | |
574 | ||
575 | int | |
576 | cut_builtin (list) | |
577 | WORD_LIST *list; | |
578 | { | |
579 | return (cut_internal (1, list)); | |
580 | } | |
581 | ||
/* Long documentation for `lcut', one string per line of text.  The list
   is terminated by a null pointer. */
char *lcut_doc[] =
{
  "Extract selected fields from a string.",
  "",
  "Select portions of LINE (as specified by LIST) and assign them to",
  "elements of the indexed array ARRAY starting at index 0, or write",
  "them to the standard output if -a is not specified.",
  "",
  "Items specified by LIST are either column positions or fields delimited",
  "by a special character, and are described more completely in cut(1).",
  "",
  "Columns correspond to bytes (-b), characters (-c), or fields (-f). The",
  "field delimiter is specified by -d (default TAB). Column numbering",
  "starts at 1.",
  (char *)0
};
597 | ||
598 | struct builtin lcut_struct = { | |
599 | "lcut", /* builtin name */ | |
600 | lcut_builtin, /* function implementing the builtin */ | |
601 | BUILTIN_ENABLED, /* initial flags for builtin */ | |
602 | lcut_doc, /* array of long documentation strings. */ | |
603 | "lcut [-a ARRAY] [-b LIST] [-c LIST] [-f LIST] [-d CHAR] [-sn] line", /* usage synopsis; becomes short_doc */ | |
604 | 0 /* reserved for internal use */ | |
605 | }; | |
606 | ||
/* Long documentation for `cut', one string per line of text.  The list is
   terminated by a null pointer. */
char *cut_doc[] =
{
  "Extract selected fields from each line of a file.",
  "",
  "Select portions of each line (as specified by LIST) from each FILE",
  "and write them to the standard output. cut reads from the standard",
  "input if no FILE arguments are specified or if a FILE argument is a",
  "single hyphen.",
  "",
  "Items specified by LIST are either column positions or fields delimited",
  "by a special character, and are described more completely in cut(1).",
  "",
  "Columns correspond to bytes (-b), characters (-c), or fields (-f). The",
  "field delimiter is specified by -d (default TAB). Column numbering",
  "starts at 1.",
  (char *)0
};
623 | ||
624 | struct builtin cut_struct = { | |
625 | "cut", /* builtin name */ | |
626 | cut_builtin, /* function implementing the builtin */ | |
627 | BUILTIN_ENABLED, /* initial flags for builtin */ | |
628 | cut_doc, /* array of long documentation strings. */ | |
629 | "cut [-a ARRAY] [-b LIST] [-c LIST] [-f LIST] [-d CHAR] [-sn] [file ...]", /* usage synopsis; becomes short_doc */ | |
630 | 0 /* reserved for internal use */ | |
631 | }; |