From: Karel Zak Date: Wed, 12 Oct 2022 07:46:56 +0000 (+0200) Subject: libsmartcols: use standard deviation to optimize columns width X-Git-Tag: v2.39-rc1~461 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=5470acc6a36aec1469bf77c75a8fb08a48c0b20a;p=thirdparty%2Futil-linux.git libsmartcols: use standard deviation to optimize columns width The standard deviation together with the mean (average) of the data width can be used to detect "problematic" columns and to calculate the optimal width. The idea is to sort columns by avg+deviation and start the column width reduction from the column with the widest and most variable width. The width reduction is also optimized by the 68–95–99 rule (aka empirical rule, avg+n*deviation; where n={1,2,3}) to cover 95% or 68% of the data in the column. The disadvantage is we need to link libsmartcols with -lm (math) due to the sqrt() function. Signed-off-by: Karel Zak --- diff --git a/libsmartcols/src/Makemodule.am b/libsmartcols/src/Makemodule.am index 2bb19fdbde..db3e68528f 100644 --- a/libsmartcols/src/Makemodule.am +++ b/libsmartcols/src/Makemodule.am @@ -23,7 +23,7 @@ libsmartcols_la_SOURCES= \ libsmartcols/src/walk.c \ libsmartcols/src/init.c -libsmartcols_la_LIBADD = $(LDADD) libcommon.la +libsmartcols_la_LIBADD = $(LDADD) $(MATH_LIBS) libcommon.la libsmartcols_la_CFLAGS = \ $(AM_CFLAGS) \ diff --git a/libsmartcols/src/calculate.c b/libsmartcols/src/calculate.c index 4f794088e2..b55d6c2ec9 100644 --- a/libsmartcols/src/calculate.c +++ b/libsmartcols/src/calculate.c @@ -1,24 +1,29 @@ #include "smartcolsP.h" #include "mbsalign.h" +#include <math.h> + static void dbg_column(struct libscols_table *tb, struct libscols_column *cl) { + struct libscols_wstat *st; + if (scols_column_is_hidden(cl)) { DBG(COL, ul_debugobj(cl, "%s (hidden) ignored", cl->header.data)); return; } + st = &cl->wstat; + DBG(COL, ul_debugobj(cl, "%15s seq=%zu, width=%zd, " - "hint=%d, avg=%zu, max=%zu, min=%zu, " - "extreme=%s %s", + "hint=%d, max=%zu, min=%zu, " + "0x04%x [%s]", 
cl->header.data, cl->seqnum, cl->width, - cl->width_hint > 1 ? (int) cl->width_hint : + cl->width_hint >= 1.0 ? (int) cl->width_hint : (int) (cl->width_hint * tb->termwidth), - cl->width_avg, - cl->width_max, - cl->width_min, - cl->is_extreme ? "yes" : "not", + st->width_max, + st->width_min, + cl->flags, cl->flags & SCOLS_FL_TRUNC ? "trunc" : "")); } @@ -40,6 +45,8 @@ static int count_cell_width(struct libscols_table *tb, size_t len; char *data; int rc; + struct libscols_cell *ce; + struct libscols_wstat *st; rc = __cell_to_buffer(tb, ln, cl, buf); if (rc) @@ -57,24 +64,21 @@ static int count_cell_width(struct libscols_table *tb, if (len == (size_t) -1) /* ignore broken multibyte strings */ len = 0; - cl->width_max = max(len, cl->width_max); - if (cl->is_extreme && cl->width_avg && len > cl->width_avg * 2) - return 0; + ce = scols_line_get_cell(ln, cl->seqnum); + ce->width = len; + + st = &cl->wstat; + st->width_max = max(len, st->width_max); - if (scols_column_is_noextremes(cl)) { - cl->extreme_sum += len; - cl->extreme_count++; - } - cl->width = max(len, cl->width); if (scols_column_is_tree(cl)) { size_t treewidth = ul_buffer_get_safe_pointer_width(buf, SCOLS_BUFPTR_TREEEND); cl->width_treeart = max(cl->width_treeart, treewidth); } + return 0; } - static int walk_count_cell_width(struct libscols_table *tb, struct libscols_line *ln, struct libscols_column *cl, @@ -83,46 +87,94 @@ static int walk_count_cell_width(struct libscols_table *tb, return count_cell_width(tb, ln, cl, (struct ul_buffer *) data); } +static void count_column_deviation(struct libscols_table *tb, struct libscols_column *cl) +{ + struct libscols_wstat *st; + struct libscols_iter itr; + struct libscols_line *ln; + struct libscols_cell *ce; + size_t sum = 0, n = 0, extra = 0; + + st = &cl->wstat; + + if (scols_column_is_tree(cl) && has_groups(tb)) + extra = tb->grpset_size + 1; + + /* count average */ + scols_reset_iter(&itr, SCOLS_ITER_FORWARD); + while (scols_table_next_line(tb, &itr, &ln) == 0) 
{ + ce = scols_line_get_cell(ln, cl->seqnum); + + n++; + sum += ce->width + extra; + } + st->width_avg = sum / n; + + /* count deviation */ + if (n > 1) { + double variance; + + scols_reset_iter(&itr, SCOLS_ITER_FORWARD); + while (scols_table_next_line(tb, &itr, &ln) == 0) { + double diff; + ce = scols_line_get_cell(ln, cl->seqnum); + + diff = (double) ce->width - st->width_avg; + st->width_sqr_sum += diff * diff; /* aka pow(x, 2) */ + } + + variance = st->width_sqr_sum / (n - 1); + st->width_deviation = sqrt(variance); + } + + DBG(COL, ul_debugobj(cl, "%15s avg=%g, deviation=%g", + cl->header.data, + st->width_avg, + st->width_deviation)); +} + /* * This function counts column width. - * - * For the SCOLS_FL_NOEXTREMES columns it is possible to call this function - * two times. The first pass counts the width and average width. If the column - * contains fields that are too large (a width greater than 2 * average) then - * the column is marked as "extreme". In the second pass all extreme fields - * are ignored and the column width is counted from non-extreme fields only. 
*/ static int count_column_width(struct libscols_table *tb, struct libscols_column *cl, struct ul_buffer *buf) { int rc = 0, no_header = 0; + const char *data; + struct libscols_wstat *st; + struct libscols_iter itr; + struct libscols_line *ln; assert(tb); assert(cl); + st = &cl->wstat; + cl->width = 0; - if (!cl->width_min) { - const char *data; + memset(st, 0, sizeof(struct libscols_wstat)); - if (cl->width_hint < 1 && scols_table_is_maxout(tb) && tb->is_term) { - cl->width_min = (size_t) (cl->width_hint * tb->termwidth); - if (cl->width_min && !is_last_column(cl)) - cl->width_min--; - } + /* set minimal width according to width_hint */ + if (cl->width_hint < 1 && scols_table_is_maxout(tb) && tb->is_term) { + st->width_min = (size_t) (cl->width_hint * tb->termwidth); + if (st->width_min && !is_last_column(cl)) + st->width_min--; + } - data = scols_cell_get_data(&cl->header); - if (data) { - size_t len = scols_table_is_noencoding(tb) ? - mbs_width(data) : mbs_safe_width(data); - cl->width_min = max(cl->width_min, len); - } else - no_header = 1; + /* set minimal width according to header width */ + data = scols_cell_get_data(&cl->header); + if (data) { + size_t len = scols_table_is_noencoding(tb) ? 
+ mbs_width(data) : mbs_safe_width(data); - if (!cl->width_min) - cl->width_min = 1; - } + st->width_min = max(st->width_min, len); + } else + no_header = 1; + + if (!st->width_min) + st->width_min = 1; + /* count width according to cells data */ if (scols_table_is_tree(tb)) { /* Count width for tree */ rc = scols_walk_tree(tb, cl, walk_count_cell_width, (void *) buf); @@ -130,9 +182,6 @@ static int count_column_width(struct libscols_table *tb, goto done; } else { /* Count width for list */ - struct libscols_iter itr; - struct libscols_line *ln; - scols_reset_iter(&itr, SCOLS_ITER_FORWARD); while (scols_table_next_line(tb, &itr, &ln) == 0) { rc = count_cell_width(tb, ln, cl, buf); @@ -147,50 +196,75 @@ static int count_column_width(struct libscols_table *tb, * calculate final width from grpset_size. */ size_t gprwidth = tb->grpset_size + 1; - cl->width_treeart += gprwidth; - cl->width_max += gprwidth; - cl->width += gprwidth; - if (cl->extreme_count) - cl->extreme_sum += gprwidth; + st->width_treeart += gprwidth; + st->width_max += gprwidth; } - if (cl->extreme_count && cl->width_avg == 0) { - cl->width_avg = cl->extreme_sum / cl->extreme_count; - if (cl->width_avg && cl->width_max > cl->width_avg * 2) - cl->is_extreme = 1; - } + if (st->width_max < st->width_min) + st->width_max = st->width_min; + + /* this is default, may be later reduced */ + cl->width = st->width_max; /* enlarge to minimal width */ - if (cl->width < cl->width_min && !scols_column_is_strict_width(cl)) - cl->width = cl->width_min; + if (cl->width < st->width_min && !scols_column_is_strict_width(cl)) + cl->width = st->width_min; /* use absolute size for large columns */ else if (cl->width_hint >= 1 && cl->width < (size_t) cl->width_hint - && cl->width_min < (size_t) cl->width_hint) + && st->width_min < (size_t) cl->width_hint) cl->width = (size_t) cl->width_hint; /* Column without header and data, set minimal size to zero (default is 1) */ - if (cl->width_max == 0 && no_header && cl->width_min == 
1 && cl->width <= 1) - cl->width = cl->width_min = 0; + if (st->width_max == 0 && no_header && st->width_min == 1 && cl->width <= 1) + cl->width = st->width_min = 0; done: ON_DBG(COL, dbg_column(tb, cl)); return rc; } +static int cmp_deviation(struct list_head *a, struct list_head *b, + void *data __attribute__((__unused__))) +{ + struct libscols_column *ca = list_entry(a, struct libscols_column, cl_columns); + struct libscols_column *cb = list_entry(b, struct libscols_column, cl_columns); + + double xa = ca->wstat.width_avg + ca->wstat.width_deviation; + double xb = cb->wstat.width_avg + cb->wstat.width_deviation; + + return cmp_numbers(xa, xb); +} + +static int cmp_seqnum(struct list_head *a, struct list_head *b, + void *data __attribute__((__unused__))) +{ + struct libscols_column *ca = list_entry(a, struct libscols_column, cl_columns); + struct libscols_column *cb = list_entry(b, struct libscols_column, cl_columns); + + return cmp_numbers(ca->seqnum, cb->seqnum); +} + +static inline void sort_columns(struct libscols_table *tb, + int (*cmp)(struct list_head *, struct list_head *, void *)) +{ + list_sort(&tb->tb_columns, cmp, NULL); +} + /* * This is core of the scols_* voodoo... */ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) { - struct libscols_column *cl; + struct libscols_column *cl, *last_cl; struct libscols_iter itr; size_t width = 0, width_min = 0; /* output width */ int stage, rc = 0; - int extremes = 0, group_ncolumns = 0; + int ignore_extremes = 0, group_ncolumns = 0; size_t colsepsz; + int sorted = 0; DBG(TAB, ul_debugobj(tb, "-----calculate-(termwidth=%zu)-----", tb->termwidth)); @@ -225,9 +299,7 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) is_last = is_last_column(cl); width += cl->width + (is_last ? 0 : colsepsz); /* separator for non-last column */ - width_min += cl->width_min + (is_last ? 0 : colsepsz); - if (cl->is_extreme) - extremes++; + width_min += cl->wstat.width_min + (is_last ? 
0 : colsepsz); } if (!tb->is_term) { @@ -245,56 +317,83 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) if (scols_column_is_hidden(cl)) continue; width_min--; - cl->width_min--; + cl->wstat.width_min--; } DBG(TAB, ul_debugobj(tb, " min width reduced to %zu", width_min)); } - /* reduce columns with extreme fields */ - if (width > tb->termwidth && extremes) { - DBG(TAB, ul_debugobj(tb, " reduce width (extreme columns)")); + /* calculate statistics */ + scols_reset_iter(&itr, SCOLS_ITER_FORWARD); + while (scols_table_next_column(tb, &itr, &cl) == 0) { - scols_reset_iter(&itr, SCOLS_ITER_FORWARD); - while (scols_table_next_column(tb, &itr, &cl) == 0) { - size_t org_width; + count_column_deviation(tb, cl); - if (!cl->is_extreme || scols_column_is_hidden(cl)) - continue; + if (scols_column_is_noextremes(cl)) + ignore_extremes++; + } - org_width = cl->width; - rc = count_column_width(tb, cl, buf); - if (rc) - goto done; + /* remember last column before we sort columns */ + last_cl = list_entry(tb->tb_columns.prev, struct libscols_column, cl_columns); - if (org_width > cl->width) - width -= org_width - cl->width; - else - extremes--; /* hmm... nothing reduced */ + /* reduce columns with extreme cells */ + if (width > tb->termwidth && ignore_extremes) { + if (!sorted) { + sort_columns(tb, cmp_deviation); + sorted = 1; + } + + /* Let's follow 68%–95%–99% rule (aka empirical rule). It means + * "avg + (n * standard_deviation)" covers 68% of data for n=1, + * 95% for n=2 and 99% for n=3. We try n=2 and n=1. 
*/ + for (stage = 2; width > tb->termwidth && stage > 0; stage--) { + scols_reset_iter(&itr, SCOLS_ITER_BACKWARD); + + while (scols_table_next_column(tb, &itr, &cl) == 0) { + size_t old = cl->width, new, reduce; + + if (!scols_column_is_noextremes(cl) || scols_column_is_hidden(cl)) + continue; + if (!cl->wstat.width_deviation) + continue; + + new = cl->wstat.width_avg + (stage * cl->wstat.width_deviation); + if (new < cl->wstat.width_min) + new = cl->wstat.width_min; + + reduce = old - new; + if (width - reduce < tb->termwidth) + reduce = width - tb->termwidth; + + cl->width = old - reduce; + DBG(TAB, ul_debugobj(tb, " reduce to %zu (extreme %s)", + cl->width, cl->header.data)); + width -= reduce; + if (width <= tb->termwidth) + break; + } } } if (width < tb->termwidth) { - if (extremes) { - DBG(TAB, ul_debugobj(tb, " enlarge width (extreme columns)")); + if (ignore_extremes) { + if (!sorted) { + sort_columns(tb, cmp_deviation); + sorted = 1; + } - /* enlarge the first extreme column */ - scols_reset_iter(&itr, SCOLS_ITER_FORWARD); + scols_reset_iter(&itr, SCOLS_ITER_BACKWARD); while (scols_table_next_column(tb, &itr, &cl) == 0) { size_t add; - if (!cl->is_extreme || scols_column_is_hidden(cl)) + if (!scols_column_is_noextremes(cl) || scols_column_is_hidden(cl)) continue; - /* this column is too large, ignore? 
- if (cl->width_max - cl->width > - (tb->termwidth - width)) - continue; - */ - add = tb->termwidth - width; - if (add && cl->width + add > cl->width_max) - add = cl->width_max - cl->width; + if (add && cl->width + add > cl->wstat.width_max) + add = cl->wstat.width_max - cl->width; + DBG(TAB, ul_debugobj(tb, " add +%zd (extreme %s)", + add, cl->header.data)); cl->width += add; width += add; @@ -308,10 +407,12 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) /* try enlarging all columns */ while (width < tb->termwidth) { - scols_reset_iter(&itr, SCOLS_ITER_FORWARD); + scols_reset_iter(&itr, SCOLS_ITER_BACKWARD); while (scols_table_next_column(tb, &itr, &cl) == 0) { if (scols_column_is_hidden(cl)) continue; + DBG(TAB, ul_debugobj(tb, " enlarge (max-out %s)", + cl->header.data)); cl->width++; width++; if (width == tb->termwidth) @@ -320,13 +421,10 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) } } else if (width < tb->termwidth) { /* enlarge the last column */ - struct libscols_column *col = list_entry( - tb->tb_columns.prev, struct libscols_column, cl_columns); - DBG(TAB, ul_debugobj(tb, " enlarge width (last column)")); - if (!scols_column_is_right(col)) { - col->width += tb->termwidth - width; + if (!scols_column_is_right(last_cl)) { + last_cl->width += tb->termwidth - width; width = tb->termwidth; } } @@ -334,8 +432,8 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) /* bad, we have to reduce output width, this is done in three stages: * - * 1) trunc relative with trunc flag if the column width is greater than - * expected column width (it means "width_hint * terminal_width"). + * 1) trunc column with relative width hint and trunc flag if the column width + * is greater than expected column width (it means "width_hint * terminal_width"). 
* * 2) trunc all with trunc flag * @@ -350,10 +448,11 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) DBG(TAB, ul_debugobj(tb, " reduce width - #%d stage (current=%zu, wanted=%zu)", stage, width, tb->termwidth)); - scols_reset_iter(&itr, SCOLS_ITER_FORWARD); + scols_reset_iter(&itr, SCOLS_ITER_BACKWARD); while (scols_table_next_column(tb, &itr, &cl) == 0) { int trunc_flag = 0; + size_t reduce = 1; DBG(TAB, ul_debugobj(cl, " checking %s (width=%zu, treeart=%zu)", cl->header.data, cl->width, cl->width_treeart)); @@ -363,7 +462,7 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) break; /* never truncate if already minimal width */ - if (cl->width == cl->width_min) + if (cl->width == cl->wstat.width_min) continue; /* never truncate the tree */ @@ -374,6 +473,9 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) if (cl->width == 0) continue; + if (cl->wstat.width_deviation / 2 > 1.0) + reduce = (size_t) cl->wstat.width_deviation; + trunc_flag = scols_column_is_trunc(cl) || (scols_column_is_wrap(cl) && !scols_column_is_customwrap(cl)); @@ -388,8 +490,8 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) break; DBG(TAB, ul_debugobj(tb, " reducing (relative with flag)")); - cl->width--; - width--; + cl->width -= reduce; + width -= reduce; break; /* #2 stage - trunc all with TRUNC flag */ @@ -398,8 +500,8 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) break; DBG(TAB, ul_debugobj(tb, " reducing (all with flag)")); - cl->width--; - width--; + cl->width -= reduce; + width -= reduce; break; /* #3 stage - trunc relative without flag */ @@ -408,8 +510,8 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) break; DBG(TAB, ul_debugobj(tb, " reducing (relative without flag)")); - cl->width--; - width--; + cl->width -= reduce; + width -= reduce; break; } @@ -423,6 +525,11 @@ int __scols_calculate(struct libscols_table *tb, struct 
ul_buffer *buf) stage++; } + if (sorted) { + sort_columns(tb, cmp_seqnum); + sorted = 0; + } + /* ignore last column(s) or force last column to be truncated if * nowrap mode enabled */ if (tb->no_wrap && width > tb->termwidth) { @@ -446,6 +553,9 @@ int __scols_calculate(struct libscols_table *tb, struct ul_buffer *buf) } } done: + if (sorted) + sort_columns(tb, cmp_seqnum); + tb->is_dummy_print = 0; DBG(TAB, ul_debugobj(tb, "-----final width: %zu (rc=%d)-----", width, rc)); ON_DBG(TAB, dbg_columns(tb)); diff --git a/libsmartcols/src/column.c b/libsmartcols/src/column.c index 0283b976f4..8ebfa1ea27 100644 --- a/libsmartcols/src/column.c +++ b/libsmartcols/src/column.c @@ -105,14 +105,12 @@ struct libscols_column *scols_copy_column(const struct libscols_column *cl) goto err; ret->width = cl->width; - ret->width_min = cl->width_min; - ret->width_max = cl->width_max; - ret->width_avg = cl->width_avg; ret->width_hint = cl->width_hint; ret->flags = cl->flags; - ret->is_extreme = cl->is_extreme; ret->is_groups = cl->is_groups; + memcpy(&ret->wstat, &cl->wstat, sizeof(cl->wstat)); + return ret; err: scols_unref_column(ret); @@ -169,7 +167,7 @@ int scols_column_set_flags(struct libscols_column *cl, int flags) cl->table->ntreecols--; } - DBG(COL, ul_debugobj(cl, "setting flags from 0%x to 0%x", cl->flags, flags)); + DBG(COL, ul_debugobj(cl, "setting flags from 0x%04x to 0x%04x", cl->flags, flags)); cl->flags = flags; return 0; } diff --git a/libsmartcols/src/smartcolsP.h b/libsmartcols/src/smartcolsP.h index 8b83a1420c..8a7ee9b5d7 100644 --- a/libsmartcols/src/smartcolsP.h +++ b/libsmartcols/src/smartcolsP.h @@ -83,10 +83,21 @@ struct libscols_cell { char *color; void *userdata; int flags; + size_t width; }; extern int scols_line_move_cells(struct libscols_line *ln, size_t newn, size_t oldn); +struct libscols_wstat { + size_t width_min; + size_t width_max; + double width_avg; + double width_sqr_sum; + double width_deviation; + + size_t width_treeart; +}; + /* * Table column 
*/ @@ -94,15 +105,11 @@ struct libscols_column { int refcount; /* reference counter */ size_t seqnum; /* column index */ - size_t width; /* real column width */ - size_t width_min; /* minimal width (usually header width) */ - size_t width_max; /* maximal width */ - size_t width_avg; /* average width, used to detect extreme fields */ - size_t width_treeart; /* size of the tree ascii art */ + size_t width; /* expected column width */ + size_t width_treeart; double width_hint; /* hint (N < 1 is in percent of termwidth) */ - size_t extreme_sum; - int extreme_count; + struct libscols_wstat wstat; /* private __scols_calculate() data */ int json_type; /* SCOLS_JSON_* */ @@ -133,8 +140,7 @@ struct libscols_column { struct libscols_table *table; - unsigned int is_extreme : 1, /* extreme width in the column */ - is_groups : 1; /* print group chart */ + unsigned int is_groups : 1; /* print group chart */ };