From: Eric Wong Date: Mon, 9 Oct 2023 17:56:23 +0000 (+0000) Subject: www_coderepo: fix handling of non-UTF-8 git data X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5754faeb3fa1c9aaeff8922b449127cfbc86236d;p=thirdparty%2Fpublic-inbox.git www_coderepo: fix handling of non-UTF-8 git data We can't assume git output is UTF-8, and we'll always have legacy data in git coderepos. So attempt to display some some garbled text rather than nothing at all if Perl croaks on it. sox commit c38987e8d20505621b8d872863afa7d233ed1096 (Added raw inverse-bit u-law and A-law support. Updated *.txt files., 2001-12-13) is an example of a commit which caused problems for me. --- diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index 0677865e3..e9b9ae649 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -4,13 +4,13 @@ # represents a header value in various forms. Used for HTML generation # in our web interface(s) package PublicInbox::Hval; +use v5.10.1; # be careful about unicode_strings in v5.12; use strict; -use warnings; use Encode qw(find_encoding); use PublicInbox::MID qw/mid_clean mid_escape/; use base qw/Exporter/; our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape - to_attr prurl mid_href fmt_ts ts2str/; + to_attr prurl mid_href fmt_ts ts2str utf8_maybe/; use POSIX qw(strftime); my $enc_ascii = find_encoding('us-ascii'); @@ -137,4 +137,9 @@ sub ts2str ($) { strftime('%Y%m%d%H%M%S', gmtime($_[0])) }; # human-friendly format sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) } +sub utf8_maybe ($) { + utf8::decode($_[0]); + utf8::valid($_[0]) or utf8::encode($_[0]); # non-UTF-8 data exists +} + 1; diff --git a/lib/PublicInbox/RepoAtom.pm b/lib/PublicInbox/RepoAtom.pm index c89d45514..79b76c12a 100644 --- a/lib/PublicInbox/RepoAtom.pm +++ b/lib/PublicInbox/RepoAtom.pm @@ -8,7 +8,7 @@ use parent qw(PublicInbox::GzipFilter); use POSIX qw(strftime); use URI::Escape qw(uri_escape); use Scalar::Util (); -use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Hval qw(ascii_html utf8_maybe); # git for-each-ref and log use different format fields :< my $ATOM_FMT = '--pretty=tformat:'.join('%n', @@ -50,7 +50,7 @@ sub translate { my $is_tag = $self->{-is_tag}; my ($H, $ct, $an, $ae, $at, $s, $bdy); while ($lbuf =~ s/\A([^\0]+)\0\n//s) { - utf8::decode($bdy = $1); + utf8_maybe($bdy = $1); if ($is_tag) { my %r; eval "$bdy"; diff --git a/lib/PublicInbox/RepoTree.pm b/lib/PublicInbox/RepoTree.pm index 9c7b86b33..5c73531a8 100644 --- a/lib/PublicInbox/RepoTree.pm +++ b/lib/PublicInbox/RepoTree.pm @@ -8,7 +8,7 @@ use PublicInbox::ViewDiff qw(uri_escape_path); use PublicInbox::WwwStatic qw(r); use PublicInbox::Qspawn; use PublicInbox::WwwStream qw(html_oneshot); -use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Hval qw(ascii_html utf8_maybe); sub rd_404_log { my ($bref, $ctx) = @_; @@ -26,7 +26,7 @@ sub rd_404_log { $code = 404; } else { my ($H, $h, $s_as) = split(/ /, $$bref, 3); - utf8::decode($s_as); + utf8_maybe($s_as); my $x = uri_escape_path($ctx->{-path}); $s_as = ascii_html($s_as); print $zfh <new; - utf8::decode($$bref); + utf8_maybe($$bref); html_page($ctx, 200, '
', $l->to_html($$bref), '

', dbg_log($ctx)); } sub cmt_title { # git->cat_async callback my ($bref, $oid, $type, $size, $ctx) = @_; - utf8::decode($$bref); + utf8_maybe($$bref); my $title = $$bref =~ /\r?\n\r?\n([^\r\n]+)\r?\n?/ ? $1 : ''; push(@{$ctx->{-cmt_pt}} , ascii_html($title)) == @{$ctx->{-cmt_P}} and cmt_finalize($ctx); @@ -160,8 +160,7 @@ sub show_commit_start { # ->psgi_qx callback open my $fh, '<', "$ctx->{-tmp}/h" or die "open $ctx->{-tmp}/h: $!"; chop(my $buf = do { local $/ = "\0"; <$fh> }); - utf8::decode($buf); - utf8::valid($buf) or utf8::encode($buf); # non-UTF-8 commits exist + utf8_maybe($buf); # non-UTF-8 commits exist chomp $buf; my ($P, $p); ($P, $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9); @@ -248,12 +247,12 @@ committer $co EOM print $zfh "\n", $ctx->{-linkify}->to_html($bdy) if length($bdy); $bdy = ''; - open my $fh, '<:utf8', "$ctx->{-tmp}/p" or - die "open $ctx->{-tmp}/p: $!"; + open my $fh, '<', "$ctx->{-tmp}/p" or die "open $ctx->{-tmp}/p: $!"; if (-s $fh > $MAX_SIZE) { print $zfh "---\n patch is too large to show\n"; } else { # prepare flush_diff: read($fh, $x, -s _); + utf8_maybe($x); $ctx->{-apfx} = $ctx->{-spfx} = $upfx; $x =~ s/\r?\n/\n/gs; $ctx->{-anchors} = {} if $x =~ /^diff --git /sm; @@ -418,7 +417,7 @@ EOM undef $_; ($m, $t, $oid, $sz) = split(/ +/, $x, 4); $m = $GIT_MODE{$m} // '?'; - utf8::decode($f); + utf8_maybe($f); $n = ascii_html($f); if ($m eq 'g') { # gitlink submodule commit $$bref .= "\ng\t\t$n @ commit$oid"; @@ -480,7 +479,7 @@ sub tz_adj ($) { sub show_tag_result { # git->cat_async callback my ($bref, $oid, $type, $size, $ctx) = @_; - utf8::decode($$bref); + utf8_maybe($$bref); my $l = PublicInbox::Linkify->new; $$bref = $l->to_html($$bref); $$bref =~ s!^object ([a-f0-9]+)!object cat_async callback " $raw_more".dbg_log($ctx)); # TODO: detect + convert to ensure validity - utf8::decode($$blob); + utf8_maybe($$blob); my $nl = ($$blob =~ s/\r?\n/\n/sg); my $pad = length($nl); diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm index 834145e9f..e8c340b5d 100644 --- a/lib/PublicInbox/WwwCoderepo.pm +++ b/lib/PublicInbox/WwwCoderepo.pm @@ -14,7 +14,7 @@ use PublicInbox::ViewVCS; use PublicInbox::WwwStatic qw(r); use PublicInbox::GitHTTPBackend; use PublicInbox::WwwStream; -use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Hval qw(ascii_html utf8_maybe); use PublicInbox::ViewDiff qw(uri_escape_path); use PublicInbox::RepoSnapshot; use PublicInbox::RepoAtom; @@ -179,7 +179,7 @@ EOM sub capture { # psgi_qx callback to capture git-for-each-ref my ($bref, $arg) = @_; # arg = [ctx, key, OnDestroy(summary_END)] - utf8::decode($$bref); + utf8_maybe($$bref); $arg->[0]->{qx_res}->{$arg->[1]} = $$bref; # summary_END may be called via OnDestroy $arg->[2] } @@ -241,13 +241,13 @@ sub translate { $fbuf .= shift while @_; if ($ctx->{-heads}) { while ($fbuf =~ s/\A([^\n]+)\n//s) { - utf8::decode(my $x = $1); + utf8_maybe(my $x = $1); push @out, _refs_heads_link($x, '../../'); } } else { my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx); while ($fbuf =~ s/\A([^\n]+)\n//s) { - utf8::decode(my $x = $1); + utf8_maybe(my $x = $1); push @out, _refs_tags_link($x, '../../', $snap_pfx, @snap_fmt); } diff --git a/xt/solver.t b/xt/solver.t index 06f5a4936..51b4144c2 100644 --- a/xt/solver.t +++ b/xt/solver.t @@ -32,23 +32,30 @@ my $todo = { 'c2f3bf071ee90b01f2d629921bb04c4f798f02fa/s/', # tag '7eb93c89651c47c8095d476251f2e4314656b292/s/', # non-UTF-8 ], + 'sox-devel' => [ + 'c38987e8d20505621b8d872863afa7d233ed1096/s/', # non-UTF-8 + ] }; -my ($ibx_name, $urls, @gone); +my @gone; my $client = sub { my ($cb) = @_; - for my $u (@$urls) { - my $url = "/$ibx_name/$u"; - my $res = $cb->(GET($url)); - is($res->code, 200, $url); - next if $res->code == 200; - diag "$url failed"; - diag $res->content; + for my $ibx_name (sort keys %$todo) { + diag "testing $ibx_name"; + my $urls = $todo->{$ibx_name}; + for my $u (@$urls) { + my $url = "/$ibx_name/$u"; + my $res = $cb->(GET($url)); + is($res->code, 200, $url); + next if $res->code == 200; + diag "$url failed"; + diag $res->content; + } } }; my $nr = 0; -while (($ibx_name, $urls) = each %$todo) { +while (my ($ibx_name, $urls) = each %$todo) { SKIP: { my $ibx = $cfg->lookup_name($ibx_name); if (!$ibx) { @@ -61,15 +68,13 @@ while (($ibx_name, $urls) = each %$todo) { skip(qq{publicinbox.$ibx_name.coderepo not configured}, scalar(@$urls)); } - test_psgi($app, $client); $nr++; } } delete @$todo{@gone}; +test_psgi($app, $client); my $env = { PI_CONFIG => PublicInbox::Config->default_file }; -while (($ibx_name, $urls) = each %$todo) { - test_httpd($env, $client, $nr); -} +test_httpd($env, $client, $nr); done_testing();