Bug 633776: Automatic charset detection for text attachments

author Byron Jones <bjones@mozilla.com>

Wed, 9 Mar 2011 09:46:02 +0000 (17:46 +0800)

committer Byron Jones <bjones@mozilla.com>

Wed, 9 Mar 2011 09:46:02 +0000 (17:46 +0800)
author Byron Jones <bjones@mozilla.com>
Wed, 9 Mar 2011 09:46:02 +0000 (17:46 +0800)
committer Byron Jones <bjones@mozilla.com>
Wed, 9 Mar 2011 09:46:02 +0000 (17:46 +0800)
diff --git a/Bugzilla/Config/Attachment.pm b/Bugzilla/Config/Attachment.pm

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/Bugzilla/Config/Common.pm b/Bugzilla/Config/Common.pm

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/Bugzilla/Install/Requirements.pm b/Bugzilla/Install/Requirements.pm

old mode 100644 (file)

new mode 100755 (executable)

index 047ed36..5b8f77e
--- a/Bugzilla/Install/Requirements.pm
+++ b/Bugzilla/Install/Requirements.pm
@@ -291,6 +291,19 @@ sub OPTIONAL_MODULES {
          version => 0,
          feature => ['html_desc'],
      },
+    {
+        # we need version 2.21 of Encode for mime_name
+        package => 'Encode',
+        module  => 'Encode',
+        version => 2.21,
+        feature => ['detect_charset'],
+    },
+    {
+        package => 'Encode-Detect',
+        module  => 'Encode::Detect',
+        version => 0,
+        feature => ['detect_charset'],
+    },
  
      # Inbound Email
      {
diff --git a/Bugzilla/Util.pm b/Bugzilla/Util.pm

old mode 100644 (file)

new mode 100755 (executable)

index 058a49a..ced1549
--- a/Bugzilla/Util.pm
+++ b/Bugzilla/Util.pm
@@ -43,7 +43,8 @@ use base qw(Exporter);
                               file_mod_time is_7bit_clean
                               bz_crypt generate_random_password
                               validate_email_syntax clean_text
-                             get_text template_var disable_utf8);
+                             get_text template_var disable_utf8
+                             detect_encoding);
  
  use Bugzilla::Constants;
  
@@ -58,6 +59,8 @@ use Math::Random::Secure qw(irand);
  use Scalar::Util qw(tainted blessed);
  use Template::Filters;
  use Text::Wrap;
+use Encode qw(encode decode resolve_alias);
+use Encode::Guess;
  
  sub trick_taint {
      require Carp;
@@ -673,6 +676,63 @@ sub disable_utf8 {
      }
  }
  
+use constant UTF8_ACCIDENTAL => qw(shiftjis big5-eten euc-kr euc-jp);
+
+# Returns the canonical name of the encoding $data appears to be in, or undef.
+sub detect_encoding {
+    my $data = shift;
+
+    if (!Bugzilla->feature('detect_charset')) {
+        require Bugzilla::Error;
+        Bugzilla::Error::ThrowCodeError('feature_disabled',
+            { feature => 'detect_charset' });
+    }
+
+    require Encode::Detect::Detector;
+    Encode::Detect::Detector->import('detect');
+
+    my $encoding = detect($data);
+    $encoding = resolve_alias($encoding) if $encoding;
+
+    # Encode::Detect is bad at detecting certain charsets, but Encode::Guess
+    # is better at them. Here are the details:
+    #
+    # UTF8_ACCIDENTAL encodings: Encode::Detect tends to mis-detect UTF-8
+    # strings as these, so defer to Encode::Guess (which may pick nothing).
+    if ($encoding && grep($_ eq $encoding, UTF8_ACCIDENTAL)) {
+        $encoding = undef;
+        my $decoder = guess_encoding($data, UTF8_ACCIDENTAL);
+        $encoding = $decoder->name if ref $decoder;
+    }
+
+    # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8,
+    # but Encode::Guess can usually tell which one it is.
+    if ($encoding && $encoding eq 'iso-8859-8') {
+        my $decoded_as = _guess_iso($data, 'iso-8859-8',
+            # These are ordered this way because it gives the most
+            # accurate results.
+            qw(iso-8859-7 iso-8859-2));
+        $encoding = $decoded_as if $decoded_as;
+    }
+
+    return $encoding;
+}
+
+# A helper for detect_encoding: tries each of @isos against $versus and
+# returns the first name Encode::Guess settles on (undef if it never does).
+sub _guess_iso {
+    # Take every remaining argument, so all candidate ISOs are tried.
+    my ($data, $versus, @isos) = @_;
+    my $encoding;
+    foreach my $iso (@isos) {
+        my $decoder = guess_encoding($data, ($iso, $versus));
+        next unless ref $decoder;    # a non-ref result is an error message
+        $encoding = $decoder->name;
+        last;
+    }
+    return $encoding;
+}
+
  1;
  
  __END__
@@ -903,6 +963,12 @@ ASCII 10 (LineFeed) and ASCII 13 (Carrage Return).
  
  Disable utf8 on STDOUT (and display raw data instead).
  
+=item C<detect_encoding($str)>
+
+Guesses which encoding the given data is in, returning the canonical name of
+the detected encoding (which may differ from its MIME charset name), or undef
+if no encoding could be detected. Requires the C<detect_charset> feature.
+
  =item C<clean_text($str)>
  Returns the parameter "cleaned" by exchanging non-printable characters with spaces.
  Specifically characters (ASCII 0 through 31) and (ASCII 127) will become ASCII 32 (Space).
diff --git a/attachment.cgi b/attachment.cgi

index 9273b5f2989a56fb04b62f6eb95876597c033a5b..8ea802f44fd505443e5a85c9458dab862b0ec961 100755 (executable)
--- a/attachment.cgi
+++ b/attachment.cgi
@@ -53,7 +53,7 @@ use Bugzilla::Attachment::PatchReader;
  use Bugzilla::Token;
  use Bugzilla::Keyword;
  
-use Encode qw(encode);
+use Encode qw(encode find_encoding);
  
  # For most scripts we don't make $cgi and $template global variables. But
  # when preparing Bugzilla for mod_perl, this script used these
@@ -335,6 +335,12 @@ sub view {
          # In order to prevent Apache from adding a charset, we have to send a
          # charset that's a single space.
          $cgi->charset(' ');
+        if (Bugzilla->feature('detect_charset') && $contenttype =~ /^text\//) {
+            my $encoding = detect_encoding($attachment->data);
+            if ($encoding) {
+                $cgi->charset(find_encoding($encoding)->mime_name);
+            }
+        }
      }
      print $cgi->header(-type=>"$contenttype; name=\"$filename\"",
                         -content_disposition=> "$disposition; filename=\"$filename\"",
diff --git a/contrib/recode.pl b/contrib/recode.pl

index f7ba034aca25558debb14e329f879ca1b7789e00..f8de12eb1e18105b4cbfe9b050863e4e23fd2e17 100755 (executable)
--- a/contrib/recode.pl
+++ b/contrib/recode.pl
@@ -24,10 +24,10 @@ use lib qw(. lib);
  
  use Bugzilla;
  use Bugzilla::Constants;
+use Bugzilla::Util qw(detect_encoding);
  
  use Digest::MD5 qw(md5_base64);
  use Encode qw(encode decode resolve_alias is_utf8);
-use Encode::Guess;
  use Getopt::Long;
  use Pod::Usage;
  
@@ -71,53 +71,6 @@ sub trunc {
      return $truncated;
  }
  
-sub do_guess {
-    my ($data) = @_;
-
-    my $encoding = detect($data);
-    $encoding = resolve_alias($encoding) if $encoding;
-
-    # Encode::Detect is bad at detecting certain charsets, but Encode::Guess
-    # is better at them. Here's the details:
-
-    # shiftjis, big5-eten, euc-kr, and euc-jp: (Encode::Detect
-    # tends to accidentally mis-detect UTF-8 strings as being
-    # these encodings.)
-    my @utf8_accidental = qw(shiftjis big5-eten euc-kr euc-jp);
-    if ($encoding && grep($_ eq $encoding, @utf8_accidental)) {
-        $encoding = undef;
-        my $decoder = guess_encoding($data, @utf8_accidental);
-        $encoding = $decoder->name if ref $decoder;
-    }
-
-    # Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8,
-    # but Encode::Guess can usually tell which one it is.
-    if ($encoding && $encoding eq 'iso-8859-8') {
-        my $decoded_as = guess_iso($data, 'iso-8859-8', 
-            # These are ordered this way because it gives the most 
-            # accurate results.
-            qw(iso-8859-7 iso-8859-2));
-        $encoding = $decoded_as if $decoded_as;
-    }
-
-    return $encoding;
-}
-
-# A helper for do_guess.
-sub guess_iso {
-    my ($data, $versus, @isos) = @_;
-
-    my $encoding;
-    foreach my $iso (@isos) {
-        my $decoder = guess_encoding($data, ($iso, $versus));
-        if (ref $decoder) {
-            $encoding = $decoder->name if ref $decoder;
-            last;
-        }
-    }
-    return $encoding;
-}
-
  sub is_valid_utf8 {
      my ($str) = @_;
      Encode::_utf8_on($str);
@@ -143,8 +96,6 @@ if (exists $switch{'charset'}) {
  }
  
  if ($switch{'guess'}) {
-    # Encode::Detect::Detector doesn't seem to return a true value.
-    # So we have to check if we can run detect.
      if (!eval { require Encode::Detect::Detector }) {
          my $root = ROOT_USER;
          print STDERR <<EOT;
@@ -156,8 +107,6 @@ Encode::Detect, run the following command:
  EOT
          exit;
      }
-
-    import Encode::Detect::Detector qw(detect);
  }
  
  my %overrides;
@@ -255,7 +204,7 @@ foreach my $table ($dbh->bz_table_list_real) {
  
                  my $encoding;
                  if ($switch{'guess'}) {
-                    $encoding = do_guess($data);
+                    $encoding = detect_encoding($data);
  
                      # We only show failures if they don't appear to be
                      # ASCII.
diff --git a/template/en/default/admin/params/attachment.html.tmpl b/template/en/default/admin/params/attachment.html.tmpl

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/template/en/default/setup/strings.txt.pl b/template/en/default/setup/strings.txt.pl

old mode 100644 (file)

new mode 100755 (executable)

index fe4f65e..2284c87
--- a/template/en/default/setup/strings.txt.pl
+++ b/template/en/default/setup/strings.txt.pl
@@ -108,6 +108,7 @@ END
      feature_smtp_auth         => 'SMTP Authentication',
      feature_updates           => 'Automatic Update Notifications',
      feature_xmlrpc            => 'XML-RPC Interface',
+    feature_detect_charset    => 'Automatic charset detection for text attachments',
  
      file_remove => 'Removing ##name##...',
      file_rename => 'Renaming ##from## to ##to##...',
author	Byron Jones <bjones@mozilla.com>
	Wed, 9 Mar 2011 09:46:02 +0000 (17:46 +0800)
committer	Byron Jones <bjones@mozilla.com>
	Wed, 9 Mar 2011 09:46:02 +0000 (17:46 +0800)
Bugzilla/Config/Attachment.pm	[changed mode: 0644->0755]	patch \| blob \| blame \| history
Bugzilla/Config/Common.pm	[changed mode: 0644->0755]	patch \| blob \| blame \| history
Bugzilla/Install/Requirements.pm	[changed mode: 0644->0755]	patch \| blob \| blame \| history
Bugzilla/Util.pm	[changed mode: 0644->0755]	patch \| blob \| blame \| history
attachment.cgi		patch \| blob \| blame \| history
contrib/recode.pl		patch \| blob \| blame \| history
template/en/default/admin/params/attachment.html.tmpl	[changed mode: 0644->0755]	patch \| blob \| blame \| history
template/en/default/setup/strings.txt.pl	[changed mode: 0644->0755]	patch \| blob \| blame \| history