Bug 579568: Search.pm: Improve the implementation and performance of

author Max Kanat-Alexander <mkanat@bugzilla.org>

Sun, 18 Jul 2010 00:52:52 +0000 (17:52 -0700)

committer Max Kanat-Alexander <mkanat@bugzilla.org>

Sun, 18 Jul 2010 00:52:52 +0000 (17:52 -0700)
author Max Kanat-Alexander <mkanat@bugzilla.org>
Sun, 18 Jul 2010 00:52:52 +0000 (17:52 -0700)
committer Max Kanat-Alexander <mkanat@bugzilla.org>
Sun, 18 Jul 2010 00:52:52 +0000 (17:52 -0700)
diff --git a/Bugzilla/DB.pm b/Bugzilla/DB.pm

index 31051dd8efcb40932a516e0ad07d0547ff562ee5..117c3a7b0143eefd502c16789fcd885f70ff396e 100644 (file)
--- a/Bugzilla/DB.pm
+++ b/Bugzilla/DB.pm
@@ -78,6 +78,21 @@ use constant ENUM_DEFAULTS => {
  # Used by Bugzilla::Bug::possible_duplicates.
  use constant FULLTEXT_OR => '';
  
+# These are used in regular expressions to mean "the start or end of a word".
+#
+# We don't use [[:<:]] and [[:>:]], even though they mean
+# "start and end of a word" and are supported by both MySQL and PostgreSQL,
+# because they don't work if your search starts or ends with a non-alphanumeric
+# character, and there's a fair chance somebody will want to use the "word"
+# search to search flags for something like "review+".
+#
+# We do use [:almum:] because it is supported by at least MySQL and
+# PostgreSQL, and hopefully will get us as much Unicode support as possible,
+# depending on how well the regexp engines of the various databases support
+# Unicode.
+use constant WORD_START => '(^|[^[:alnum:]])';
+use constant WORD_END   => '($|[^[:alnum:]])';
+
  #####################################################################
  # Connection Methods
  #####################################################################
diff --git a/Bugzilla/Search.pm b/Bugzilla/Search.pm

index 563e7590150c82074811f7ab2a725a7b32a3f20b..80e8dce4391bd54a87a4a0a415ba0a46dd0572a6 100644 (file)
--- a/Bugzilla/Search.pm
+++ b/Bugzilla/Search.pm
@@ -1050,7 +1050,9 @@ END
  
  sub _sql_where {
      my ($self, $where_terms) = @_;
-    return join(' AND ', $self->_standard_where, @$where_terms);
+    # The newline and this particular spacing makes the resulting
+    # SQL a bit more readable for debugging.
+    return join("\n   AND ", $self->_standard_where, @$where_terms);
  }
  
  ################################
@@ -1676,48 +1678,7 @@ sub SqlifyDate {
  
  sub build_subselect {
      my ($outer, $inner, $table, $cond) = @_;
-    my $q = "SELECT $inner FROM $table WHERE $cond";
-    #return "$outer IN ($q)";
-    my $dbh = Bugzilla->dbh;
-    my $list = $dbh->selectcol_arrayref($q);
-    return "1=2" unless @$list; # Could use boolean type on dbs which support it
-    return $dbh->sql_in($outer, $list);}
-
-sub GetByWordList {
-    my ($field, $strs) = (@_);
-    my @list;
-    my $dbh = Bugzilla->dbh;
-    return [] unless defined $strs;
-
-    foreach my $w (split(/[\s,]+/, $strs)) {
-        my $word = $w;
-        if ($word ne "") {
-            $word =~ tr/A-Z/a-z/;
-            $word = $dbh->quote('(^|[^a-z0-9])' . quotemeta($word) . '($|[^a-z0-9])');
-            trick_taint($word);
-            push(@list, $dbh->sql_regexp($field, $word));
-        }
-    }
-
-    return \@list;
-}
-
-# Support for "any/all/nowordssubstr" comparison type ("words as substrings")
-sub GetByWordListSubstr {
-    my ($field, $strs) = (@_);
-    my @list;
-    my $dbh = Bugzilla->dbh;
-    my $sql_word;
-
-    foreach my $word (split(/[\s,]+/, $strs)) {
-        if ($word ne "") {
-            $sql_word = $dbh->quote($word);
-            trick_taint($sql_word);
-            push(@list, $dbh->sql_iposition($sql_word, $field) . " > 0");
-        }
-    }
-
-    return \@list;
+    return "$outer IN (SELECT $inner FROM $table WHERE $cond)";
  }
  
  sub pronoun {
@@ -1741,6 +1702,10 @@ sub pronoun {
      return 0;
  }
  
+# Used by anyexact to get the list of input values. This allows us to
+# support values with commas inside of them in the standard charts, and
+# still accept string values for the boolean charts (and split them on
+# commas).
  sub _all_values {
      my ($self, $args, $split_on) = @_;
      $split_on ||= qr/[\s,]+/;
@@ -1764,6 +1729,50 @@ sub _all_values {
      return @array;
  }
  
+# Support for "any/all/nowordssubstr" comparison type ("words as substrings")
+sub _substring_terms {
+    my ($self, $args) = @_;
+    my $dbh = Bugzilla->dbh;
+
+    # We don't have to (or want to) use _all_values, because we'd just
+    # split each term on spaces and commas anyway.
+    my @words = split(/[\s,]+/, $args->{value});
+    @words = grep { defined $_ and $_ ne '' } @words;
+    @words = map { $dbh->quote($_) } @words;
+    my @terms = map { $dbh->sql_iposition($_, $args->{full_field}) . " > 0" }
+                    @words;
+    return @terms;
+}
+
+sub _word_terms {
+    my ($self, $args) = @_;
+    my $dbh = Bugzilla->dbh;
+    
+    my @values = split(/[\s,]+/, $args->{value});
+    @values = grep { defined $_ and $_ ne '' } @values;
+    my @substring_terms = $self->_substring_terms($args);
+    
+    my @terms;
+    my $start = $dbh->WORD_START;
+    my $end   = $dbh->WORD_END;
+    foreach my $word (@values) {
+        my $regex  = $start . quotemeta($word) . $end;
+        my $quoted = $dbh->quote($regex);
+        # We don't have to check the regexp, because we escaped it, so we're
+        # sure it's valid.
+        my $regex_term = $dbh->sql_regexp($args->{full_field}, $quoted,
+                                          'no check');
+        # Regular expressions are slow--substring searches are faster.
+        # If we're searching for a word, we're also certain that the
+        # substring will appear in the value. So we limit first by
+        # substring and then by a regex that will match just words.
+        my $substring_term = shift @substring_terms;
+        push(@terms, "$substring_term AND $regex_term");
+    }
+    
+    return @terms;
+}
+
  ######################
  # Public Subroutines #
  ######################
@@ -2338,9 +2347,6 @@ sub _flagtypes_name {
      };
      push(@$joins, $flagtypes_join);
      
-    # Generate the condition by running the operator-specific
-    # function. Afterwards the condition resides in the $args->{term}
-    # variable.
      my $full_field = $dbh->sql_string_concat("$flagtypes.name",
                                               "$flags.status");
      $args->{full_field} = $full_field;
@@ -2710,16 +2716,15 @@ sub _anywordsubstr {
      my ($self, $args) = @_;
      my ($full_field, $value) = @$args{qw(full_field value)};
      
-    my $list = GetByWordListSubstr($full_field, $value);
-    $args->{term} = join(" OR ", @$list);
+    my @terms = $self->_substring_terms($args);
+    $args->{term} = join("\n\tOR ", @terms);
  }
  
  sub _allwordssubstr {
      my ($self, $args) = @_;
-    my ($full_field, $value) = @$args{qw(full_field value)};
      
-    my $list = GetByWordListSubstr($full_field, $value);
-    $args->{term} = join(" AND ", @$list);
+    my @terms = $self->_substring_terms($args);
+    $args->{term} = join("\n\tAND ", @terms);
  }
  
  sub _nowordssubstr {
@@ -2731,18 +2736,19 @@ sub _nowordssubstr {
  
  sub _anywords {
      my ($self, $args) = @_;
-    my ($full_field, $value) = @$args{qw(full_field value)};
      
-    my $list = GetByWordList($full_field, $value);
-    $args->{term} = join(" OR ", @$list);
+    my @terms = $self->_word_terms($args);
+    # Because _word_terms uses AND, we need to parenthesize its terms
+    # if there are more than one.
+    @terms = map("($_)", @terms) if scalar(@terms) > 1;
+    $args->{term} = join("\n\tOR ", @terms);
  }
  
  sub _allwords {
      my ($self, $args) = @_;
-    my ($full_field, $value) = @$args{qw(full_field value)};
      
-    my $list = GetByWordList($full_field, $value);
-    $args->{term} = join(" AND ", @$list);
+    my @terms = $self->_word_terms($args);
+    $args->{term} = join("\n\tAND ", @terms);
  }
  
  sub _nowords {
diff --git a/xt/lib/Bugzilla/Test/Search/Constants.pm b/xt/lib/Bugzilla/Test/Search/Constants.pm

index b03ed30dbbebcd46f5d134d73ee301688a86e732..248b59f5327eedb7fc0e73e302556436fa3ad7fc 100644 (file)
--- a/xt/lib/Bugzilla/Test/Search/Constants.pm
+++ b/xt/lib/Bugzilla/Test/Search/Constants.pm
@@ -374,7 +374,6 @@ use constant KNOWN_BROKEN => {
      # Same for attach_data.thedata.
      'allwords-<1>' => {
          ALLWORDS_BROKEN,
-        'attach_data.thedata' => { contains => [1] },
          'flagtypes.name' => { contains => [1] },
      },
  
@@ -384,19 +383,16 @@ use constant KNOWN_BROKEN => {
      # attachments.
      nowords => {
          NOWORDS_BROKEN,
-        'attach_data.thedata' => { contains => [1,5] },
      },
  
      # anywords searches don't work on decimal values.
      # attach_data doesn't work (perhaps because it's the entire
      # data, or some problem with the regex?).
      anywords => {
-        'attach_data.thedata' => { contains => [1] },
          work_time => { contains => [1] },
      },
      'anywords-<1> <2>' => {
          percentage_complete => { contains => [2] },
-        'attach_data.thedata' => { contains => [1,2] },
          work_time => { contains => [1,2] },
      },
  
@@ -477,11 +473,8 @@ use constant KNOWN_BROKEN => {
  # where MySQL isn't, but the result is still a bit surprising to the user.
  use constant PG_BROKEN => {
      'attach_data.thedata' => {
-        allwords       => { },
-        allwordssubstr => { },
-        anywords       => { },
-        notregexp      => { contains => [5] },
-        nowords        => { contains => [5] },
+        notregexp => { contains => [5] },
+        nowords   => { contains => [5] },
      },
      percentage_complete => {
          'allwordssubstr-<1>' => { contains => [3] },
@@ -562,7 +555,6 @@ use constant NEGATIVE_BROKEN_NOT => (
  use constant BROKEN_NOT => {
      allwords       => {
          COMMON_BROKEN_NOT,
-        "attach_data.thedata" => { contains => [1,5] },
          bug_group => { contains => [1] },
          cc => { contains => [1] },
          "flagtypes.name"      => { contains => [1,5] },
@@ -617,13 +609,12 @@ use constant BROKEN_NOT => {
      },
      anywords => {
          COMMON_BROKEN_NOT,
-        "attach_data.thedata" => { contains => [1, 5] },
          "work_time"           => { contains => [1, 2] },
          "work_time"           => { contains => [1] },
          FIELD_TYPE_MULTI_SELECT, { contains => [5] },
      },
      'anywords-<1> <2>' => {
-        'attach_data.thedata' => { contains => [1,2,5] },
+        'attach_data.thedata' => { contains => [5] },
          "percentage_complete" => { contains => [1,3,4,5] },
          work_time => { contains => [1,2] },
      },
@@ -727,7 +718,6 @@ use constant BROKEN_NOT => {
      notsubstring   => { NEGATIVE_BROKEN_NOT },
      nowords        => {
          NEGATIVE_BROKEN_NOT,
-        "attach_data.thedata" => { contains => [1] },
          "work_time"           => { contains => [2, 3, 4] },
          "cc" => { contains => [5] },
          "flagtypes.name" => { },
author	Max Kanat-Alexander <mkanat@bugzilla.org>
	Sun, 18 Jul 2010 00:52:52 +0000 (17:52 -0700)
committer	Max Kanat-Alexander <mkanat@bugzilla.org>
	Sun, 18 Jul 2010 00:52:52 +0000 (17:52 -0700)
Bugzilla/DB.pm		patch \| blob \| blame \| history
Bugzilla/Search.pm		patch \| blob \| blame \| history
xt/lib/Bugzilla/Test/Search/Constants.pm		patch \| blob \| blame \| history