Bug 579568: Search.pm: Improve the implementation and performance of

substring and "words" searches, improve the formatting of generated SQL, and use real subselects instead of performing the subselect and using its results in an IN. r=mkanat, a=mkanat (module owner)
author: Max Kanat-Alexander <mkanat@bugzilla.org> 2010-07-17 17:52:52 -0700
committer: Max Kanat-Alexander <mkanat@bugzilla.org> 2010-07-17 17:52:52 -0700
commit: 7b5e8372bf4306cf99eddb474bd8b7754510a683 (patch)
tree: 4426d79fcdf18bc740273d852f8b0eeda6f1e13c
parent: Bug 67036: Allow searching for product, component, etc. names that contain (diff)
download: bugzilla-7b5e8372bf4306cf99eddb474bd8b7754510a683.tar.gz
bugzilla-7b5e8372bf4306cf99eddb474bd8b7754510a683.tar.bz2
bugzilla-7b5e8372bf4306cf99eddb474bd8b7754510a683.zip
3 files changed, 81 insertions, 70 deletions
diff --git a/Bugzilla/DB.pm b/Bugzilla/DB.pm
index 31051dd8e..117c3a7b0 100644
--- a/Bugzilla/DB.pm
+++ b/Bugzilla/DB.pm
@@ -78,6 +78,21 @@ use constant ENUM_DEFAULTS => {
 # Used by Bugzilla::Bug::possible_duplicates.
 use constant FULLTEXT_OR => '';
 
+# These are used in regular expressions to mean "the start or end of a word".
+#
+# We don't use [[:<:]] and [[:>:]], even though they mean
+# "start and end of a word" and are supported by both MySQL and PostgreSQL,
+# because they don't work if your search starts or ends with a non-alphanumeric
+# character, and there's a fair chance somebody will want to use the "word"
+# search to search flags for something like "review+".
+#
+# We do use [:almum:] because it is supported by at least MySQL and
+# PostgreSQL, and hopefully will get us as much Unicode support as possible,
+# depending on how well the regexp engines of the various databases support
+# Unicode.
+use constant WORD_START => '(^|[^[:alnum:]])';
+use constant WORD_END   => '($|[^[:alnum:]])';
+
 #####################################################################
 # Connection Methods
 #####################################################################
diff --git a/Bugzilla/Search.pm b/Bugzilla/Search.pm
index 563e75901..80e8dce43 100644
--- a/Bugzilla/Search.pm
+++ b/Bugzilla/Search.pm
@@ -1050,7 +1050,9 @@ END
 
 sub _sql_where {
     my ($self, $where_terms) = @_;
-    return join(' AND ', $self->_standard_where, @$where_terms);
+    # The newline and this particular spacing makes the resulting
+    # SQL a bit more readable for debugging.
+    return join("\n   AND ", $self->_standard_where, @$where_terms);
 }
 
 ################################
@@ -1676,48 +1678,7 @@ sub SqlifyDate {
 
 sub build_subselect {
     my ($outer, $inner, $table, $cond) = @_;
-    my $q = "SELECT $inner FROM $table WHERE $cond";
-    #return "$outer IN ($q)";
-    my $dbh = Bugzilla->dbh;
-    my $list = $dbh->selectcol_arrayref($q);
-    return "1=2" unless @$list; # Could use boolean type on dbs which support it
-    return $dbh->sql_in($outer, $list);}
-
-sub GetByWordList {
-    my ($field, $strs) = (@_);
-    my @list;
-    my $dbh = Bugzilla->dbh;
-    return [] unless defined $strs;
-
-    foreach my $w (split(/[\s,]+/, $strs)) {
-        my $word = $w;
-        if ($word ne "") {
-            $word =~ tr/A-Z/a-z/;
-            $word = $dbh->quote('(^|[^a-z0-9])' . quotemeta($word) . '($|[^a-z0-9])');
-            trick_taint($word);
-            push(@list, $dbh->sql_regexp($field, $word));
-        }
-    }
-
-    return \@list;
-}
-
-# Support for "any/all/nowordssubstr" comparison type ("words as substrings")
-sub GetByWordListSubstr {
-    my ($field, $strs) = (@_);
-    my @list;
-    my $dbh = Bugzilla->dbh;
-    my $sql_word;
-
-    foreach my $word (split(/[\s,]+/, $strs)) {
-        if ($word ne "") {
-            $sql_word = $dbh->quote($word);
-            trick_taint($sql_word);
-            push(@list, $dbh->sql_iposition($sql_word, $field) . " > 0");
-        }
-    }
-
-    return \@list;
+    return "$outer IN (SELECT $inner FROM $table WHERE $cond)";
 }
 
 sub pronoun {
@@ -1741,6 +1702,10 @@ sub pronoun {
     return 0;
 }
 
+# Used by anyexact to get the list of input values. This allows us to
+# support values with commas inside of them in the standard charts, and
+# still accept string values for the boolean charts (and split them on
+# commas).
 sub _all_values {
     my ($self, $args, $split_on) = @_;
     $split_on ||= qr/[\s,]+/;
@@ -1764,6 +1729,50 @@ sub _all_values {
     return @array;
 }
 
+# Support for "any/all/nowordssubstr" comparison type ("words as substrings")
+sub _substring_terms {
+    my ($self, $args) = @_;
+    my $dbh = Bugzilla->dbh;
+
+    # We don't have to (or want to) use _all_values, because we'd just
+    # split each term on spaces and commas anyway.
+    my @words = split(/[\s,]+/, $args->{value});
+    @words = grep { defined $_ and $_ ne '' } @words;
+    @words = map { $dbh->quote($_) } @words;
+    my @terms = map { $dbh->sql_iposition($_, $args->{full_field}) . " > 0" }
+                    @words;
+    return @terms;
+}
+
+sub _word_terms {
+    my ($self, $args) = @_;
+    my $dbh = Bugzilla->dbh;
+    
+    my @values = split(/[\s,]+/, $args->{value});
+    @values = grep { defined $_ and $_ ne '' } @values;
+    my @substring_terms = $self->_substring_terms($args);
+    
+    my @terms;
+    my $start = $dbh->WORD_START;
+    my $end   = $dbh->WORD_END;
+    foreach my $word (@values) {
+        my $regex  = $start . quotemeta($word) . $end;
+        my $quoted = $dbh->quote($regex);
+        # We don't have to check the regexp, because we escaped it, so we're
+        # sure it's valid.
+        my $regex_term = $dbh->sql_regexp($args->{full_field}, $quoted,
+                                          'no check');
+        # Regular expressions are slow--substring searches are faster.
+        # If we're searching for a word, we're also certain that the
+        # substring will appear in the value. So we limit first by
+        # substring and then by a regex that will match just words.
+        my $substring_term = shift @substring_terms;
+        push(@terms, "$substring_term AND $regex_term");
+    }
+    
+    return @terms;
+}
+
 ######################
 # Public Subroutines #
 ######################
@@ -2338,9 +2347,6 @@ sub _flagtypes_name {
     };
     push(@$joins, $flagtypes_join);
     
-    # Generate the condition by running the operator-specific
-    # function. Afterwards the condition resides in the $args->{term}
-    # variable.
     my $full_field = $dbh->sql_string_concat("$flagtypes.name",
                                              "$flags.status");
     $args->{full_field} = $full_field;
@@ -2710,16 +2716,15 @@ sub _anywordsubstr {
     my ($self, $args) = @_;
     my ($full_field, $value) = @$args{qw(full_field value)};
     
-    my $list = GetByWordListSubstr($full_field, $value);
-    $args->{term} = join(" OR ", @$list);
+    my @terms = $self->_substring_terms($args);
+    $args->{term} = join("\n\tOR ", @terms);
 }
 
 sub _allwordssubstr {
     my ($self, $args) = @_;
-    my ($full_field, $value) = @$args{qw(full_field value)};
     
-    my $list = GetByWordListSubstr($full_field, $value);
-    $args->{term} = join(" AND ", @$list);
+    my @terms = $self->_substring_terms($args);
+    $args->{term} = join("\n\tAND ", @terms);
 }
 
 sub _nowordssubstr {
@@ -2731,18 +2736,19 @@ sub _nowordssubstr {
 
 sub _anywords {
     my ($self, $args) = @_;
-    my ($full_field, $value) = @$args{qw(full_field value)};
     
-    my $list = GetByWordList($full_field, $value);
-    $args->{term} = join(" OR ", @$list);
+    my @terms = $self->_word_terms($args);
+    # Because _word_terms uses AND, we need to parenthesize its terms
+    # if there are more than one.
+    @terms = map("($_)", @terms) if scalar(@terms) > 1;
+    $args->{term} = join("\n\tOR ", @terms);
 }
 
 sub _allwords {
     my ($self, $args) = @_;
-    my ($full_field, $value) = @$args{qw(full_field value)};
     
-    my $list = GetByWordList($full_field, $value);
-    $args->{term} = join(" AND ", @$list);
+    my @terms = $self->_word_terms($args);
+    $args->{term} = join("\n\tAND ", @terms);
 }
 
 sub _nowords {
diff --git a/xt/lib/Bugzilla/Test/Search/Constants.pm b/xt/lib/Bugzilla/Test/Search/Constants.pm
index b03ed30db..248b59f53 100644
--- a/xt/lib/Bugzilla/Test/Search/Constants.pm
+++ b/xt/lib/Bugzilla/Test/Search/Constants.pm
@@ -374,7 +374,6 @@ use constant KNOWN_BROKEN => {
     # Same for attach_data.thedata.
     'allwords-<1>' => {
         ALLWORDS_BROKEN,
-        'attach_data.thedata' => { contains => [1] },
         'flagtypes.name' => { contains => [1] },
     },
 
@@ -384,19 +383,16 @@ use constant KNOWN_BROKEN => {
     # attachments.
     nowords => {
         NOWORDS_BROKEN,
-        'attach_data.thedata' => { contains => [1,5] },
     },
 
     # anywords searches don't work on decimal values.
     # attach_data doesn't work (perhaps because it's the entire
     # data, or some problem with the regex?).
     anywords => {
-        'attach_data.thedata' => { contains => [1] },
         work_time => { contains => [1] },
     },
     'anywords-<1> <2>' => {
         percentage_complete => { contains => [2] },
-        'attach_data.thedata' => { contains => [1,2] },
         work_time => { contains => [1,2] },
     },
 
@@ -477,11 +473,8 @@ use constant KNOWN_BROKEN => {
 # where MySQL isn't, but the result is still a bit surprising to the user.
 use constant PG_BROKEN => {
     'attach_data.thedata' => {
-        allwords       => { },
-        allwordssubstr => { },
-        anywords       => { },
-        notregexp      => { contains => [5] },
-        nowords        => { contains => [5] },
+        notregexp => { contains => [5] },
+        nowords   => { contains => [5] },
     },
     percentage_complete => {
         'allwordssubstr-<1>' => { contains => [3] },
@@ -562,7 +555,6 @@ use constant NEGATIVE_BROKEN_NOT => (
 use constant BROKEN_NOT => {
     allwords       => {
         COMMON_BROKEN_NOT,
-        "attach_data.thedata" => { contains => [1,5] },
         bug_group => { contains => [1] },
         cc => { contains => [1] },
         "flagtypes.name"      => { contains => [1,5] },
@@ -617,13 +609,12 @@ use constant BROKEN_NOT => {
     },
     anywords => {
         COMMON_BROKEN_NOT,
-        "attach_data.thedata" => { contains => [1, 5] },
         "work_time"           => { contains => [1, 2] },
         "work_time"           => { contains => [1] },
         FIELD_TYPE_MULTI_SELECT, { contains => [5] },
     },
     'anywords-<1> <2>' => {
-        'attach_data.thedata' => { contains => [1,2,5] },
+        'attach_data.thedata' => { contains => [5] },
         "percentage_complete" => { contains => [1,3,4,5] },
         work_time => { contains => [1,2] },
     },
@@ -727,7 +718,6 @@ use constant BROKEN_NOT => {
     notsubstring   => { NEGATIVE_BROKEN_NOT },
     nowords        => {
         NEGATIVE_BROKEN_NOT,
-        "attach_data.thedata" => { contains => [1] },
         "work_time"           => { contains => [2, 3, 4] },
         "cc" => { contains => [5] },
         "flagtypes.name" => { },
author	Max Kanat-Alexander <mkanat@bugzilla.org>	2010-07-17 17:52:52 -0700
committer	Max Kanat-Alexander <mkanat@bugzilla.org>	2010-07-17 17:52:52 -0700
commit	7b5e8372bf4306cf99eddb474bd8b7754510a683 (patch)
tree	4426d79fcdf18bc740273d852f8b0eeda6f1e13c
parent	Bug 67036: Allow searching for product, component, etc. names that contain (diff)
download	bugzilla-7b5e8372bf4306cf99eddb474bd8b7754510a683.tar.gz bugzilla-7b5e8372bf4306cf99eddb474bd8b7754510a683.tar.bz2 bugzilla-7b5e8372bf4306cf99eddb474bd8b7754510a683.zip