a somewhat messy improvement of re.sub. makes simple things 3x faster (not

quite beating CPython)
author: Carl Friedrich Bolz <cfbolz@gmx.de> 2016-03-04 10:27:07 +0100
committer: Carl Friedrich Bolz <cfbolz@gmx.de> 2016-03-04 10:27:07 +0100
commit: 0f08f31e6f620a71459dfc43964b0da23eaf1ec7 (patch)
tree: 458fb1ae70b3fc91d417e6a623b5f84a1a363d8c /pypy/module
parent: don't go via the less efficient BufMatchContext if the string is simply a str (diff)
download: pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.tar.gz
pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.tar.bz2
pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.zip
1 files changed, 67 insertions, 15 deletions
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
index 03bc219701..5546402e55 100644
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -7,6 +7,7 @@ from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter.error import OperationError
 from rpython.rlib.rarithmetic import intmask
 from rpython.rlib import jit
+from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 
 # ____________________________________________________________
 #
@@ -226,6 +227,11 @@ class W_SRE_Pattern(W_Root):
 
     def subx(self, w_ptemplate, w_string, count):
         space = self.space
+        # use a (much faster) string/unicode builder if w_ptemplate and
+        # w_string are both string or both unicode objects, and if w_ptemplate
+        # is a literal
+        use_builder = False
+        filter_as_unicode = filter_as_string = None
         if space.is_true(space.callable(w_ptemplate)):
             w_filter = w_ptemplate
             filter_is_callable = True
@@ -233,6 +239,8 @@ class W_SRE_Pattern(W_Root):
             if space.isinstance_w(w_ptemplate, space.w_unicode):
                 filter_as_unicode = space.unicode_w(w_ptemplate)
                 literal = u'\\' not in filter_as_unicode
+                use_builder = (
+                    space.isinstance_w(w_string, space.w_unicode) and literal)
             else:
                 try:
                     filter_as_string = space.str_w(w_ptemplate)
@@ -242,6 +250,8 @@ class W_SRE_Pattern(W_Root):
                     literal = False
                 else:
                     literal = '\\' not in filter_as_string
+                    use_builder = (
+                        space.isinstance_w(w_string, space.w_str) and literal)
             if literal:
                 w_filter = w_ptemplate
                 filter_is_callable = False
@@ -252,19 +262,28 @@ class W_SRE_Pattern(W_Root):
                                              space.wrap(self), w_ptemplate)
                 filter_is_callable = space.is_true(space.callable(w_filter))
         #
+        # XXX this is a bit of a mess, but it improves performance a lot
         ctx = self.make_ctx(w_string)
-        sublist_w = []
+        sublist_w = strbuilder = unicodebuilder = None
+        if use_builder:
+            if filter_as_unicode is not None:
+                unicodebuilder = UnicodeBuilder(ctx.end)
+            else:
+                assert filter_as_string is not None
+                strbuilder = StringBuilder(ctx.end)
+        else:
+            sublist_w = []
         n = last_pos = 0
         while not count or n < count:
             if not searchcontext(space, ctx):
                 break
             if last_pos < ctx.match_start:
-                sublist_w.append(slice_w(space, ctx, last_pos,
-                                         ctx.match_start, space.w_None))
+                _sub_append_slice(
+                    ctx, space, use_builder, sublist_w,
+                    strbuilder, unicodebuilder, last_pos, ctx.match_start)
             start = ctx.match_end
             if start == ctx.match_start:
                 start += 1
-            nextctx = ctx.fresh_copy(start)
             if not (last_pos == ctx.match_start
                              == ctx.match_end and n > 0):
                 # the above ignores empty matches on latest position
@@ -272,27 +291,60 @@ class W_SRE_Pattern(W_Root):
                     w_match = self.getmatch(ctx, True)
                     w_piece = space.call_function(w_filter, w_match)
                     if not space.is_w(w_piece, space.w_None):
+                        assert strbuilder is None and unicodebuilder is None
+                        assert not use_builder
                         sublist_w.append(w_piece)
                 else:
-                    sublist_w.append(w_filter)
+                    if use_builder:
+                        if strbuilder is not None:
+                            assert filter_as_string is not None
+                            strbuilder.append(filter_as_string)
+                        else:
+                            assert unicodebuilder is not None
+                            assert filter_as_unicode is not None
+                            unicodebuilder.append(filter_as_unicode)
+                    else:
+                        sublist_w.append(w_filter)
                 last_pos = ctx.match_end
                 n += 1
             elif last_pos >= ctx.end:
                 break    # empty match at the end: finished
-            ctx = nextctx
+            ctx.reset(start)
 
         if last_pos < ctx.end:
-            sublist_w.append(slice_w(space, ctx, last_pos, ctx.end,
-                                     space.w_None))
-
-        if space.isinstance_w(w_string, space.w_unicode):
-            w_emptystr = space.wrap(u'')
+            _sub_append_slice(ctx, space, use_builder, sublist_w,
+                              strbuilder, unicodebuilder, last_pos, ctx.end)
+        if use_builder:
+            if strbuilder is not None:
+                return space.wrap(strbuilder.build()), n
+            else:
+                assert unicodebuilder is not None
+                return space.wrap(unicodebuilder.build()), n
         else:
-            w_emptystr = space.wrap('')
-        w_item = space.call_method(w_emptystr, 'join',
-                                   space.newlist(sublist_w))
-        return w_item, n
+            if space.isinstance_w(w_string, space.w_unicode):
+                w_emptystr = space.wrap(u'')
+            else:
+                w_emptystr = space.wrap('')
+            w_item = space.call_method(w_emptystr, 'join',
+                                       space.newlist(sublist_w))
+            return w_item, n
+
 
+def _sub_append_slice(ctx, space, use_builder, sublist_w,
+                      strbuilder, unicodebuilder, start, end):
+    if use_builder:
+        if isinstance(ctx, rsre_core.BufMatchContext):
+            assert strbuilder is not None
+            return strbuilder.append(ctx._buffer.getslice(start, end, 1, end-start))
+        if isinstance(ctx, rsre_core.StrMatchContext):
+            assert strbuilder is not None
+            return strbuilder.append_slice(ctx._string, start, end)
+        elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+            assert unicodebuilder is not None
+            return unicodebuilder.append_slice(ctx._unicodestr, start, end)
+        assert 0, "unreachable"
+    else:
+        sublist_w.append(slice_w(space, ctx, start, end, space.w_None))
 
 @unwrap_spec(flags=int, groups=int, w_groupindex=WrappedDefault(None),
              w_indexgroup=WrappedDefault(None))
author	Carl Friedrich Bolz <cfbolz@gmx.de>	2016-03-04 10:27:07 +0100
committer	Carl Friedrich Bolz <cfbolz@gmx.de>	2016-03-04 10:27:07 +0100
commit	0f08f31e6f620a71459dfc43964b0da23eaf1ec7 (patch)
tree	458fb1ae70b3fc91d417e6a623b5f84a1a363d8c /pypy/module
parent	don't go via the less efficient BufMatchContext if the string is simply a str (diff)
download	pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.tar.gz pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.tar.bz2 pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.zip