aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarl Friedrich Bolz <cfbolz@gmx.de>2016-03-04 10:27:07 +0100
committerCarl Friedrich Bolz <cfbolz@gmx.de>2016-03-04 10:27:07 +0100
commit0f08f31e6f620a71459dfc43964b0da23eaf1ec7 (patch)
tree458fb1ae70b3fc91d417e6a623b5f84a1a363d8c /pypy/module
parentdon't go via the less efficient BufMatchContext if the string is simply a str (diff)
downloadpypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.tar.gz
pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.tar.bz2
pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.zip
a somewhat messy improvement of re.sub. makes simple things 3x faster (not
quite beating CPython)
Diffstat (limited to 'pypy/module')
-rw-r--r--pypy/module/_sre/interp_sre.py82
1 files changed, 67 insertions, 15 deletions
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
index 03bc219701..5546402e55 100644
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -7,6 +7,7 @@ from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError
from rpython.rlib.rarithmetic import intmask
from rpython.rlib import jit
+from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
# ____________________________________________________________
#
@@ -226,6 +227,11 @@ class W_SRE_Pattern(W_Root):
def subx(self, w_ptemplate, w_string, count):
space = self.space
+ # use a (much faster) string/unicode builder if w_ptemplate and
+ # w_string are both string or both unicode objects, and if w_ptemplate
+ # is a literal
+ use_builder = False
+ filter_as_unicode = filter_as_string = None
if space.is_true(space.callable(w_ptemplate)):
w_filter = w_ptemplate
filter_is_callable = True
@@ -233,6 +239,8 @@ class W_SRE_Pattern(W_Root):
if space.isinstance_w(w_ptemplate, space.w_unicode):
filter_as_unicode = space.unicode_w(w_ptemplate)
literal = u'\\' not in filter_as_unicode
+ use_builder = (
+ space.isinstance_w(w_string, space.w_unicode) and literal)
else:
try:
filter_as_string = space.str_w(w_ptemplate)
@@ -242,6 +250,8 @@ class W_SRE_Pattern(W_Root):
literal = False
else:
literal = '\\' not in filter_as_string
+ use_builder = (
+ space.isinstance_w(w_string, space.w_str) and literal)
if literal:
w_filter = w_ptemplate
filter_is_callable = False
@@ -252,19 +262,28 @@ class W_SRE_Pattern(W_Root):
space.wrap(self), w_ptemplate)
filter_is_callable = space.is_true(space.callable(w_filter))
#
+ # XXX this is a bit of a mess, but it improves performance a lot
ctx = self.make_ctx(w_string)
- sublist_w = []
+ sublist_w = strbuilder = unicodebuilder = None
+ if use_builder:
+ if filter_as_unicode is not None:
+ unicodebuilder = UnicodeBuilder(ctx.end)
+ else:
+ assert filter_as_string is not None
+ strbuilder = StringBuilder(ctx.end)
+ else:
+ sublist_w = []
n = last_pos = 0
while not count or n < count:
if not searchcontext(space, ctx):
break
if last_pos < ctx.match_start:
- sublist_w.append(slice_w(space, ctx, last_pos,
- ctx.match_start, space.w_None))
+ _sub_append_slice(
+ ctx, space, use_builder, sublist_w,
+ strbuilder, unicodebuilder, last_pos, ctx.match_start)
start = ctx.match_end
if start == ctx.match_start:
start += 1
- nextctx = ctx.fresh_copy(start)
if not (last_pos == ctx.match_start
== ctx.match_end and n > 0):
# the above ignores empty matches on latest position
@@ -272,27 +291,60 @@ class W_SRE_Pattern(W_Root):
w_match = self.getmatch(ctx, True)
w_piece = space.call_function(w_filter, w_match)
if not space.is_w(w_piece, space.w_None):
+ assert strbuilder is None and unicodebuilder is None
+ assert not use_builder
sublist_w.append(w_piece)
else:
- sublist_w.append(w_filter)
+ if use_builder:
+ if strbuilder is not None:
+ assert filter_as_string is not None
+ strbuilder.append(filter_as_string)
+ else:
+ assert unicodebuilder is not None
+ assert filter_as_unicode is not None
+ unicodebuilder.append(filter_as_unicode)
+ else:
+ sublist_w.append(w_filter)
last_pos = ctx.match_end
n += 1
elif last_pos >= ctx.end:
break # empty match at the end: finished
- ctx = nextctx
+ ctx.reset(start)
if last_pos < ctx.end:
- sublist_w.append(slice_w(space, ctx, last_pos, ctx.end,
- space.w_None))
-
- if space.isinstance_w(w_string, space.w_unicode):
- w_emptystr = space.wrap(u'')
+ _sub_append_slice(ctx, space, use_builder, sublist_w,
+ strbuilder, unicodebuilder, last_pos, ctx.end)
+ if use_builder:
+ if strbuilder is not None:
+ return space.wrap(strbuilder.build()), n
+ else:
+ assert unicodebuilder is not None
+ return space.wrap(unicodebuilder.build()), n
else:
- w_emptystr = space.wrap('')
- w_item = space.call_method(w_emptystr, 'join',
- space.newlist(sublist_w))
- return w_item, n
+ if space.isinstance_w(w_string, space.w_unicode):
+ w_emptystr = space.wrap(u'')
+ else:
+ w_emptystr = space.wrap('')
+ w_item = space.call_method(w_emptystr, 'join',
+ space.newlist(sublist_w))
+ return w_item, n
+
+def _sub_append_slice(ctx, space, use_builder, sublist_w,
+ strbuilder, unicodebuilder, start, end):
+ if use_builder:
+ if isinstance(ctx, rsre_core.BufMatchContext):
+ assert strbuilder is not None
+ return strbuilder.append(ctx._buffer.getslice(start, end, 1, end-start))
+ if isinstance(ctx, rsre_core.StrMatchContext):
+ assert strbuilder is not None
+ return strbuilder.append_slice(ctx._string, start, end)
+ elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+ assert unicodebuilder is not None
+ return unicodebuilder.append_slice(ctx._unicodestr, start, end)
+ assert 0, "unreachable"
+ else:
+ sublist_w.append(slice_w(space, ctx, start, end, space.w_None))
@unwrap_spec(flags=int, groups=int, w_groupindex=WrappedDefault(None),
w_indexgroup=WrappedDefault(None))