diff options
author | Carl Friedrich Bolz <cfbolz@gmx.de> | 2016-03-04 10:27:07 +0100 |
---|---|---|
committer | Carl Friedrich Bolz <cfbolz@gmx.de> | 2016-03-04 10:27:07 +0100 |
commit | 0f08f31e6f620a71459dfc43964b0da23eaf1ec7 (patch) | |
tree | 458fb1ae70b3fc91d417e6a623b5f84a1a363d8c /pypy/module | |
parent | don't go via the less efficient BufMatchContext if the string is simply a str (diff) | |
download | pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.tar.gz pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.tar.bz2 pypy-0f08f31e6f620a71459dfc43964b0da23eaf1ec7.zip |
a somewhat messy improvement of re.sub. makes simple things 3x faster (not
quite beating CPython)
Diffstat (limited to 'pypy/module')
-rw-r--r-- | pypy/module/_sre/interp_sre.py | 82 |
1 files changed, 67 insertions, 15 deletions
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py index 03bc219701..5546402e55 100644 --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -7,6 +7,7 @@ from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault from pypy.interpreter.error import OperationError from rpython.rlib.rarithmetic import intmask from rpython.rlib import jit +from rpython.rlib.rstring import StringBuilder, UnicodeBuilder # ____________________________________________________________ # @@ -226,6 +227,11 @@ class W_SRE_Pattern(W_Root): def subx(self, w_ptemplate, w_string, count): space = self.space + # use a (much faster) string/unicode builder if w_ptemplate and + # w_string are both string or both unicode objects, and if w_ptemplate + # is a literal + use_builder = False + filter_as_unicode = filter_as_string = None if space.is_true(space.callable(w_ptemplate)): w_filter = w_ptemplate filter_is_callable = True @@ -233,6 +239,8 @@ class W_SRE_Pattern(W_Root): if space.isinstance_w(w_ptemplate, space.w_unicode): filter_as_unicode = space.unicode_w(w_ptemplate) literal = u'\\' not in filter_as_unicode + use_builder = ( + space.isinstance_w(w_string, space.w_unicode) and literal) else: try: filter_as_string = space.str_w(w_ptemplate) @@ -242,6 +250,8 @@ class W_SRE_Pattern(W_Root): literal = False else: literal = '\\' not in filter_as_string + use_builder = ( + space.isinstance_w(w_string, space.w_str) and literal) if literal: w_filter = w_ptemplate filter_is_callable = False @@ -252,19 +262,28 @@ class W_SRE_Pattern(W_Root): space.wrap(self), w_ptemplate) filter_is_callable = space.is_true(space.callable(w_filter)) # + # XXX this is a bit of a mess, but it improves performance a lot ctx = self.make_ctx(w_string) - sublist_w = [] + sublist_w = strbuilder = unicodebuilder = None + if use_builder: + if filter_as_unicode is not None: + unicodebuilder = UnicodeBuilder(ctx.end) + else: + assert filter_as_string is not None + strbuilder = StringBuilder(ctx.end) + else: + sublist_w = [] n = last_pos = 0 while not count or n < count: if not searchcontext(space, ctx): break if last_pos < ctx.match_start: - sublist_w.append(slice_w(space, ctx, last_pos, - ctx.match_start, space.w_None)) + _sub_append_slice( + ctx, space, use_builder, sublist_w, + strbuilder, unicodebuilder, last_pos, ctx.match_start) start = ctx.match_end if start == ctx.match_start: start += 1 - nextctx = ctx.fresh_copy(start) if not (last_pos == ctx.match_start == ctx.match_end and n > 0): # the above ignores empty matches on latest position @@ -272,27 +291,60 @@ class W_SRE_Pattern(W_Root): w_match = self.getmatch(ctx, True) w_piece = space.call_function(w_filter, w_match) if not space.is_w(w_piece, space.w_None): + assert strbuilder is None and unicodebuilder is None + assert not use_builder sublist_w.append(w_piece) else: - sublist_w.append(w_filter) + if use_builder: + if strbuilder is not None: + assert filter_as_string is not None + strbuilder.append(filter_as_string) + else: + assert unicodebuilder is not None + assert filter_as_unicode is not None + unicodebuilder.append(filter_as_unicode) + else: + sublist_w.append(w_filter) last_pos = ctx.match_end n += 1 elif last_pos >= ctx.end: break # empty match at the end: finished - ctx = nextctx + ctx.reset(start) if last_pos < ctx.end: - sublist_w.append(slice_w(space, ctx, last_pos, ctx.end, - space.w_None)) - - if space.isinstance_w(w_string, space.w_unicode): - w_emptystr = space.wrap(u'') + _sub_append_slice(ctx, space, use_builder, sublist_w, + strbuilder, unicodebuilder, last_pos, ctx.end) + if use_builder: + if strbuilder is not None: + return space.wrap(strbuilder.build()), n + else: + assert unicodebuilder is not None + return space.wrap(unicodebuilder.build()), n else: - w_emptystr = space.wrap('') - w_item = space.call_method(w_emptystr, 'join', - space.newlist(sublist_w)) - return w_item, n + if space.isinstance_w(w_string, space.w_unicode): + w_emptystr = space.wrap(u'') + else: + w_emptystr = space.wrap('') + w_item = space.call_method(w_emptystr, 'join', + space.newlist(sublist_w)) + return w_item, n + +def _sub_append_slice(ctx, space, use_builder, sublist_w, + strbuilder, unicodebuilder, start, end): + if use_builder: + if isinstance(ctx, rsre_core.BufMatchContext): + assert strbuilder is not None + return strbuilder.append(ctx._buffer.getslice(start, end, 1, end-start)) + if isinstance(ctx, rsre_core.StrMatchContext): + assert strbuilder is not None + return strbuilder.append_slice(ctx._string, start, end) + elif isinstance(ctx, rsre_core.UnicodeMatchContext): + assert unicodebuilder is not None + return unicodebuilder.append_slice(ctx._unicodestr, start, end) + assert 0, "unreachable" + else: + sublist_w.append(slice_w(space, ctx, start, end, space.w_None)) @unwrap_spec(flags=int, groups=int, w_groupindex=WrappedDefault(None), w_indexgroup=WrappedDefault(None)) |