From 158a579505440c8945891562eaa1aca973ca94cb Mon Sep 17 00:00:00 2001
From: Carl Friedrich Bolz-Tereick
Date: Tue, 2 Mar 2021 13:23:40 +0100
Subject: fast path for unicode.upper/lower for ascii

---
 pypy/objspace/std/test/test_unicodeobject.py |  9 +++++++++
 pypy/objspace/std/unicodeobject.py           | 26 ++++++++++++++++++++------
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
index 6b1c7315da..e8763dc496 100644
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -207,6 +207,15 @@ class TestUnicodeObject:
                 for end in range(start, len(u)):
                     assert w_u._unicode_sliced_constant_index_jit(space, start, end)._utf8 == u[start: end].encode("utf-8")
 
+    def test_lower_upper_ascii(self):
+        from pypy.module.unicodedata.interp_ucd import unicodedb
+        # check that ascii chars tolower/toupper still behave sensibly in the
+        # unicodedb - unlikely to ever change, but well
+        for ch in range(128):
+            unilower, = unicodedb.tolower_full(ch)
+            assert chr(unilower) == chr(ch).lower()
+            uniupper, = unicodedb.toupper_full(ch)
+            assert chr(uniupper) == chr(ch).upper()
 
 class AppTestUnicodeStringStdOnly:
     def test_compares(self):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
index 4fa1a98437..0be4a9e55c 100644
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -503,11 +503,18 @@ class W_UnicodeObject(W_Root):
         return tformat.formatter_field_name_split()
 
     def descr_lower(self, space):
-        builder = rutf8.Utf8StringBuilder(len(self._utf8))
-        for ch in rutf8.Utf8StringIterator(self._utf8):
+        if self.is_ascii():
+            return space.newutf8(self._utf8.lower(), len(self._utf8))
+        return self._descr_lower(self._utf8)
+
+    @staticmethod
+    @jit.elidable
+    def _descr_lower(utf8):
+        builder = rutf8.Utf8StringBuilder(len(utf8))
+        for ch in rutf8.Utf8StringIterator(utf8):
             lower = unicodedb.tolower(ch)
             builder.append_code(lower)
-        return self.from_utf8builder(builder)
+        return W_UnicodeObject.from_utf8builder(builder)
 
     def descr_isdecimal(self, space):
         return self._is_generic(space, '_isdecimal')
@@ -650,11 +657,18 @@ class W_UnicodeObject(W_Root):
         return space.newlist(strs_w)
 
     def descr_upper(self, space):
-        builder = rutf8.Utf8StringBuilder(len(self._utf8))
-        for ch in rutf8.Utf8StringIterator(self._utf8):
+        if self.is_ascii():
+            return space.newutf8(self._utf8.upper(), len(self._utf8))
+        return self._descr_upper(self._utf8)
+
+    @staticmethod
+    @jit.elidable
+    def _descr_upper(utf8):
+        builder = rutf8.Utf8StringBuilder(len(utf8))
+        for ch in rutf8.Utf8StringIterator(utf8):
             ch = unicodedb.toupper(ch)
             builder.append_code(ch)
-        return self.from_utf8builder(builder)
+        return W_UnicodeObject.from_utf8builder(builder)
 
     @unwrap_spec(width=int)
     def descr_zfill(self, space, width):
-- 
cgit v1.2.3-65-gdbad