author | 2019-03-12 18:18:02 +0000
---|---
committer | 2019-03-12 18:18:02 +0000
commit | 6eac96a58627e5638969e92335ac307c533aa644 (patch)
tree | 49b3e1101750ce1ebcebb479fff383d62b86a805 /pypy/objspace/std/unicodeobject.py
parent | hg merge default (diff)
parent | reduce code duplication (diff)
hg merge default
Diffstat (limited to 'pypy/objspace/std/unicodeobject.py')
-rw-r--r-- | pypy/objspace/std/unicodeobject.py | 1038
1 file changed, 827 insertions, 211 deletions
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py index 5cea4b4802..efd72611c0 100644 --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -1,14 +1,15 @@ """The builtin unicode implementation""" from rpython.rlib.objectmodel import ( - compute_hash, compute_unique_id, import_from_mixin, - enforceargs) + compute_hash, compute_unique_id, import_from_mixin, always_inline, + enforceargs, newlist_hint, specialize, we_are_translated) from rpython.rlib.buffer import StringBuffer from rpython.rlib.mutbuffer import MutableStringBuffer -from rpython.rlib.rstring import StringBuilder, UnicodeBuilder -from rpython.rlib.runicode import ( - make_unicode_escape_function, str_decode_ascii, str_decode_utf_8, - unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii) +from rpython.rlib.rarithmetic import ovfcheck +from rpython.rlib.rstring import ( + StringBuilder, split, rsplit, UnicodeBuilder, replace_count, startswith, + endswith) +from rpython.rlib import rutf8, jit from pypy.interpreter import unicodehelper from pypy.interpreter.baseobjspace import W_Root @@ -19,6 +20,8 @@ from pypy.module.unicodedata.interp_ucd import unicodedb from pypy.objspace.std import newformat from pypy.objspace.std.basestringtype import basestring_typedef from pypy.objspace.std.formatting import mod_format +from pypy.objspace.std.sliceobject import (W_SliceObject, + unwrap_start_stop, normalize_simple_slice) from pypy.objspace.std.stringmethods import StringMethods from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT @@ -29,25 +32,33 @@ __all__ = ['W_UnicodeObject', 'wrapunicode', 'plain_str2unicode', class W_UnicodeObject(W_Root): import_from_mixin(StringMethods) - _immutable_fields_ = ['_value'] + _immutable_fields_ = ['_utf8'] + + @enforceargs(utf8str=str) + def __init__(self, utf8str, length): + assert isinstance(utf8str, str) + assert length >= 0 + self._utf8 = utf8str + self._length = length + self._index_storage = rutf8.null_storage() + if not we_are_translated(): + try: + # best effort, too expensive to handle surrogates + ulength = rutf8.codepoints_in_utf(utf8str) + except: + ulength = length + assert ulength == length - @enforceargs(uni=unicode) - def __init__(self, unistr): - assert isinstance(unistr, unicode) - self._value = unistr - def __repr__(self): - """representation for debugging purposes""" - return "%s(%r)" % (self.__class__.__name__, self._value) - def unwrap(self, space): - # for testing - return self._value + @staticmethod + def from_utf8builder(builder): + return W_UnicodeObject( + builder.build(), builder.getlength()) - def create_if_subclassed(self): - if type(self) is W_UnicodeObject: - return self - return W_UnicodeObject(self._value) + def __repr__(self): + """representation for debugging purposes""" + return "%s(%r)" % (self.__class__.__name__, self._utf8) def is_w(self, space, w_other): if not isinstance(w_other, W_UnicodeObject): @@ -56,9 +67,9 @@ class W_UnicodeObject(W_Root): return True if self.user_overridden_class or w_other.user_overridden_class: return False - s1 = space.unicode_w(self) - s2 = space.unicode_w(w_other) - if len(s2) > 1: + s1 = space.utf8_w(self) + s2 = space.utf8_w(w_other) + if len(s2) > 2: return s1 is s2 else: # strings of len <= 1 are unique-ified return s1 == s2 @@ -66,61 +77,62 @@ class W_UnicodeObject(W_Root): def immutable_unique_id(self, space): if self.user_overridden_class: return None - s = space.unicode_w(self) - if len(s) > 1: + s = space.utf8_w(self) + if len(s) > 2: uid = 
compute_unique_id(s) else: # strings of len <= 1 are unique-ified if len(s) == 1: base = ~ord(s[0]) # negative base values + elif len(s) == 2: + base = ~((ord(s[1]) << 8) | ord(s[0])) else: base = 257 # empty unicode string: base value 257 uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL return space.newint(uid) def str_w(self, space): - return space.text_w(space.str(self)) + return space.text_w(encode_object(space, self, 'ascii', 'strict')) - def unicode_w(self, space): - return self._value + def utf8_w(self, space): + return self._utf8 def readbuf_w(self, space): - from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE - buf = MutableStringBuffer(len(self._value) * UNICODE_SIZE) + # XXX for now + from rpython.rlib.rstruct.unichar import pack_codepoint, UNICODE_SIZE + builder = MutableStringBuffer(self._len() * UNICODE_SIZE) pos = 0 - for unich in self._value: - pack_unichar(unich, buf, pos) + i = 0 + while i < len(self._utf8): + unich = rutf8.codepoint_at_pos(self._utf8, i) + pack_codepoint(unich, builder, pos) pos += UNICODE_SIZE - return StringBuffer(buf.finish()) + i = rutf8.next_codepoint_pos(self._utf8, i) + return StringBuffer(builder.finish()) def writebuf_w(self, space): raise oefmt(space.w_TypeError, "cannot use unicode as modifiable buffer") - charbuf_w = str_w + def charbuf_w(self, space): + # Returns ascii-encoded str + return space.text_w(encode_object(space, self, 'ascii', 'strict')) - def listview_unicode(self): - return _create_list_from_unicode(self._value) + def listview_utf8(self): + assert self.is_ascii() + return _create_list_from_unicode(self._utf8) def ord(self, space): - if len(self._value) != 1: + if self._len() != 1: raise oefmt(space.w_TypeError, "ord() expected a character, but string of length %d " - "found", len(self._value)) - return space.newint(ord(self._value[0])) - - def _new(self, value): - return W_UnicodeObject(value) - - def _new_from_list(self, value): - return W_UnicodeObject(u''.join(value)) + "found", self._len()) + return space.newint(rutf8.codepoint_at_pos(self._utf8, 0)) def _empty(self): return W_UnicodeObject.EMPTY def _len(self): - return len(self._value) - - _val = unicode_w + return self._length @staticmethod def _use_rstr_ops(space, w_other): @@ -129,67 +141,64 @@ class W_UnicodeObject(W_Root): return True @staticmethod - def _op_val(space, w_other, strict=None): - if isinstance(w_other, W_UnicodeObject): - return w_other._value + def convert_arg_to_w_unicode(space, w_other, strict=None): + if space.is_w(space.type(w_other), space.w_unicode): + # XXX why do we need this for translation??? 
+ assert isinstance(w_other, W_UnicodeObject) + return w_other if space.isinstance_w(w_other, space.w_bytes): - return unicode_from_string(space, w_other)._value + return unicode_from_string(space, w_other) if strict: raise oefmt(space.w_TypeError, "%s arg must be None, unicode or str", strict) - return unicode_from_encoded_object( - space, w_other, None, "strict")._value + return unicode_from_encoded_object(space, w_other, 'utf8', "strict") + + def convert_to_w_unicode(self, space): + return self + @specialize.argtype(1) def _chr(self, char): assert len(char) == 1 - return unicode(char)[0] + return unichr(ord(char[0])) + + def _multi_chr(self, unichar): + return unichar _builder = UnicodeBuilder def _isupper(self, ch): - return unicodedb.isupper(ord(ch)) + return unicodedb.isupper(ch) def _islower(self, ch): - return unicodedb.islower(ord(ch)) + return unicodedb.islower(ch) def _isnumeric(self, ch): - return unicodedb.isnumeric(ord(ch)) + return unicodedb.isnumeric(ch) def _istitle(self, ch): - return unicodedb.isupper(ord(ch)) or unicodedb.istitle(ord(ch)) + return unicodedb.isupper(ch) or unicodedb.istitle(ch) - def _isspace(self, ch): - return unicodedb.isspace(ord(ch)) + @staticmethod + def _isspace(ch): + return unicodedb.isspace(ch) def _isalpha(self, ch): - return unicodedb.isalpha(ord(ch)) + return unicodedb.isalpha(ch) def _isalnum(self, ch): - return unicodedb.isalnum(ord(ch)) + return unicodedb.isalnum(ch) def _isdigit(self, ch): - return unicodedb.isdigit(ord(ch)) + return unicodedb.isdigit(ch) def _isdecimal(self, ch): - return unicodedb.isdecimal(ord(ch)) + return unicodedb.isdecimal(ch) def _iscased(self, ch): - return unicodedb.iscased(ord(ch)) + return unicodedb.iscased(ch) def _islinebreak(self, ch): - return unicodedb.islinebreak(ord(ch)) - - def _upper(self, ch): - return unichr(unicodedb.toupper(ord(ch))) - - def _lower(self, ch): - return unichr(unicodedb.tolower(ord(ch))) - - def _title(self, ch): - return unichr(unicodedb.totitle(ord(ch))) - - def _newlist_unwrapped(self, space, lst): - return space.newlist_unicode(lst) + return unicodedb.islinebreak(ch) @staticmethod def descr_new(space, w_unicodetype, w_string=None, w_encoding=None, @@ -212,26 +221,35 @@ class W_UnicodeObject(W_Root): assert isinstance(w_value, W_UnicodeObject) w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype) - W_UnicodeObject.__init__(w_newobj, w_value._value) + W_UnicodeObject.__init__(w_newobj, w_value._utf8, w_value._length) + if w_value._index_storage: + # copy the storage if it's there + w_newobj._index_storage = w_value._index_storage return w_newobj def descr_repr(self, space): - chars = self._value - size = len(chars) - s = _repr_function(chars, size, "strict") - return space.newtext(s) + return space.newtext(_repr_function(self._utf8)) def descr_str(self, space): - return encode_object(space, self, None, None) + return encode_object(space, self, 'ascii', 'strict') - def descr_hash(self, space): - x = compute_hash(self._value) + def hash_w(self): + # shortcut for UnicodeDictStrategy + x = compute_hash(self._utf8) x -= (x == -1) # convert -1 to -2 without creating a bridge - return space.newint(x) + return x + + def descr_hash(self, space): + return space.newint(self.hash_w()) + + def eq_w(self, w_other): + # shortcut for UnicodeDictStrategy + assert isinstance(w_other, W_UnicodeObject) + return self._utf8 == w_other._utf8 def descr_eq(self, space, w_other): try: - res = self._val(space) == self._op_val(space, w_other) + res = self._utf8 == 
self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -247,7 +265,7 @@ class W_UnicodeObject(W_Root): def descr_ne(self, space, w_other): try: - res = self._val(space) != self._op_val(space, w_other) + res = self._utf8 != self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -263,7 +281,7 @@ class W_UnicodeObject(W_Root): def descr_lt(self, space, w_other): try: - res = self._val(space) < self._op_val(space, w_other) + res = self._utf8 < self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -272,7 +290,7 @@ class W_UnicodeObject(W_Root): def descr_le(self, space, w_other): try: - res = self._val(space) <= self._op_val(space, w_other) + res = self._utf8 <= self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -281,7 +299,7 @@ class W_UnicodeObject(W_Root): def descr_gt(self, space, w_other): try: - res = self._val(space) > self._op_val(space, w_other) + res = self._utf8 > self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -290,7 +308,7 @@ class W_UnicodeObject(W_Root): def descr_ge(self, space, w_other): try: - res = self._val(space) >= self._op_val(space, w_other) + res = self._utf8 >= self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -303,11 +321,11 @@ class W_UnicodeObject(W_Root): def descr__format__(self, space, w_format_spec): if not space.isinstance_w(w_format_spec, space.w_unicode): w_format_spec = space.call_function(space.w_unicode, w_format_spec) - spec = space.unicode_w(w_format_spec) + spec = space.utf8_w(w_format_spec) formatter = newformat.unicode_formatter(space, spec) self2 = unicode_from_object(space, self) assert isinstance(self2, W_UnicodeObject) - return formatter.format_string(self2._value) + return formatter.format_string(self2) def descr_mod(self, space, w_values): return mod_format(space, self, w_values, do_unicode=True) @@ -315,71 +333,169 @@ class W_UnicodeObject(W_Root): def descr_rmod(self, space, w_values): return mod_format(space, w_values, self, do_unicode=True) + def descr_swapcase(self, space): + input = self._utf8 + builder = rutf8.Utf8StringBuilder(len(input)) + for ch in rutf8.Utf8StringIterator(input): + if unicodedb.isupper(ch): + ch = unicodedb.tolower(ch) + elif unicodedb.islower(ch): + ch = unicodedb.toupper(ch) + builder.append_code(ch) + return self.from_utf8builder(builder) + + def descr_title(self, space): + if len(self._utf8) == 0: + return self + return self.title_unicode(self._utf8) + + @jit.elidable + def title_unicode(self, value): + input = self._utf8 + builder = rutf8.Utf8StringBuilder(len(input)) + previous_is_cased = False + for ch0 in rutf8.Utf8StringIterator(input): + if not previous_is_cased: + ch1 = unicodedb.totitle(ch0) + else: + ch1 = unicodedb.tolower(ch0) + builder.append_code(ch1) + previous_is_cased = unicodedb.iscased(ch0) + return self.from_utf8builder(builder) + def descr_translate(self, space, w_table): - selfvalue = self._value - w_sys = space.getbuiltinmodule('sys') - maxunicode = space.int_w(space.getattr(w_sys, - 
space.newtext("maxunicode"))) - result = [] - for unichar in selfvalue: + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + for codepoint in rutf8.Utf8StringIterator(self._utf8): try: - w_newval = space.getitem(w_table, space.newint(ord(unichar))) + w_newval = space.getitem(w_table, space.newint(codepoint)) except OperationError as e: - if e.match(space, space.w_LookupError): - result.append(unichar) - else: + if not e.match(space, space.w_LookupError): raise else: if space.is_w(w_newval, space.w_None): continue elif space.isinstance_w(w_newval, space.w_int): - newval = space.int_w(w_newval) - if newval < 0 or newval > maxunicode: - raise oefmt(space.w_TypeError, - "character mapping must be in range(%s)", - hex(maxunicode + 1)) - result.append(unichr(newval)) - elif space.isinstance_w(w_newval, space.w_unicode): - result.append(space.unicode_w(w_newval)) + codepoint = space.int_w(w_newval) + elif isinstance(w_newval, W_UnicodeObject): + builder.append_utf8(w_newval._utf8, w_newval._length) + continue else: raise oefmt(space.w_TypeError, "character mapping must return integer, None " "or unicode") - return W_UnicodeObject(u''.join(result)) + try: + builder.append_code(codepoint) + except ValueError: + raise oefmt(space.w_TypeError, + "character mapping must be in range(0x110000)") + return self.from_utf8builder(builder) + + def descr_find(self, space, w_sub, w_start=None, w_end=None): + w_result = self._unwrap_and_search(space, w_sub, w_start, w_end) + if w_result is None: + w_result = space.newint(-1) + return w_result + + def descr_rfind(self, space, w_sub, w_start=None, w_end=None): + w_result = self._unwrap_and_search(space, w_sub, w_start, w_end, + forward=False) + if w_result is None: + w_result = space.newint(-1) + return w_result + + def descr_index(self, space, w_sub, w_start=None, w_end=None): + w_result = self._unwrap_and_search(space, w_sub, w_start, w_end) + if w_result is None: + raise oefmt(space.w_ValueError, + "substring not found in string.index") + return w_result + + def descr_rindex(self, space, w_sub, w_start=None, w_end=None): + w_result = self._unwrap_and_search(space, w_sub, w_start, w_end, + forward=False) + if w_result is None: + raise oefmt(space.w_ValueError, + "substring not found in string.rindex") + return w_result + + @specialize.arg(2) + def _is_generic(self, space, func_name): + func = getattr(self, func_name) + if self._length == 0: + return space.w_False + if self._length == 1: + return space.newbool(func(rutf8.codepoint_at_pos(self._utf8, 0))) + else: + return self._is_generic_loop(space, self._utf8, func_name) + + @specialize.arg(3) + def _is_generic_loop(self, space, v, func_name): + func = getattr(self, func_name) + val = self._utf8 + for uchar in rutf8.Utf8StringIterator(val): + if not func(uchar): + return space.w_False + return space.w_True def descr_encode(self, space, w_encoding=None, w_errors=None): encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors) return encode_object(space, self, encoding, errors) + @unwrap_spec(tabsize=int) + def descr_expandtabs(self, space, tabsize=8): + value = self._utf8 + if not value: + return self._empty() + + splitted = value.split('\t') + + try: + if tabsize > 0: + ovfcheck(len(splitted) * tabsize) + except OverflowError: + raise oefmt(space.w_OverflowError, "new string is too long") + expanded = oldtoken = splitted.pop(0) + newlen = self._len() - len(splitted) + + for token in splitted: + dist = self._tabindent(oldtoken, tabsize) + expanded += ' ' * dist + token + newlen += dist + 
oldtoken = token + + return W_UnicodeObject(expanded, newlen) + _StringMethods_descr_join = descr_join def descr_join(self, space, w_list): - l = space.listview_unicode(w_list) - if l is not None: + l = space.listview_utf8(w_list) + if l is not None and self.is_ascii(): if len(l) == 1: - return space.newunicode(l[0]) - return space.newunicode(self._val(space).join(l)) + return space.newutf8(l[0], len(l[0])) + s = self._utf8.join(l) + return space.newutf8(s, len(s)) return self._StringMethods_descr_join(space, w_list) def _join_return_one(self, space, w_obj): return space.is_w(space.type(w_obj), space.w_unicode) - def _join_check_item(self, space, w_obj): - if (space.isinstance_w(w_obj, space.w_bytes) or - space.isinstance_w(w_obj, space.w_unicode)): - return 0 - return 1 - def descr_formatter_parser(self, space): from pypy.objspace.std.newformat import unicode_template_formatter - tformat = unicode_template_formatter(space, space.unicode_w(self)) + tformat = unicode_template_formatter(space, space.utf8_w(self)) return tformat.formatter_parser() def descr_formatter_field_name_split(self, space): from pypy.objspace.std.newformat import unicode_template_formatter - tformat = unicode_template_formatter(space, space.unicode_w(self)) + tformat = unicode_template_formatter(space, space.utf8_w(self)) return tformat.formatter_field_name_split() + def descr_lower(self, space): + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + for ch in rutf8.Utf8StringIterator(self._utf8): + lower = unicodedb.tolower(ch) + builder.append_code(lower) + return self.from_utf8builder(builder) + def descr_isdecimal(self, space): return self._is_generic(space, '_isdecimal') @@ -388,24 +504,534 @@ class W_UnicodeObject(W_Root): def descr_islower(self, space): cased = False - for uchar in self._value: - if (unicodedb.isupper(ord(uchar)) or - unicodedb.istitle(ord(uchar))): + for uchar in rutf8.Utf8StringIterator(self._utf8): + if (unicodedb.isupper(uchar) or + unicodedb.istitle(uchar)): return space.w_False - if not cased and unicodedb.islower(ord(uchar)): + if not cased and unicodedb.islower(uchar): + cased = True + return space.newbool(cased) + + def descr_istitle(self, space): + cased = False + previous_is_cased = False + for uchar in rutf8.Utf8StringIterator(self._utf8): + if unicodedb.isupper(uchar) or unicodedb.istitle(uchar): + if previous_is_cased: + return space.w_False + previous_is_cased = True + cased = True + elif unicodedb.islower(uchar): + if not previous_is_cased: + return space.w_False cased = True + else: + previous_is_cased = False return space.newbool(cased) def descr_isupper(self, space): cased = False - for uchar in self._value: - if (unicodedb.islower(ord(uchar)) or - unicodedb.istitle(ord(uchar))): + for uchar in rutf8.Utf8StringIterator(self._utf8): + if (unicodedb.islower(uchar) or + unicodedb.istitle(uchar)): return space.w_False - if not cased and unicodedb.isupper(ord(uchar)): + if not cased and unicodedb.isupper(uchar): cased = True return space.newbool(cased) + def descr_startswith(self, space, w_prefix, w_start=None, w_end=None): + start, end = self._unwrap_and_compute_idx_params(space, w_start, w_end) + value = self._utf8 + if space.isinstance_w(w_prefix, space.w_tuple): + return self._startswith_tuple(space, value, w_prefix, start, end) + return space.newbool(self._startswith(space, value, w_prefix, start, + end)) + + def _startswith(self, space, value, w_prefix, start, end): + prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8 + if len(prefix) == 0: + return True + return 
startswith(value, prefix, start, end) + + def descr_endswith(self, space, w_suffix, w_start=None, w_end=None): + start, end = self._unwrap_and_compute_idx_params(space, w_start, w_end) + value = self._utf8 + if space.isinstance_w(w_suffix, space.w_tuple): + return self._endswith_tuple(space, value, w_suffix, start, end) + return space.newbool(self._endswith(space, value, w_suffix, start, + end)) + + def _endswith(self, space, value, w_prefix, start, end): + prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8 + if len(prefix) == 0: + return True + return endswith(value, prefix, start, end) + + def descr_add(self, space, w_other): + try: + w_other = self.convert_arg_to_w_unicode(space, w_other) + except OperationError as e: + if e.match(space, space.w_TypeError): + return space.w_NotImplemented + raise + return W_UnicodeObject(self._utf8 + w_other._utf8, + self._len() + w_other._len()) + + @jit.look_inside_iff(lambda self, space, list_w, size: + jit.loop_unrolling_heuristic(list_w, size)) + def _str_join_many_items(self, space, list_w, size): + value = self._utf8 + lgt = self._len() * (size - 1) + + prealloc_size = len(value) * (size - 1) + unwrapped = newlist_hint(size) + for i in range(size): + w_s = list_w[i] + if not (space.isinstance_w(w_s, space.w_bytes) or + space.isinstance_w(w_s, space.w_unicode)): + raise oefmt(space.w_TypeError, + "sequence item %d: expected string or unicode, %T found", + i, w_s) + # XXX Maybe the extra copy here is okay? It was basically going to + # happen anyway, what with being placed into the builder + w_u = self.convert_arg_to_w_unicode(space, w_s) + unwrapped.append(w_u._utf8) + lgt += w_u._length + prealloc_size += len(unwrapped[i]) + + sb = StringBuilder(prealloc_size) + for i in range(size): + if value and i != 0: + sb.append(value) + sb.append(unwrapped[i]) + return W_UnicodeObject(sb.build(), lgt) + + @unwrap_spec(keepends=bool) + def descr_splitlines(self, space, keepends=False): + value = self._utf8 + length = len(value) + strs_w = [] + pos = 0 + while pos < length: + sol = pos + lgt = 0 + while pos < length and not self._islinebreak(rutf8.codepoint_at_pos(value, pos)): + pos = rutf8.next_codepoint_pos(value, pos) + lgt += 1 + eol = pos + if pos < length: + # read CRLF as one line break + if (value[pos] == '\r' and pos + 1 < length + and value[pos + 1] == '\n'): + pos += 2 + line_end_chars = 2 + else: + pos = rutf8.next_codepoint_pos(value, pos) + line_end_chars = 1 + if keepends: + eol = pos + lgt += line_end_chars + assert eol >= 0 + assert sol >= 0 + strs_w.append(W_UnicodeObject(value[sol:eol], lgt)) + return space.newlist(strs_w) + + def descr_upper(self, space): + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + for ch in rutf8.Utf8StringIterator(self._utf8): + ch = unicodedb.toupper(ch) + builder.append_code(ch) + return self.from_utf8builder(builder) + + @unwrap_spec(width=int) + def descr_zfill(self, space, width): + selfval = self._utf8 + if len(selfval) == 0: + return W_UnicodeObject('0' * width, width) + num_zeros = width - self._len() + if num_zeros <= 0: + # cannot return self, in case it is a subclass of str + return W_UnicodeObject(selfval, self._len()) + builder = StringBuilder(num_zeros + len(selfval)) + if len(selfval) > 0 and (selfval[0] == '+' or selfval[0] == '-'): + # copy sign to first position + builder.append(selfval[0]) + start = 1 + else: + start = 0 + builder.append_multiple_char('0', num_zeros) + builder.append_slice(selfval, start, len(selfval)) + return W_UnicodeObject(builder.build(), width) + + 
@unwrap_spec(maxsplit=int) + def descr_split(self, space, w_sep=None, maxsplit=-1): + res = [] + value = self._utf8 + if space.is_none(w_sep): + res = split(value, maxsplit=maxsplit, isutf8=True) + return space.newlist_utf8(res, self.is_ascii()) + + by = self.convert_arg_to_w_unicode(space, w_sep)._utf8 + if len(by) == 0: + raise oefmt(space.w_ValueError, "empty separator") + res = split(value, by, maxsplit, isutf8=True) + + return space.newlist_utf8(res, self.is_ascii()) + + @unwrap_spec(maxsplit=int) + def descr_rsplit(self, space, w_sep=None, maxsplit=-1): + res = [] + value = self._utf8 + if space.is_none(w_sep): + res = rsplit(value, maxsplit=maxsplit, isutf8=True) + return space.newlist_utf8(res, self.is_ascii()) + + by = self.convert_arg_to_w_unicode(space, w_sep)._utf8 + if len(by) == 0: + raise oefmt(space.w_ValueError, "empty separator") + res = rsplit(value, by, maxsplit, isutf8=True) + + return space.newlist_utf8(res, self.is_ascii()) + + def descr_getitem(self, space, w_index): + if isinstance(w_index, W_SliceObject): + length = self._len() + start, stop, step, sl = w_index.indices4(space, length) + if sl == 0: + return self._empty() + elif step == 1: + assert start >= 0 and stop >= 0 + return self._unicode_sliced(space, start, stop) + else: + return self._getitem_slice_slowpath(space, start, step, sl) + + index = space.getindex_w(w_index, space.w_IndexError, "string index") + return self._getitem_result(space, index) + + def _getitem_slice_slowpath(self, space, start, step, sl): + # XXX same comment as in _unicode_sliced + builder = StringBuilder(step * sl) + byte_pos = self._index_to_byte(start) + i = 0 + while True: + next_pos = rutf8.next_codepoint_pos(self._utf8, byte_pos) + builder.append(self._utf8[byte_pos:next_pos]) + if i == sl - 1: + break + i += 1 + byte_pos = self._index_to_byte(start + i * step) + return W_UnicodeObject(builder.build(), sl) + + def descr_getslice(self, space, w_start, w_stop): + start, stop = normalize_simple_slice( + space, self._len(), w_start, w_stop) + if start == stop: + return self._empty() + else: + return self._unicode_sliced(space, start, stop) + + def _unicode_sliced(self, space, start, stop): + # XXX maybe some heuristic, like first slice does not create + # full index, but second does? 
+ assert start >= 0 + assert stop >= 0 + byte_start = self._index_to_byte(start) + byte_stop = self._index_to_byte(stop) + return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start) + + def descr_capitalize(self, space): + value = self._utf8 + if len(value) == 0: + return self._empty() + + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + it = rutf8.Utf8StringIterator(self._utf8) + uchar = it.next() + ch = unicodedb.toupper(uchar) + builder.append_code(ch) + for ch in it: + ch = unicodedb.tolower(ch) + builder.append_code(ch) + return self.from_utf8builder(builder) + + @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) + def descr_center(self, space, width, w_fillchar): + value = self._utf8 + fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)._utf8 + if len(fillchar) != 1: + raise oefmt(space.w_TypeError, + "center() argument 2 must be a single character") + + d = width - self._len() + if d > 0: + offset = d//2 + (d & width & 1) + fillchar = fillchar[0] + centered = offset * fillchar + value + (d - offset) * fillchar + else: + centered = value + d = 0 + + return W_UnicodeObject(centered, self._len() + d) + + def descr_count(self, space, w_sub, w_start=None, w_end=None): + value = self._utf8 + start_index, end_index = self._unwrap_and_compute_idx_params( + space, w_start, w_end) + sub = self.convert_arg_to_w_unicode(space, w_sub)._utf8 + return space.newint(value.count(sub, start_index, end_index)) + + def descr_contains(self, space, w_sub): + value = self._utf8 + w_other = self.convert_arg_to_w_unicode(space, w_sub) + return space.newbool(value.find(w_other._utf8) >= 0) + + def descr_partition(self, space, w_sub): + value = self._utf8 + sub = self.convert_arg_to_w_unicode(space, w_sub) + sublen = sub._len() + if sublen == 0: + raise oefmt(space.w_ValueError, "empty separator") + + pos = value.find(sub._utf8) + + if pos < 0: + return space.newtuple([self, self._empty(), self._empty()]) + else: + lgt = rutf8.check_utf8(value, True, stop=pos) + return space.newtuple( + [W_UnicodeObject(value[0:pos], lgt), w_sub, + W_UnicodeObject(value[pos + len(sub._utf8):len(value)], + self._len() - lgt - sublen)]) + + def descr_rpartition(self, space, w_sub): + value = self._utf8 + sub = self.convert_arg_to_w_unicode(space, w_sub) + sublen = sub._len() + if sublen == 0: + raise oefmt(space.w_ValueError, "empty separator") + + pos = value.rfind(sub._utf8) + + if pos < 0: + return space.newtuple([self._empty(), self._empty(), self]) + else: + lgt = rutf8.check_utf8(value, True, stop=pos) + return space.newtuple( + [W_UnicodeObject(value[0:pos], lgt), w_sub, + W_UnicodeObject(value[pos + len(sub._utf8):len(value)], + self._len() - lgt - sublen)]) + + @unwrap_spec(count=int) + def descr_replace(self, space, w_old, w_new, count=-1): + input = self._utf8 + + w_sub = self.convert_arg_to_w_unicode(space, w_old) + w_by = self.convert_arg_to_w_unicode(space, w_new) + # the following two lines are for being bug-to-bug compatible + # with CPython: see issue #2448 + if count >= 0 and len(input) == 0: + return self._empty() + try: + res, replacements = replace_count(input, w_sub._utf8, w_by._utf8, + count, isutf8=True) + except OverflowError: + raise oefmt(space.w_OverflowError, "replace string is too long") + + newlength = self._length + replacements * (w_by._length - w_sub._length) + return W_UnicodeObject(res, newlength) + + def descr_mul(self, space, w_times): + try: + times = space.getindex_w(w_times, space.w_OverflowError) + except OperationError as e: + if e.match(space, 
space.w_TypeError): + return space.w_NotImplemented + raise + if times <= 0: + return self._empty() + if len(self._utf8) == 1: + return W_UnicodeObject(self._utf8[0] * times, times) + return W_UnicodeObject(self._utf8 * times, times * self._len()) + + descr_rmul = descr_mul + + def _get_index_storage(self): + return jit.conditional_call_elidable(self._index_storage, + W_UnicodeObject._compute_index_storage, self) + + def _compute_index_storage(self): + storage = rutf8.create_utf8_index_storage(self._utf8, self._length) + self._index_storage = storage + return storage + + def _getitem_result(self, space, index): + if index < 0: + index += self._length + if index < 0 or index >= self._length: + raise oefmt(space.w_IndexError, "string index out of range") + start = self._index_to_byte(index) + end = rutf8.next_codepoint_pos(self._utf8, start) + return W_UnicodeObject(self._utf8[start:end], 1) + + def is_ascii(self): + return self._length == len(self._utf8) + + def _has_surrogates(self): + if self.is_ascii(): + return False + return rutf8.has_surrogates(self._utf8) + + def _index_to_byte(self, index): + if self.is_ascii(): + assert index >= 0 + return index + return rutf8.codepoint_position_at_index( + self._utf8, self._get_index_storage(), index) + + @always_inline + def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True): + w_sub = self.convert_arg_to_w_unicode(space, w_sub) + start, end = unwrap_start_stop(space, self._length, w_start, w_end) + if start == 0: + start_index = 0 + elif start > self._length: + return None + else: + start_index = self._index_to_byte(start) + + if end >= self._length: + end = self._length + end_index = len(self._utf8) + else: + end_index = self._index_to_byte(end) + + if forward: + res_index = self._utf8.find(w_sub._utf8, start_index, end_index) + if res_index < 0: + return None + skip = rutf8.codepoints_in_utf8(self._utf8, start_index, res_index) + res = start + skip + assert res >= 0 + return space.newint(res) + else: + res_index = self._utf8.rfind(w_sub._utf8, start_index, end_index) + if res_index < 0: + return None + skip = rutf8.codepoints_in_utf8(self._utf8, res_index, end_index) + res = end - skip + assert res >= 0 + return space.newint(res) + + def _unwrap_and_compute_idx_params(self, space, w_start, w_end): + # unwrap start and stop indices, optimized for the case where + # start == 0 and end == self._length. Note that 'start' and + # 'end' are measured in codepoints whereas 'start_index' and + # 'end_index' are measured in bytes. 
+ start, end = unwrap_start_stop(space, self._length, w_start, w_end) + start_index = 0 + end_index = len(self._utf8) + 1 + if start > 0: + if start > self._length: + start_index = end_index + else: + start_index = self._index_to_byte(start) + if end < self._length: + end_index = self._index_to_byte(end) + return (start_index, end_index) + + @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) + def descr_rjust(self, space, width, w_fillchar): + value = self._utf8 + lgt = self._len() + w_fillchar = self.convert_arg_to_w_unicode(space, w_fillchar) + if w_fillchar._len() != 1: + raise oefmt(space.w_TypeError, + "rjust() argument 2 must be a single character") + d = width - lgt + if d > 0: + if len(w_fillchar._utf8) == 1: + # speedup + value = d * w_fillchar._utf8[0] + value + else: + value = d * w_fillchar._utf8 + value + return W_UnicodeObject(value, width) + + return W_UnicodeObject(value, lgt) + + @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) + def descr_ljust(self, space, width, w_fillchar): + value = self._utf8 + w_fillchar = self.convert_arg_to_w_unicode(space, w_fillchar) + if w_fillchar._len() != 1: + raise oefmt(space.w_TypeError, + "ljust() argument 2 must be a single character") + d = width - self._len() + if d > 0: + if len(w_fillchar._utf8) == 1: + # speedup + value = value + d * w_fillchar._utf8[0] + else: + value = value + d * w_fillchar._utf8 + return W_UnicodeObject(value, width) + + return W_UnicodeObject(value, self._len()) + + def _utf8_sliced(self, start, stop, lgt): + assert start >= 0 + assert stop >= 0 + #if start == 0 and stop == len(s) and space.is_w(space.type(orig_obj), + # space.w_bytes): + # return orig_obj + return W_UnicodeObject(self._utf8[start:stop], lgt) + + def _strip_none(self, space, left, right): + "internal function called by str_xstrip methods" + value = self._utf8 + + lpos = 0 + rpos = len(value) + lgt = self._len() + + if left: + while lpos < rpos and rutf8.isspace(value, lpos): + lpos = rutf8.next_codepoint_pos(value, lpos) + lgt -= 1 + + if right: + while rpos > lpos and rutf8.isspace(value, + rutf8.prev_codepoint_pos(value, rpos)): + rpos = rutf8.prev_codepoint_pos(value, rpos) + lgt -= 1 + + assert rpos >= lpos # annotator hint, don't remove + return self._utf8_sliced(lpos, rpos, lgt) + + def _strip(self, space, w_chars, left, right, name='strip'): + "internal function called by str_xstrip methods" + value = self._utf8 + chars = self.convert_arg_to_w_unicode(space, w_chars, strict=name)._utf8 + + lpos = 0 + rpos = len(value) + lgt = self._len() + + if left: + while lpos < rpos and rutf8.utf8_in_chars(value, lpos, chars): + lpos = rutf8.next_codepoint_pos(value, lpos) + lgt -= 1 + + if right: + while rpos > lpos and rutf8.utf8_in_chars(value, + rutf8.prev_codepoint_pos(value, rpos), chars): + rpos = rutf8.prev_codepoint_pos(value, rpos) + lgt -= 1 + + assert rpos >= lpos # annotator hint, don't remove + return self._utf8_sliced(lpos, rpos, lgt) + + def descr_getnewargs(self, space): + return space.newtuple([W_UnicodeObject(self._utf8, self._length)]) + _starts_ends_unicode = True @@ -445,68 +1071,57 @@ def _get_encoding_and_errors(space, w_encoding, w_errors): return encoding, errors -def encode_object(space, w_object, encoding, errors): - if encoding is None: - # Get the encoder functions as a wrapped object. - # This lookup is cached. 
- w_encoder = space.sys.get_w_default_encoder() - else: - if errors is None or errors == 'strict': - if encoding == 'ascii': - u = space.unicode_w(w_object) - eh = unicodehelper.encode_error_handler(space) - return space.newbytes(unicode_encode_ascii( - u, len(u), None, errorhandler=eh)) - if encoding == 'utf-8': - u = space.unicode_w(w_object) - eh = unicodehelper.encode_error_handler(space) - return space.newbytes(unicode_encode_utf_8( - u, len(u), None, errorhandler=eh, - allow_surrogates=True)) - from pypy.module._codecs.interp_codecs import lookup_codec - w_encoder = space.getitem(lookup_codec(space, encoding), space.newint(0)) - if errors is None: - w_errors = space.newtext('strict') - else: - w_errors = space.newtext(errors) - w_restuple = space.call_function(w_encoder, w_object, w_errors) - w_retval = space.getitem(w_restuple, space.newint(0)) - if not space.isinstance_w(w_retval, space.w_bytes): - raise oefmt(space.w_TypeError, - "encoder did not return an string object (type '%T')", - w_retval) - return w_retval +def encode_object(space, w_obj, encoding, errors): + from pypy.module._codecs.interp_codecs import encode + if errors is None or errors == 'strict': + # fast path + if ((encoding is None and space.sys.defaultencoding == 'ascii') or + encoding == 'ascii'): + s = space.utf8_w(w_obj) + try: + rutf8.check_ascii(s) + except rutf8.CheckError as a: + if space.isinstance_w(w_obj, space.w_unicode): + eh = unicodehelper.encode_error_handler(space) + else: + # must be a bytes-like object. In order to encode it, + # first "decode" to unicode. Since we cannot, raise a + # UnicodeDecodeError, not a UnicodeEncodeError + eh = unicodehelper.decode_error_handler(space) + eh(None, "ascii", "ordinal not in range(128)", s, + a.pos, a.pos + 1) + assert False, "always raises" + return space.newbytes(s) + if ((encoding is None and space.sys.defaultencoding == 'utf8') or + encoding == 'utf-8' or encoding == 'utf8' or encoding == 'UTF-8'): + utf8 = space.utf8_w(w_obj) + if rutf8.has_surrogates(utf8): + utf8 = rutf8.reencode_utf8_with_surrogates(utf8) + return space.newbytes(utf8) + return encode(space, w_obj, encoding, errors) def decode_object(space, w_obj, encoding, errors): - if encoding is None: - encoding = getdefaultencoding(space) + from pypy.module._codecs.interp_codecs import lookup_codec, decode if errors is None or errors == 'strict': + # fast paths + if encoding is None: + encoding = getdefaultencoding(space) if encoding == 'ascii': - # XXX error handling s = space.charbuf_w(w_obj) - try: - u = fast_str_decode_ascii(s) - except ValueError: - eh = unicodehelper.decode_error_handler(space) - u = str_decode_ascii( # try again, to get the error right - s, len(s), None, final=True, errorhandler=eh)[0] - return space.newunicode(u) - if encoding == 'utf-8': - s = space.charbuf_w(w_obj) - eh = unicodehelper.decode_error_handler(space) - return space.newunicode(str_decode_utf_8( - s, len(s), None, final=True, errorhandler=eh, - allow_surrogates=True)[0]) - w_codecs = space.getbuiltinmodule("_codecs") - w_decode = space.getattr(w_codecs, space.newtext("decode")) - if errors is None: - w_retval = space.call_function(w_decode, w_obj, space.newtext(encoding)) - else: - w_retval = space.call_function(w_decode, w_obj, space.newtext(encoding), - space.newtext(errors)) - return w_retval - + unicodehelper.check_ascii_or_raise(space, s) + return space.newutf8(s, len(s)) + if encoding == 'utf-8' or encoding == 'utf8': + if (space.isinstance_w(w_obj, space.w_unicode) or + space.isinstance_w(w_obj, 
space.w_bytes)): + s = space.utf8_w(w_obj) + else: + s = space.charbuf_w(w_obj) + lgt = unicodehelper.check_utf8_or_raise(space, s) + return space.newutf8(s, lgt) + if encoding is None: + encoding = space.sys.defaultencoding + return decode(space, w_obj, encoding, errors) def unicode_from_encoded_object(space, w_obj, encoding, errors): # explicitly block bytearray on 2.7 @@ -534,7 +1149,7 @@ def unicode_from_object(space, w_obj): # test_unicode_conversion_with__str__ if w_unicode_method is None: if space.isinstance_w(w_obj, space.w_unicode): - return space.newunicode(space.unicode_w(w_obj)) + return space.convert_arg_to_w_unicode(w_obj) w_unicode_method = space.lookup(w_obj, "__str__") if w_unicode_method is not None: w_res = space.get_and_call_function(w_unicode_method, w_obj) @@ -551,11 +1166,8 @@ def unicode_from_string(space, w_bytes): if encoding != 'ascii': return unicode_from_encoded_object(space, w_bytes, encoding, "strict") s = space.bytes_w(w_bytes) - try: - return W_UnicodeObject(s.decode("ascii")) - except UnicodeDecodeError: - # raising UnicodeDecodeError is messy, "please crash for me" - return unicode_from_encoded_object(space, w_bytes, "ascii", "strict") + unicodehelper.check_ascii_or_raise(space, s) + return W_UnicodeObject(s, len(s)) class UnicodeDocstrings: @@ -1102,38 +1714,42 @@ def _create_list_from_unicode(value): return [s for s in value] -W_UnicodeObject.EMPTY = W_UnicodeObject(u'') +W_UnicodeObject.EMPTY = W_UnicodeObject('', 0) # Helper for converting int/long def unicode_to_decimal_w(space, w_unistr): if not isinstance(w_unistr, W_UnicodeObject): raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr) - unistr = w_unistr._value - result = ['\0'] * len(unistr) + unistr = w_unistr._utf8 + result = ['\0'] * w_unistr._length digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] - for i in xrange(len(unistr)): - uchr = ord(unistr[i]) - if unicodedb.isspace(uchr): - result[i] = ' ' + res_pos = 0 + iter = rutf8.Utf8StringIterator(unistr) + for uchr in iter: + if W_UnicodeObject._isspace(uchr): + result[res_pos] = ' ' + res_pos += 1 continue try: - result[i] = digits[unicodedb.decimal(uchr)] + result[res_pos] = digits[unicodedb.decimal(uchr)] except KeyError: if 0 < uchr < 256: - result[i] = chr(uchr) + result[res_pos] = chr(uchr) else: w_encoding = space.newtext('decimal') - w_start = space.newint(i) - w_end = space.newint(i+1) + pos = iter.get_pos() + w_start = space.newint(pos) + w_end = space.newint(pos+1) w_reason = space.newtext('invalid decimal Unicode string') raise OperationError(space.w_UnicodeEncodeError, space.newtuple([w_encoding, w_unistr, w_start, w_end, w_reason])) + res_pos += 1 return ''.join(result) -_repr_function, _ = make_unicode_escape_function( - pass_printable=False, unicode_output=False, quotes=True, prefix='u') +_repr_function = rutf8.make_utf8_escape_function( + pass_printable=False, quotes=True, prefix='u') |
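
The core change in this merge is that `W_UnicodeObject` now stores a UTF-8 encoded `str` (`_utf8`) plus a codepoint count (`_length`) instead of an RPython `unicode`, so indexing has to translate codepoint indices into byte offsets (see `_index_to_byte` and the `rutf8` helpers in the diff above). Below is a minimal plain-Python sketch of that mapping, assuming valid UTF-8 input; the helper names are illustrative, not the `rutf8` API, and the real code additionally caches an index storage (`_get_index_storage`) so repeated lookups avoid rescanning.

```python
# Illustrative sketch, not the rutf8 API: map a codepoint index to the
# byte offset of that codepoint inside a valid UTF-8 byte string.

def next_codepoint_pos(utf8, pos):
    # The leading byte of a UTF-8 sequence encodes its total length.
    first = ord(utf8[pos:pos + 1])
    if first < 0x80:
        return pos + 1      # 1-byte (ASCII) sequence
    elif first < 0xE0:
        return pos + 2      # 2-byte sequence, leading byte 110xxxxx
    elif first < 0xF0:
        return pos + 3      # 3-byte sequence, leading byte 1110xxxx
    else:
        return pos + 4      # 4-byte sequence, leading byte 11110xxx

def codepoint_index_to_byte(utf8, index):
    # Linear walk from the start; the diff's W_UnicodeObject avoids this
    # for repeated lookups by lazily building an index storage instead.
    pos = 0
    for _ in range(index):
        pos = next_codepoint_pos(utf8, pos)
    return pos

if __name__ == "__main__":
    s = u"caf\xe9 au lait".encode("utf-8")
    # Codepoint 3 is the two-byte e-acute, so codepoint 4 starts at byte 5.
    assert codepoint_index_to_byte(s, 4) == 5
    assert s[5:6] == b" "
```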