author    Ronan Lamy <ronan.lamy@gmail.com>    2019-03-12 18:18:02 +0000
committer Ronan Lamy <ronan.lamy@gmail.com>    2019-03-12 18:18:02 +0000
commit    6eac96a58627e5638969e92335ac307c533aa644 (patch)
tree      49b3e1101750ce1ebcebb479fff383d62b86a805 /pypy/objspace/std/unicodeobject.py
parent    hg merge default (diff)
parent    reduce code duplication (diff)
download  pypy-6eac96a58627e5638969e92335ac307c533aa644.tar.gz
          pypy-6eac96a58627e5638969e92335ac307c533aa644.tar.bz2
          pypy-6eac96a58627e5638969e92335ac307c533aa644.zip
hg merge default
Diffstat (limited to 'pypy/objspace/std/unicodeobject.py')
-rw-r--r--    pypy/objspace/std/unicodeobject.py    1038
1 file changed, 827 insertions(+), 211 deletions(-)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
index 5cea4b4802..efd72611c0 100644
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1,14 +1,15 @@
"""The builtin unicode implementation"""
from rpython.rlib.objectmodel import (
- compute_hash, compute_unique_id, import_from_mixin,
- enforceargs)
+ compute_hash, compute_unique_id, import_from_mixin, always_inline,
+ enforceargs, newlist_hint, specialize, we_are_translated)
from rpython.rlib.buffer import StringBuffer
from rpython.rlib.mutbuffer import MutableStringBuffer
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
-from rpython.rlib.runicode import (
- make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
- unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii)
+from rpython.rlib.rarithmetic import ovfcheck
+from rpython.rlib.rstring import (
+ StringBuilder, split, rsplit, UnicodeBuilder, replace_count, startswith,
+ endswith)
+from rpython.rlib import rutf8, jit
from pypy.interpreter import unicodehelper
from pypy.interpreter.baseobjspace import W_Root
@@ -19,6 +20,8 @@ from pypy.module.unicodedata.interp_ucd import unicodedb
from pypy.objspace.std import newformat
from pypy.objspace.std.basestringtype import basestring_typedef
from pypy.objspace.std.formatting import mod_format
+from pypy.objspace.std.sliceobject import (W_SliceObject,
+ unwrap_start_stop, normalize_simple_slice)
from pypy.objspace.std.stringmethods import StringMethods
from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT
@@ -29,25 +32,33 @@ __all__ = ['W_UnicodeObject', 'wrapunicode', 'plain_str2unicode',
class W_UnicodeObject(W_Root):
import_from_mixin(StringMethods)
- _immutable_fields_ = ['_value']
+ _immutable_fields_ = ['_utf8']
+
+ @enforceargs(utf8str=str)
+ def __init__(self, utf8str, length):
+ assert isinstance(utf8str, str)
+ assert length >= 0
+ self._utf8 = utf8str
+ self._length = length
+ self._index_storage = rutf8.null_storage()
+ if not we_are_translated():
+ try:
+ # best effort, too expensive to handle surrogates
+ ulength = rutf8.codepoints_in_utf8(utf8str)
+ except:
+ ulength = length
+ assert ulength == length
- @enforceargs(uni=unicode)
- def __init__(self, unistr):
- assert isinstance(unistr, unicode)
- self._value = unistr
- def __repr__(self):
- """representation for debugging purposes"""
- return "%s(%r)" % (self.__class__.__name__, self._value)
- def unwrap(self, space):
- # for testing
- return self._value
+ @staticmethod
+ def from_utf8builder(builder):
+ return W_UnicodeObject(
+ builder.build(), builder.getlength())
- def create_if_subclassed(self):
- if type(self) is W_UnicodeObject:
- return self
- return W_UnicodeObject(self._value)
+ def __repr__(self):
+ """representation for debugging purposes"""
+ return "%s(%r)" % (self.__class__.__name__, self._utf8)
def is_w(self, space, w_other):
if not isinstance(w_other, W_UnicodeObject):
@@ -56,9 +67,9 @@ class W_UnicodeObject(W_Root):
return True
if self.user_overridden_class or w_other.user_overridden_class:
return False
- s1 = space.unicode_w(self)
- s2 = space.unicode_w(w_other)
- if len(s2) > 1:
+ s1 = space.utf8_w(self)
+ s2 = space.utf8_w(w_other)
+ if len(s2) > 2:
return s1 is s2
else: # strings of len <= 1 are unique-ified
return s1 == s2
@@ -66,61 +77,62 @@ class W_UnicodeObject(W_Root):
def immutable_unique_id(self, space):
if self.user_overridden_class:
return None
- s = space.unicode_w(self)
- if len(s) > 1:
+ s = space.utf8_w(self)
+ if len(s) > 2:
uid = compute_unique_id(s)
else: # strings of len <= 1 are unique-ified
if len(s) == 1:
base = ~ord(s[0]) # negative base values
+ elif len(s) == 2:
+ base = ~((ord(s[1]) << 8) | ord(s[0]))
else:
base = 257 # empty unicode string: base value 257
uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL
return space.newint(uid)
def str_w(self, space):
- return space.text_w(space.str(self))
+ return space.text_w(encode_object(space, self, 'ascii', 'strict'))
- def unicode_w(self, space):
- return self._value
+ def utf8_w(self, space):
+ return self._utf8
def readbuf_w(self, space):
- from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE
- buf = MutableStringBuffer(len(self._value) * UNICODE_SIZE)
+ # XXX for now
+ from rpython.rlib.rstruct.unichar import pack_codepoint, UNICODE_SIZE
+ builder = MutableStringBuffer(self._len() * UNICODE_SIZE)
pos = 0
- for unich in self._value:
- pack_unichar(unich, buf, pos)
+ i = 0
+ while i < len(self._utf8):
+ unich = rutf8.codepoint_at_pos(self._utf8, i)
+ pack_codepoint(unich, builder, pos)
pos += UNICODE_SIZE
- return StringBuffer(buf.finish())
+ i = rutf8.next_codepoint_pos(self._utf8, i)
+ return StringBuffer(builder.finish())
def writebuf_w(self, space):
raise oefmt(space.w_TypeError,
"cannot use unicode as modifiable buffer")
- charbuf_w = str_w
+ def charbuf_w(self, space):
+ # Returns ascii-encoded str
+ return space.text_w(encode_object(space, self, 'ascii', 'strict'))
- def listview_unicode(self):
- return _create_list_from_unicode(self._value)
+ def listview_utf8(self):
+ assert self.is_ascii()
+ return _create_list_from_unicode(self._utf8)
def ord(self, space):
- if len(self._value) != 1:
+ if self._len() != 1:
raise oefmt(space.w_TypeError,
"ord() expected a character, but string of length %d "
- "found", len(self._value))
- return space.newint(ord(self._value[0]))
-
- def _new(self, value):
- return W_UnicodeObject(value)
-
- def _new_from_list(self, value):
- return W_UnicodeObject(u''.join(value))
+ "found", self._len())
+ return space.newint(rutf8.codepoint_at_pos(self._utf8, 0))
def _empty(self):
return W_UnicodeObject.EMPTY
def _len(self):
- return len(self._value)
-
- _val = unicode_w
+ return self._length
@staticmethod
def _use_rstr_ops(space, w_other):
@@ -129,67 +141,64 @@ class W_UnicodeObject(W_Root):
return True
@staticmethod
- def _op_val(space, w_other, strict=None):
- if isinstance(w_other, W_UnicodeObject):
- return w_other._value
+ def convert_arg_to_w_unicode(space, w_other, strict=None):
+ if space.is_w(space.type(w_other), space.w_unicode):
+ # XXX why do we need this for translation???
+ assert isinstance(w_other, W_UnicodeObject)
+ return w_other
if space.isinstance_w(w_other, space.w_bytes):
- return unicode_from_string(space, w_other)._value
+ return unicode_from_string(space, w_other)
if strict:
raise oefmt(space.w_TypeError,
"%s arg must be None, unicode or str", strict)
- return unicode_from_encoded_object(
- space, w_other, None, "strict")._value
+ return unicode_from_encoded_object(space, w_other, 'utf8', "strict")
+
+ def convert_to_w_unicode(self, space):
+ return self
+ @specialize.argtype(1)
def _chr(self, char):
assert len(char) == 1
- return unicode(char)[0]
+ return unichr(ord(char[0]))
+
+ def _multi_chr(self, unichar):
+ return unichar
_builder = UnicodeBuilder
def _isupper(self, ch):
- return unicodedb.isupper(ord(ch))
+ return unicodedb.isupper(ch)
def _islower(self, ch):
- return unicodedb.islower(ord(ch))
+ return unicodedb.islower(ch)
def _isnumeric(self, ch):
- return unicodedb.isnumeric(ord(ch))
+ return unicodedb.isnumeric(ch)
def _istitle(self, ch):
- return unicodedb.isupper(ord(ch)) or unicodedb.istitle(ord(ch))
+ return unicodedb.isupper(ch) or unicodedb.istitle(ch)
- def _isspace(self, ch):
- return unicodedb.isspace(ord(ch))
+ @staticmethod
+ def _isspace(ch):
+ return unicodedb.isspace(ch)
def _isalpha(self, ch):
- return unicodedb.isalpha(ord(ch))
+ return unicodedb.isalpha(ch)
def _isalnum(self, ch):
- return unicodedb.isalnum(ord(ch))
+ return unicodedb.isalnum(ch)
def _isdigit(self, ch):
- return unicodedb.isdigit(ord(ch))
+ return unicodedb.isdigit(ch)
def _isdecimal(self, ch):
- return unicodedb.isdecimal(ord(ch))
+ return unicodedb.isdecimal(ch)
def _iscased(self, ch):
- return unicodedb.iscased(ord(ch))
+ return unicodedb.iscased(ch)
def _islinebreak(self, ch):
- return unicodedb.islinebreak(ord(ch))
-
- def _upper(self, ch):
- return unichr(unicodedb.toupper(ord(ch)))
-
- def _lower(self, ch):
- return unichr(unicodedb.tolower(ord(ch)))
-
- def _title(self, ch):
- return unichr(unicodedb.totitle(ord(ch)))
-
- def _newlist_unwrapped(self, space, lst):
- return space.newlist_unicode(lst)
+ return unicodedb.islinebreak(ch)
@staticmethod
def descr_new(space, w_unicodetype, w_string=None, w_encoding=None,
@@ -212,26 +221,35 @@ class W_UnicodeObject(W_Root):
assert isinstance(w_value, W_UnicodeObject)
w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype)
- W_UnicodeObject.__init__(w_newobj, w_value._value)
+ W_UnicodeObject.__init__(w_newobj, w_value._utf8, w_value._length)
+ if w_value._index_storage:
+ # copy the storage if it's there
+ w_newobj._index_storage = w_value._index_storage
return w_newobj
def descr_repr(self, space):
- chars = self._value
- size = len(chars)
- s = _repr_function(chars, size, "strict")
- return space.newtext(s)
+ return space.newtext(_repr_function(self._utf8))
def descr_str(self, space):
- return encode_object(space, self, None, None)
+ return encode_object(space, self, 'ascii', 'strict')
- def descr_hash(self, space):
- x = compute_hash(self._value)
+ def hash_w(self):
+ # shortcut for UnicodeDictStrategy
+ x = compute_hash(self._utf8)
x -= (x == -1) # convert -1 to -2 without creating a bridge
- return space.newint(x)
+ return x
+
+ def descr_hash(self, space):
+ return space.newint(self.hash_w())
+
+ def eq_w(self, w_other):
+ # shortcut for UnicodeDictStrategy
+ assert isinstance(w_other, W_UnicodeObject)
+ return self._utf8 == w_other._utf8
def descr_eq(self, space, w_other):
try:
- res = self._val(space) == self._op_val(space, w_other)
+ res = self._utf8 == self.convert_arg_to_w_unicode(space, w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -247,7 +265,7 @@ class W_UnicodeObject(W_Root):
def descr_ne(self, space, w_other):
try:
- res = self._val(space) != self._op_val(space, w_other)
+ res = self._utf8 != self.convert_arg_to_w_unicode(space, w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -263,7 +281,7 @@ class W_UnicodeObject(W_Root):
def descr_lt(self, space, w_other):
try:
- res = self._val(space) < self._op_val(space, w_other)
+ res = self._utf8 < self.convert_arg_to_w_unicode(space, w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -272,7 +290,7 @@ class W_UnicodeObject(W_Root):
def descr_le(self, space, w_other):
try:
- res = self._val(space) <= self._op_val(space, w_other)
+ res = self._utf8 <= self.convert_arg_to_w_unicode(space, w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -281,7 +299,7 @@ class W_UnicodeObject(W_Root):
def descr_gt(self, space, w_other):
try:
- res = self._val(space) > self._op_val(space, w_other)
+ res = self._utf8 > self.convert_arg_to_w_unicode(space, w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -290,7 +308,7 @@ class W_UnicodeObject(W_Root):
def descr_ge(self, space, w_other):
try:
- res = self._val(space) >= self._op_val(space, w_other)
+ res = self._utf8 >= self.convert_arg_to_w_unicode(space, w_other)._utf8
except OperationError as e:
if e.match(space, space.w_TypeError):
return space.w_NotImplemented
@@ -303,11 +321,11 @@ class W_UnicodeObject(W_Root):
def descr__format__(self, space, w_format_spec):
if not space.isinstance_w(w_format_spec, space.w_unicode):
w_format_spec = space.call_function(space.w_unicode, w_format_spec)
- spec = space.unicode_w(w_format_spec)
+ spec = space.utf8_w(w_format_spec)
formatter = newformat.unicode_formatter(space, spec)
self2 = unicode_from_object(space, self)
assert isinstance(self2, W_UnicodeObject)
- return formatter.format_string(self2._value)
+ return formatter.format_string(self2)
def descr_mod(self, space, w_values):
return mod_format(space, self, w_values, do_unicode=True)
@@ -315,71 +333,169 @@ class W_UnicodeObject(W_Root):
def descr_rmod(self, space, w_values):
return mod_format(space, w_values, self, do_unicode=True)
+ def descr_swapcase(self, space):
+ input = self._utf8
+ builder = rutf8.Utf8StringBuilder(len(input))
+ for ch in rutf8.Utf8StringIterator(input):
+ if unicodedb.isupper(ch):
+ ch = unicodedb.tolower(ch)
+ elif unicodedb.islower(ch):
+ ch = unicodedb.toupper(ch)
+ builder.append_code(ch)
+ return self.from_utf8builder(builder)
+
+ def descr_title(self, space):
+ if len(self._utf8) == 0:
+ return self
+ return self.title_unicode(self._utf8)
+
+ @jit.elidable
+ def title_unicode(self, value):
+ input = self._utf8
+ builder = rutf8.Utf8StringBuilder(len(input))
+ previous_is_cased = False
+ for ch0 in rutf8.Utf8StringIterator(input):
+ if not previous_is_cased:
+ ch1 = unicodedb.totitle(ch0)
+ else:
+ ch1 = unicodedb.tolower(ch0)
+ builder.append_code(ch1)
+ previous_is_cased = unicodedb.iscased(ch0)
+ return self.from_utf8builder(builder)
+
def descr_translate(self, space, w_table):
- selfvalue = self._value
- w_sys = space.getbuiltinmodule('sys')
- maxunicode = space.int_w(space.getattr(w_sys,
- space.newtext("maxunicode")))
- result = []
- for unichar in selfvalue:
+ builder = rutf8.Utf8StringBuilder(len(self._utf8))
+ for codepoint in rutf8.Utf8StringIterator(self._utf8):
try:
- w_newval = space.getitem(w_table, space.newint(ord(unichar)))
+ w_newval = space.getitem(w_table, space.newint(codepoint))
except OperationError as e:
- if e.match(space, space.w_LookupError):
- result.append(unichar)
- else:
+ if not e.match(space, space.w_LookupError):
raise
else:
if space.is_w(w_newval, space.w_None):
continue
elif space.isinstance_w(w_newval, space.w_int):
- newval = space.int_w(w_newval)
- if newval < 0 or newval > maxunicode:
- raise oefmt(space.w_TypeError,
- "character mapping must be in range(%s)",
- hex(maxunicode + 1))
- result.append(unichr(newval))
- elif space.isinstance_w(w_newval, space.w_unicode):
- result.append(space.unicode_w(w_newval))
+ codepoint = space.int_w(w_newval)
+ elif isinstance(w_newval, W_UnicodeObject):
+ builder.append_utf8(w_newval._utf8, w_newval._length)
+ continue
else:
raise oefmt(space.w_TypeError,
"character mapping must return integer, None "
"or unicode")
- return W_UnicodeObject(u''.join(result))
+ try:
+ builder.append_code(codepoint)
+ except ValueError:
+ raise oefmt(space.w_TypeError,
+ "character mapping must be in range(0x110000)")
+ return self.from_utf8builder(builder)
+
+ def descr_find(self, space, w_sub, w_start=None, w_end=None):
+ w_result = self._unwrap_and_search(space, w_sub, w_start, w_end)
+ if w_result is None:
+ w_result = space.newint(-1)
+ return w_result
+
+ def descr_rfind(self, space, w_sub, w_start=None, w_end=None):
+ w_result = self._unwrap_and_search(space, w_sub, w_start, w_end,
+ forward=False)
+ if w_result is None:
+ w_result = space.newint(-1)
+ return w_result
+
+ def descr_index(self, space, w_sub, w_start=None, w_end=None):
+ w_result = self._unwrap_and_search(space, w_sub, w_start, w_end)
+ if w_result is None:
+ raise oefmt(space.w_ValueError,
+ "substring not found in string.index")
+ return w_result
+
+ def descr_rindex(self, space, w_sub, w_start=None, w_end=None):
+ w_result = self._unwrap_and_search(space, w_sub, w_start, w_end,
+ forward=False)
+ if w_result is None:
+ raise oefmt(space.w_ValueError,
+ "substring not found in string.rindex")
+ return w_result
+
+ @specialize.arg(2)
+ def _is_generic(self, space, func_name):
+ func = getattr(self, func_name)
+ if self._length == 0:
+ return space.w_False
+ if self._length == 1:
+ return space.newbool(func(rutf8.codepoint_at_pos(self._utf8, 0)))
+ else:
+ return self._is_generic_loop(space, self._utf8, func_name)
+
+ @specialize.arg(3)
+ def _is_generic_loop(self, space, v, func_name):
+ func = getattr(self, func_name)
+ val = self._utf8
+ for uchar in rutf8.Utf8StringIterator(val):
+ if not func(uchar):
+ return space.w_False
+ return space.w_True
def descr_encode(self, space, w_encoding=None, w_errors=None):
encoding, errors = _get_encoding_and_errors(space, w_encoding,
w_errors)
return encode_object(space, self, encoding, errors)
+ @unwrap_spec(tabsize=int)
+ def descr_expandtabs(self, space, tabsize=8):
+ value = self._utf8
+ if not value:
+ return self._empty()
+
+ splitted = value.split('\t')
+
+ try:
+ if tabsize > 0:
+ ovfcheck(len(splitted) * tabsize)
+ except OverflowError:
+ raise oefmt(space.w_OverflowError, "new string is too long")
+ expanded = oldtoken = splitted.pop(0)
+ newlen = self._len() - len(splitted)
+
+ for token in splitted:
+ dist = self._tabindent(oldtoken, tabsize)
+ expanded += ' ' * dist + token
+ newlen += dist
+ oldtoken = token
+
+ return W_UnicodeObject(expanded, newlen)
+
_StringMethods_descr_join = descr_join
def descr_join(self, space, w_list):
- l = space.listview_unicode(w_list)
- if l is not None:
+ l = space.listview_utf8(w_list)
+ if l is not None and self.is_ascii():
if len(l) == 1:
- return space.newunicode(l[0])
- return space.newunicode(self._val(space).join(l))
+ return space.newutf8(l[0], len(l[0]))
+ s = self._utf8.join(l)
+ return space.newutf8(s, len(s))
return self._StringMethods_descr_join(space, w_list)
def _join_return_one(self, space, w_obj):
return space.is_w(space.type(w_obj), space.w_unicode)
- def _join_check_item(self, space, w_obj):
- if (space.isinstance_w(w_obj, space.w_bytes) or
- space.isinstance_w(w_obj, space.w_unicode)):
- return 0
- return 1
-
def descr_formatter_parser(self, space):
from pypy.objspace.std.newformat import unicode_template_formatter
- tformat = unicode_template_formatter(space, space.unicode_w(self))
+ tformat = unicode_template_formatter(space, space.utf8_w(self))
return tformat.formatter_parser()
def descr_formatter_field_name_split(self, space):
from pypy.objspace.std.newformat import unicode_template_formatter
- tformat = unicode_template_formatter(space, space.unicode_w(self))
+ tformat = unicode_template_formatter(space, space.utf8_w(self))
return tformat.formatter_field_name_split()
+ def descr_lower(self, space):
+ builder = rutf8.Utf8StringBuilder(len(self._utf8))
+ for ch in rutf8.Utf8StringIterator(self._utf8):
+ lower = unicodedb.tolower(ch)
+ builder.append_code(lower)
+ return self.from_utf8builder(builder)
+
def descr_isdecimal(self, space):
return self._is_generic(space, '_isdecimal')
@@ -388,24 +504,534 @@ class W_UnicodeObject(W_Root):
def descr_islower(self, space):
cased = False
- for uchar in self._value:
- if (unicodedb.isupper(ord(uchar)) or
- unicodedb.istitle(ord(uchar))):
+ for uchar in rutf8.Utf8StringIterator(self._utf8):
+ if (unicodedb.isupper(uchar) or
+ unicodedb.istitle(uchar)):
return space.w_False
- if not cased and unicodedb.islower(ord(uchar)):
+ if not cased and unicodedb.islower(uchar):
+ cased = True
+ return space.newbool(cased)
+
+ def descr_istitle(self, space):
+ cased = False
+ previous_is_cased = False
+ for uchar in rutf8.Utf8StringIterator(self._utf8):
+ if unicodedb.isupper(uchar) or unicodedb.istitle(uchar):
+ if previous_is_cased:
+ return space.w_False
+ previous_is_cased = True
+ cased = True
+ elif unicodedb.islower(uchar):
+ if not previous_is_cased:
+ return space.w_False
cased = True
+ else:
+ previous_is_cased = False
return space.newbool(cased)
def descr_isupper(self, space):
cased = False
- for uchar in self._value:
- if (unicodedb.islower(ord(uchar)) or
- unicodedb.istitle(ord(uchar))):
+ for uchar in rutf8.Utf8StringIterator(self._utf8):
+ if (unicodedb.islower(uchar) or
+ unicodedb.istitle(uchar)):
return space.w_False
- if not cased and unicodedb.isupper(ord(uchar)):
+ if not cased and unicodedb.isupper(uchar):
cased = True
return space.newbool(cased)
+ def descr_startswith(self, space, w_prefix, w_start=None, w_end=None):
+ start, end = self._unwrap_and_compute_idx_params(space, w_start, w_end)
+ value = self._utf8
+ if space.isinstance_w(w_prefix, space.w_tuple):
+ return self._startswith_tuple(space, value, w_prefix, start, end)
+ return space.newbool(self._startswith(space, value, w_prefix, start,
+ end))
+
+ def _startswith(self, space, value, w_prefix, start, end):
+ prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8
+ if len(prefix) == 0:
+ return True
+ return startswith(value, prefix, start, end)
+
+ def descr_endswith(self, space, w_suffix, w_start=None, w_end=None):
+ start, end = self._unwrap_and_compute_idx_params(space, w_start, w_end)
+ value = self._utf8
+ if space.isinstance_w(w_suffix, space.w_tuple):
+ return self._endswith_tuple(space, value, w_suffix, start, end)
+ return space.newbool(self._endswith(space, value, w_suffix, start,
+ end))
+
+ def _endswith(self, space, value, w_prefix, start, end):
+ prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8
+ if len(prefix) == 0:
+ return True
+ return endswith(value, prefix, start, end)
+
+ def descr_add(self, space, w_other):
+ try:
+ w_other = self.convert_arg_to_w_unicode(space, w_other)
+ except OperationError as e:
+ if e.match(space, space.w_TypeError):
+ return space.w_NotImplemented
+ raise
+ return W_UnicodeObject(self._utf8 + w_other._utf8,
+ self._len() + w_other._len())
+
+ @jit.look_inside_iff(lambda self, space, list_w, size:
+ jit.loop_unrolling_heuristic(list_w, size))
+ def _str_join_many_items(self, space, list_w, size):
+ value = self._utf8
+ lgt = self._len() * (size - 1)
+
+ prealloc_size = len(value) * (size - 1)
+ unwrapped = newlist_hint(size)
+ for i in range(size):
+ w_s = list_w[i]
+ if not (space.isinstance_w(w_s, space.w_bytes) or
+ space.isinstance_w(w_s, space.w_unicode)):
+ raise oefmt(space.w_TypeError,
+ "sequence item %d: expected string or unicode, %T found",
+ i, w_s)
+ # XXX Maybe the extra copy here is okay? It was basically going to
+ # happen anyway, what with being placed into the builder
+ w_u = self.convert_arg_to_w_unicode(space, w_s)
+ unwrapped.append(w_u._utf8)
+ lgt += w_u._length
+ prealloc_size += len(unwrapped[i])
+
+ sb = StringBuilder(prealloc_size)
+ for i in range(size):
+ if value and i != 0:
+ sb.append(value)
+ sb.append(unwrapped[i])
+ return W_UnicodeObject(sb.build(), lgt)
+
+ @unwrap_spec(keepends=bool)
+ def descr_splitlines(self, space, keepends=False):
+ value = self._utf8
+ length = len(value)
+ strs_w = []
+ pos = 0
+ while pos < length:
+ sol = pos
+ lgt = 0
+ while pos < length and not self._islinebreak(rutf8.codepoint_at_pos(value, pos)):
+ pos = rutf8.next_codepoint_pos(value, pos)
+ lgt += 1
+ eol = pos
+ if pos < length:
+ # read CRLF as one line break
+ if (value[pos] == '\r' and pos + 1 < length
+ and value[pos + 1] == '\n'):
+ pos += 2
+ line_end_chars = 2
+ else:
+ pos = rutf8.next_codepoint_pos(value, pos)
+ line_end_chars = 1
+ if keepends:
+ eol = pos
+ lgt += line_end_chars
+ assert eol >= 0
+ assert sol >= 0
+ strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
+ return space.newlist(strs_w)
+
+ def descr_upper(self, space):
+ builder = rutf8.Utf8StringBuilder(len(self._utf8))
+ for ch in rutf8.Utf8StringIterator(self._utf8):
+ ch = unicodedb.toupper(ch)
+ builder.append_code(ch)
+ return self.from_utf8builder(builder)
+
+ @unwrap_spec(width=int)
+ def descr_zfill(self, space, width):
+ selfval = self._utf8
+ if len(selfval) == 0:
+ return W_UnicodeObject('0' * width, width)
+ num_zeros = width - self._len()
+ if num_zeros <= 0:
+ # cannot return self, in case it is a subclass of str
+ return W_UnicodeObject(selfval, self._len())
+ builder = StringBuilder(num_zeros + len(selfval))
+ if len(selfval) > 0 and (selfval[0] == '+' or selfval[0] == '-'):
+ # copy sign to first position
+ builder.append(selfval[0])
+ start = 1
+ else:
+ start = 0
+ builder.append_multiple_char('0', num_zeros)
+ builder.append_slice(selfval, start, len(selfval))
+ return W_UnicodeObject(builder.build(), width)
+
+ @unwrap_spec(maxsplit=int)
+ def descr_split(self, space, w_sep=None, maxsplit=-1):
+ res = []
+ value = self._utf8
+ if space.is_none(w_sep):
+ res = split(value, maxsplit=maxsplit, isutf8=True)
+ return space.newlist_utf8(res, self.is_ascii())
+
+ by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
+ if len(by) == 0:
+ raise oefmt(space.w_ValueError, "empty separator")
+ res = split(value, by, maxsplit, isutf8=True)
+
+ return space.newlist_utf8(res, self.is_ascii())
+
+ @unwrap_spec(maxsplit=int)
+ def descr_rsplit(self, space, w_sep=None, maxsplit=-1):
+ res = []
+ value = self._utf8
+ if space.is_none(w_sep):
+ res = rsplit(value, maxsplit=maxsplit, isutf8=True)
+ return space.newlist_utf8(res, self.is_ascii())
+
+ by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
+ if len(by) == 0:
+ raise oefmt(space.w_ValueError, "empty separator")
+ res = rsplit(value, by, maxsplit, isutf8=True)
+
+ return space.newlist_utf8(res, self.is_ascii())
+
+ def descr_getitem(self, space, w_index):
+ if isinstance(w_index, W_SliceObject):
+ length = self._len()
+ start, stop, step, sl = w_index.indices4(space, length)
+ if sl == 0:
+ return self._empty()
+ elif step == 1:
+ assert start >= 0 and stop >= 0
+ return self._unicode_sliced(space, start, stop)
+ else:
+ return self._getitem_slice_slowpath(space, start, step, sl)
+
+ index = space.getindex_w(w_index, space.w_IndexError, "string index")
+ return self._getitem_result(space, index)
+
+ def _getitem_slice_slowpath(self, space, start, step, sl):
+ # XXX same comment as in _unicode_sliced
+ builder = StringBuilder(step * sl)
+ byte_pos = self._index_to_byte(start)
+ i = 0
+ while True:
+ next_pos = rutf8.next_codepoint_pos(self._utf8, byte_pos)
+ builder.append(self._utf8[byte_pos:next_pos])
+ if i == sl - 1:
+ break
+ i += 1
+ byte_pos = self._index_to_byte(start + i * step)
+ return W_UnicodeObject(builder.build(), sl)
+
+ def descr_getslice(self, space, w_start, w_stop):
+ start, stop = normalize_simple_slice(
+ space, self._len(), w_start, w_stop)
+ if start == stop:
+ return self._empty()
+ else:
+ return self._unicode_sliced(space, start, stop)
+
+ def _unicode_sliced(self, space, start, stop):
+ # XXX maybe some heuristic, like first slice does not create
+ # full index, but second does?
+ assert start >= 0
+ assert stop >= 0
+ byte_start = self._index_to_byte(start)
+ byte_stop = self._index_to_byte(stop)
+ return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
+
+ def descr_capitalize(self, space):
+ value = self._utf8
+ if len(value) == 0:
+ return self._empty()
+
+ builder = rutf8.Utf8StringBuilder(len(self._utf8))
+ it = rutf8.Utf8StringIterator(self._utf8)
+ uchar = it.next()
+ ch = unicodedb.toupper(uchar)
+ builder.append_code(ch)
+ for ch in it:
+ ch = unicodedb.tolower(ch)
+ builder.append_code(ch)
+ return self.from_utf8builder(builder)
+
+ @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
+ def descr_center(self, space, width, w_fillchar):
+ value = self._utf8
+ fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)._utf8
+ if len(fillchar) != 1:
+ raise oefmt(space.w_TypeError,
+ "center() argument 2 must be a single character")
+
+ d = width - self._len()
+ if d > 0:
+ offset = d//2 + (d & width & 1)
+ fillchar = fillchar[0]
+ centered = offset * fillchar + value + (d - offset) * fillchar
+ else:
+ centered = value
+ d = 0
+
+ return W_UnicodeObject(centered, self._len() + d)
+
+ def descr_count(self, space, w_sub, w_start=None, w_end=None):
+ value = self._utf8
+ start_index, end_index = self._unwrap_and_compute_idx_params(
+ space, w_start, w_end)
+ sub = self.convert_arg_to_w_unicode(space, w_sub)._utf8
+ return space.newint(value.count(sub, start_index, end_index))
+
+ def descr_contains(self, space, w_sub):
+ value = self._utf8
+ w_other = self.convert_arg_to_w_unicode(space, w_sub)
+ return space.newbool(value.find(w_other._utf8) >= 0)
+
+ def descr_partition(self, space, w_sub):
+ value = self._utf8
+ sub = self.convert_arg_to_w_unicode(space, w_sub)
+ sublen = sub._len()
+ if sublen == 0:
+ raise oefmt(space.w_ValueError, "empty separator")
+
+ pos = value.find(sub._utf8)
+
+ if pos < 0:
+ return space.newtuple([self, self._empty(), self._empty()])
+ else:
+ lgt = rutf8.check_utf8(value, True, stop=pos)
+ return space.newtuple(
+ [W_UnicodeObject(value[0:pos], lgt), w_sub,
+ W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
+ self._len() - lgt - sublen)])
+
+ def descr_rpartition(self, space, w_sub):
+ value = self._utf8
+ sub = self.convert_arg_to_w_unicode(space, w_sub)
+ sublen = sub._len()
+ if sublen == 0:
+ raise oefmt(space.w_ValueError, "empty separator")
+
+ pos = value.rfind(sub._utf8)
+
+ if pos < 0:
+ return space.newtuple([self._empty(), self._empty(), self])
+ else:
+ lgt = rutf8.check_utf8(value, True, stop=pos)
+ return space.newtuple(
+ [W_UnicodeObject(value[0:pos], lgt), w_sub,
+ W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
+ self._len() - lgt - sublen)])
+
+ @unwrap_spec(count=int)
+ def descr_replace(self, space, w_old, w_new, count=-1):
+ input = self._utf8
+
+ w_sub = self.convert_arg_to_w_unicode(space, w_old)
+ w_by = self.convert_arg_to_w_unicode(space, w_new)
+ # the following two lines are for being bug-to-bug compatible
+ # with CPython: see issue #2448
+ if count >= 0 and len(input) == 0:
+ return self._empty()
+ try:
+ res, replacements = replace_count(input, w_sub._utf8, w_by._utf8,
+ count, isutf8=True)
+ except OverflowError:
+ raise oefmt(space.w_OverflowError, "replace string is too long")
+
+ newlength = self._length + replacements * (w_by._length - w_sub._length)
+ return W_UnicodeObject(res, newlength)
+
+ def descr_mul(self, space, w_times):
+ try:
+ times = space.getindex_w(w_times, space.w_OverflowError)
+ except OperationError as e:
+ if e.match(space, space.w_TypeError):
+ return space.w_NotImplemented
+ raise
+ if times <= 0:
+ return self._empty()
+ if len(self._utf8) == 1:
+ return W_UnicodeObject(self._utf8[0] * times, times)
+ return W_UnicodeObject(self._utf8 * times, times * self._len())
+
+ descr_rmul = descr_mul
+
+ def _get_index_storage(self):
+ return jit.conditional_call_elidable(self._index_storage,
+ W_UnicodeObject._compute_index_storage, self)
+
+ def _compute_index_storage(self):
+ storage = rutf8.create_utf8_index_storage(self._utf8, self._length)
+ self._index_storage = storage
+ return storage
+
+ def _getitem_result(self, space, index):
+ if index < 0:
+ index += self._length
+ if index < 0 or index >= self._length:
+ raise oefmt(space.w_IndexError, "string index out of range")
+ start = self._index_to_byte(index)
+ end = rutf8.next_codepoint_pos(self._utf8, start)
+ return W_UnicodeObject(self._utf8[start:end], 1)
+
+ def is_ascii(self):
+ return self._length == len(self._utf8)
+
+ def _has_surrogates(self):
+ if self.is_ascii():
+ return False
+ return rutf8.has_surrogates(self._utf8)
+
+ def _index_to_byte(self, index):
+ if self.is_ascii():
+ assert index >= 0
+ return index
+ return rutf8.codepoint_position_at_index(
+ self._utf8, self._get_index_storage(), index)
+
+ @always_inline
+ def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True):
+ w_sub = self.convert_arg_to_w_unicode(space, w_sub)
+ start, end = unwrap_start_stop(space, self._length, w_start, w_end)
+ if start == 0:
+ start_index = 0
+ elif start > self._length:
+ return None
+ else:
+ start_index = self._index_to_byte(start)
+
+ if end >= self._length:
+ end = self._length
+ end_index = len(self._utf8)
+ else:
+ end_index = self._index_to_byte(end)
+
+ if forward:
+ res_index = self._utf8.find(w_sub._utf8, start_index, end_index)
+ if res_index < 0:
+ return None
+ skip = rutf8.codepoints_in_utf8(self._utf8, start_index, res_index)
+ res = start + skip
+ assert res >= 0
+ return space.newint(res)
+ else:
+ res_index = self._utf8.rfind(w_sub._utf8, start_index, end_index)
+ if res_index < 0:
+ return None
+ skip = rutf8.codepoints_in_utf8(self._utf8, res_index, end_index)
+ res = end - skip
+ assert res >= 0
+ return space.newint(res)
+
+ def _unwrap_and_compute_idx_params(self, space, w_start, w_end):
+ # unwrap start and stop indices, optimized for the case where
+ # start == 0 and end == self._length. Note that 'start' and
+ # 'end' are measured in codepoints whereas 'start_index' and
+ # 'end_index' are measured in bytes.
+ start, end = unwrap_start_stop(space, self._length, w_start, w_end)
+ start_index = 0
+ end_index = len(self._utf8) + 1
+ if start > 0:
+ if start > self._length:
+ start_index = end_index
+ else:
+ start_index = self._index_to_byte(start)
+ if end < self._length:
+ end_index = self._index_to_byte(end)
+ return (start_index, end_index)
+
+ @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
+ def descr_rjust(self, space, width, w_fillchar):
+ value = self._utf8
+ lgt = self._len()
+ w_fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)
+ if w_fillchar._len() != 1:
+ raise oefmt(space.w_TypeError,
+ "rjust() argument 2 must be a single character")
+ d = width - lgt
+ if d > 0:
+ if len(w_fillchar._utf8) == 1:
+ # speedup
+ value = d * w_fillchar._utf8[0] + value
+ else:
+ value = d * w_fillchar._utf8 + value
+ return W_UnicodeObject(value, width)
+
+ return W_UnicodeObject(value, lgt)
+
+ @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
+ def descr_ljust(self, space, width, w_fillchar):
+ value = self._utf8
+ w_fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)
+ if w_fillchar._len() != 1:
+ raise oefmt(space.w_TypeError,
+ "ljust() argument 2 must be a single character")
+ d = width - self._len()
+ if d > 0:
+ if len(w_fillchar._utf8) == 1:
+ # speedup
+ value = value + d * w_fillchar._utf8[0]
+ else:
+ value = value + d * w_fillchar._utf8
+ return W_UnicodeObject(value, width)
+
+ return W_UnicodeObject(value, self._len())
+
+ def _utf8_sliced(self, start, stop, lgt):
+ assert start >= 0
+ assert stop >= 0
+ #if start == 0 and stop == len(s) and space.is_w(space.type(orig_obj),
+ # space.w_bytes):
+ # return orig_obj
+ return W_UnicodeObject(self._utf8[start:stop], lgt)
+
+ def _strip_none(self, space, left, right):
+ "internal function called by str_xstrip methods"
+ value = self._utf8
+
+ lpos = 0
+ rpos = len(value)
+ lgt = self._len()
+
+ if left:
+ while lpos < rpos and rutf8.isspace(value, lpos):
+ lpos = rutf8.next_codepoint_pos(value, lpos)
+ lgt -= 1
+
+ if right:
+ while rpos > lpos and rutf8.isspace(value,
+ rutf8.prev_codepoint_pos(value, rpos)):
+ rpos = rutf8.prev_codepoint_pos(value, rpos)
+ lgt -= 1
+
+ assert rpos >= lpos # annotator hint, don't remove
+ return self._utf8_sliced(lpos, rpos, lgt)
+
+ def _strip(self, space, w_chars, left, right, name='strip'):
+ "internal function called by str_xstrip methods"
+ value = self._utf8
+ chars = self.convert_arg_to_w_unicode(space, w_chars, strict=name)._utf8
+
+ lpos = 0
+ rpos = len(value)
+ lgt = self._len()
+
+ if left:
+ while lpos < rpos and rutf8.utf8_in_chars(value, lpos, chars):
+ lpos = rutf8.next_codepoint_pos(value, lpos)
+ lgt -= 1
+
+ if right:
+ while rpos > lpos and rutf8.utf8_in_chars(value,
+ rutf8.prev_codepoint_pos(value, rpos), chars):
+ rpos = rutf8.prev_codepoint_pos(value, rpos)
+ lgt -= 1
+
+ assert rpos >= lpos # annotator hint, don't remove
+ return self._utf8_sliced(lpos, rpos, lgt)
+
+ def descr_getnewargs(self, space):
+ return space.newtuple([W_UnicodeObject(self._utf8, self._length)])
+
_starts_ends_unicode = True
@@ -445,68 +1071,57 @@ def _get_encoding_and_errors(space, w_encoding, w_errors):
return encoding, errors
-def encode_object(space, w_object, encoding, errors):
- if encoding is None:
- # Get the encoder functions as a wrapped object.
- # This lookup is cached.
- w_encoder = space.sys.get_w_default_encoder()
- else:
- if errors is None or errors == 'strict':
- if encoding == 'ascii':
- u = space.unicode_w(w_object)
- eh = unicodehelper.encode_error_handler(space)
- return space.newbytes(unicode_encode_ascii(
- u, len(u), None, errorhandler=eh))
- if encoding == 'utf-8':
- u = space.unicode_w(w_object)
- eh = unicodehelper.encode_error_handler(space)
- return space.newbytes(unicode_encode_utf_8(
- u, len(u), None, errorhandler=eh,
- allow_surrogates=True))
- from pypy.module._codecs.interp_codecs import lookup_codec
- w_encoder = space.getitem(lookup_codec(space, encoding), space.newint(0))
- if errors is None:
- w_errors = space.newtext('strict')
- else:
- w_errors = space.newtext(errors)
- w_restuple = space.call_function(w_encoder, w_object, w_errors)
- w_retval = space.getitem(w_restuple, space.newint(0))
- if not space.isinstance_w(w_retval, space.w_bytes):
- raise oefmt(space.w_TypeError,
- "encoder did not return an string object (type '%T')",
- w_retval)
- return w_retval
+def encode_object(space, w_obj, encoding, errors):
+ from pypy.module._codecs.interp_codecs import encode
+ if errors is None or errors == 'strict':
+ # fast path
+ if ((encoding is None and space.sys.defaultencoding == 'ascii') or
+ encoding == 'ascii'):
+ s = space.utf8_w(w_obj)
+ try:
+ rutf8.check_ascii(s)
+ except rutf8.CheckError as a:
+ if space.isinstance_w(w_obj, space.w_unicode):
+ eh = unicodehelper.encode_error_handler(space)
+ else:
+ # must be a bytes-like object. In order to encode it,
+ # first "decode" to unicode. Since we cannot, raise a
+ # UnicodeDecodeError, not a UnicodeEncodeError
+ eh = unicodehelper.decode_error_handler(space)
+ eh(None, "ascii", "ordinal not in range(128)", s,
+ a.pos, a.pos + 1)
+ assert False, "always raises"
+ return space.newbytes(s)
+ if ((encoding is None and space.sys.defaultencoding == 'utf8') or
+ encoding == 'utf-8' or encoding == 'utf8' or encoding == 'UTF-8'):
+ utf8 = space.utf8_w(w_obj)
+ if rutf8.has_surrogates(utf8):
+ utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+ return space.newbytes(utf8)
+ return encode(space, w_obj, encoding, errors)
def decode_object(space, w_obj, encoding, errors):
- if encoding is None:
- encoding = getdefaultencoding(space)
+ from pypy.module._codecs.interp_codecs import lookup_codec, decode
if errors is None or errors == 'strict':
+ # fast paths
+ if encoding is None:
+ encoding = getdefaultencoding(space)
if encoding == 'ascii':
- # XXX error handling
s = space.charbuf_w(w_obj)
- try:
- u = fast_str_decode_ascii(s)
- except ValueError:
- eh = unicodehelper.decode_error_handler(space)
- u = str_decode_ascii( # try again, to get the error right
- s, len(s), None, final=True, errorhandler=eh)[0]
- return space.newunicode(u)
- if encoding == 'utf-8':
- s = space.charbuf_w(w_obj)
- eh = unicodehelper.decode_error_handler(space)
- return space.newunicode(str_decode_utf_8(
- s, len(s), None, final=True, errorhandler=eh,
- allow_surrogates=True)[0])
- w_codecs = space.getbuiltinmodule("_codecs")
- w_decode = space.getattr(w_codecs, space.newtext("decode"))
- if errors is None:
- w_retval = space.call_function(w_decode, w_obj, space.newtext(encoding))
- else:
- w_retval = space.call_function(w_decode, w_obj, space.newtext(encoding),
- space.newtext(errors))
- return w_retval
-
+ unicodehelper.check_ascii_or_raise(space, s)
+ return space.newutf8(s, len(s))
+ if encoding == 'utf-8' or encoding == 'utf8':
+ if (space.isinstance_w(w_obj, space.w_unicode) or
+ space.isinstance_w(w_obj, space.w_bytes)):
+ s = space.utf8_w(w_obj)
+ else:
+ s = space.charbuf_w(w_obj)
+ lgt = unicodehelper.check_utf8_or_raise(space, s)
+ return space.newutf8(s, lgt)
+ if encoding is None:
+ encoding = space.sys.defaultencoding
+ return decode(space, w_obj, encoding, errors)
def unicode_from_encoded_object(space, w_obj, encoding, errors):
# explicitly block bytearray on 2.7
@@ -534,7 +1149,7 @@ def unicode_from_object(space, w_obj):
# test_unicode_conversion_with__str__
if w_unicode_method is None:
if space.isinstance_w(w_obj, space.w_unicode):
- return space.newunicode(space.unicode_w(w_obj))
+ return space.convert_arg_to_w_unicode(w_obj)
w_unicode_method = space.lookup(w_obj, "__str__")
if w_unicode_method is not None:
w_res = space.get_and_call_function(w_unicode_method, w_obj)
@@ -551,11 +1166,8 @@ def unicode_from_string(space, w_bytes):
if encoding != 'ascii':
return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
s = space.bytes_w(w_bytes)
- try:
- return W_UnicodeObject(s.decode("ascii"))
- except UnicodeDecodeError:
- # raising UnicodeDecodeError is messy, "please crash for me"
- return unicode_from_encoded_object(space, w_bytes, "ascii", "strict")
+ unicodehelper.check_ascii_or_raise(space, s)
+ return W_UnicodeObject(s, len(s))
class UnicodeDocstrings:
@@ -1102,38 +1714,42 @@ def _create_list_from_unicode(value):
return [s for s in value]
-W_UnicodeObject.EMPTY = W_UnicodeObject(u'')
+W_UnicodeObject.EMPTY = W_UnicodeObject('', 0)
# Helper for converting int/long
def unicode_to_decimal_w(space, w_unistr):
if not isinstance(w_unistr, W_UnicodeObject):
raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
- unistr = w_unistr._value
- result = ['\0'] * len(unistr)
+ unistr = w_unistr._utf8
+ result = ['\0'] * w_unistr._length
digits = ['0', '1', '2', '3', '4',
'5', '6', '7', '8', '9']
- for i in xrange(len(unistr)):
- uchr = ord(unistr[i])
- if unicodedb.isspace(uchr):
- result[i] = ' '
+ res_pos = 0
+ iter = rutf8.Utf8StringIterator(unistr)
+ for uchr in iter:
+ if W_UnicodeObject._isspace(uchr):
+ result[res_pos] = ' '
+ res_pos += 1
continue
try:
- result[i] = digits[unicodedb.decimal(uchr)]
+ result[res_pos] = digits[unicodedb.decimal(uchr)]
except KeyError:
if 0 < uchr < 256:
- result[i] = chr(uchr)
+ result[res_pos] = chr(uchr)
else:
w_encoding = space.newtext('decimal')
- w_start = space.newint(i)
- w_end = space.newint(i+1)
+ pos = iter.get_pos()
+ w_start = space.newint(pos)
+ w_end = space.newint(pos+1)
w_reason = space.newtext('invalid decimal Unicode string')
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([w_encoding, w_unistr,
w_start, w_end,
w_reason]))
+ res_pos += 1
return ''.join(result)
-_repr_function, _ = make_unicode_escape_function(
- pass_printable=False, unicode_output=False, quotes=True, prefix='u')
+_repr_function = rutf8.make_utf8_escape_function(
+ pass_printable=False, quotes=True, prefix='u')