author | 2019-03-12 18:18:02 +0000
---|---
committer | 2019-03-12 18:18:02 +0000
commit | 6eac96a58627e5638969e92335ac307c533aa644 (patch)
tree | 49b3e1101750ce1ebcebb479fff383d62b86a805 /pypy/objspace/std/unicodeobject.py
parent | hg merge default (diff)
parent | reduce code duplication (diff)
hg merge default
Diffstat (limited to 'pypy/objspace/std/unicodeobject.py')
-rw-r--r-- | pypy/objspace/std/unicodeobject.py | 1038
1 file changed, 827 insertions, 211 deletions
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py index 5cea4b4802..efd72611c0 100644 --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -1,14 +1,15 @@ """The builtin unicode implementation""" from rpython.rlib.objectmodel import ( - compute_hash, compute_unique_id, import_from_mixin, - enforceargs) + compute_hash, compute_unique_id, import_from_mixin, always_inline, + enforceargs, newlist_hint, specialize, we_are_translated) from rpython.rlib.buffer import StringBuffer from rpython.rlib.mutbuffer import MutableStringBuffer -from rpython.rlib.rstring import StringBuilder, UnicodeBuilder -from rpython.rlib.runicode import ( - make_unicode_escape_function, str_decode_ascii, str_decode_utf_8, - unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii) +from rpython.rlib.rarithmetic import ovfcheck +from rpython.rlib.rstring import ( + StringBuilder, split, rsplit, UnicodeBuilder, replace_count, startswith, + endswith) +from rpython.rlib import rutf8, jit from pypy.interpreter import unicodehelper from pypy.interpreter.baseobjspace import W_Root @@ -19,6 +20,8 @@ from pypy.module.unicodedata.interp_ucd import unicodedb from pypy.objspace.std import newformat from pypy.objspace.std.basestringtype import basestring_typedef from pypy.objspace.std.formatting import mod_format +from pypy.objspace.std.sliceobject import (W_SliceObject, + unwrap_start_stop, normalize_simple_slice) from pypy.objspace.std.stringmethods import StringMethods from pypy.objspace.std.util import IDTAG_SPECIAL, IDTAG_SHIFT @@ -29,25 +32,33 @@ __all__ = ['W_UnicodeObject', 'wrapunicode', 'plain_str2unicode', class W_UnicodeObject(W_Root): import_from_mixin(StringMethods) - _immutable_fields_ = ['_value'] + _immutable_fields_ = ['_utf8'] + + @enforceargs(utf8str=str) + def __init__(self, utf8str, length): + assert isinstance(utf8str, str) + assert length >= 0 + self._utf8 = utf8str + self._length = length + self._index_storage = rutf8.null_storage() + if not we_are_translated(): + try: + # best effort, too expensive to handle surrogates + ulength = rutf8.codepoints_in_utf(utf8str) + except: + ulength = length + assert ulength == length - @enforceargs(uni=unicode) - def __init__(self, unistr): - assert isinstance(unistr, unicode) - self._value = unistr - def __repr__(self): - """representation for debugging purposes""" - return "%s(%r)" % (self.__class__.__name__, self._value) - def unwrap(self, space): - # for testing - return self._value + @staticmethod + def from_utf8builder(builder): + return W_UnicodeObject( + builder.build(), builder.getlength()) - def create_if_subclassed(self): - if type(self) is W_UnicodeObject: - return self - return W_UnicodeObject(self._value) + def __repr__(self): + """representation for debugging purposes""" + return "%s(%r)" % (self.__class__.__name__, self._utf8) def is_w(self, space, w_other): if not isinstance(w_other, W_UnicodeObject): @@ -56,9 +67,9 @@ class W_UnicodeObject(W_Root): return True if self.user_overridden_class or w_other.user_overridden_class: return False - s1 = space.unicode_w(self) - s2 = space.unicode_w(w_other) - if len(s2) > 1: + s1 = space.utf8_w(self) + s2 = space.utf8_w(w_other) + if len(s2) > 2: return s1 is s2 else: # strings of len <= 1 are unique-ified return s1 == s2 @@ -66,61 +77,62 @@ class W_UnicodeObject(W_Root): def immutable_unique_id(self, space): if self.user_overridden_class: return None - s = space.unicode_w(self) - if len(s) > 1: + s = space.utf8_w(self) + if len(s) > 2: uid = 
compute_unique_id(s) else: # strings of len <= 1 are unique-ified if len(s) == 1: base = ~ord(s[0]) # negative base values + elif len(s) == 2: + base = ~((ord(s[1]) << 8) | ord(s[0])) else: base = 257 # empty unicode string: base value 257 uid = (base << IDTAG_SHIFT) | IDTAG_SPECIAL return space.newint(uid) def str_w(self, space): - return space.text_w(space.str(self)) + return space.text_w(encode_object(space, self, 'ascii', 'strict')) - def unicode_w(self, space): - return self._value + def utf8_w(self, space): + return self._utf8 def readbuf_w(self, space): - from rpython.rlib.rstruct.unichar import pack_unichar, UNICODE_SIZE - buf = MutableStringBuffer(len(self._value) * UNICODE_SIZE) + # XXX for now + from rpython.rlib.rstruct.unichar import pack_codepoint, UNICODE_SIZE + builder = MutableStringBuffer(self._len() * UNICODE_SIZE) pos = 0 - for unich in self._value: - pack_unichar(unich, buf, pos) + i = 0 + while i < len(self._utf8): + unich = rutf8.codepoint_at_pos(self._utf8, i) + pack_codepoint(unich, builder, pos) pos += UNICODE_SIZE - return StringBuffer(buf.finish()) + i = rutf8.next_codepoint_pos(self._utf8, i) + return StringBuffer(builder.finish()) def writebuf_w(self, space): raise oefmt(space.w_TypeError, "cannot use unicode as modifiable buffer") - charbuf_w = str_w + def charbuf_w(self, space): + # Returns ascii-encoded str + return space.text_w(encode_object(space, self, 'ascii', 'strict')) - def listview_unicode(self): - return _create_list_from_unicode(self._value) + def listview_utf8(self): + assert self.is_ascii() + return _create_list_from_unicode(self._utf8) def ord(self, space): - if len(self._value) != 1: + if self._len() != 1: raise oefmt(space.w_TypeError, "ord() expected a character, but string of length %d " - "found", len(self._value)) - return space.newint(ord(self._value[0])) - - def _new(self, value): - return W_UnicodeObject(value) - - def _new_from_list(self, value): - return W_UnicodeObject(u''.join(value)) + "found", self._len()) + return space.newint(rutf8.codepoint_at_pos(self._utf8, 0)) def _empty(self): return W_UnicodeObject.EMPTY def _len(self): - return len(self._value) - - _val = unicode_w + return self._length @staticmethod def _use_rstr_ops(space, w_other): @@ -129,67 +141,64 @@ class W_UnicodeObject(W_Root): return True @staticmethod - def _op_val(space, w_other, strict=None): - if isinstance(w_other, W_UnicodeObject): - return w_other._value + def convert_arg_to_w_unicode(space, w_other, strict=None): + if space.is_w(space.type(w_other), space.w_unicode): + # XXX why do we need this for translation??? 
+ assert isinstance(w_other, W_UnicodeObject) + return w_other if space.isinstance_w(w_other, space.w_bytes): - return unicode_from_string(space, w_other)._value + return unicode_from_string(space, w_other) if strict: raise oefmt(space.w_TypeError, "%s arg must be None, unicode or str", strict) - return unicode_from_encoded_object( - space, w_other, None, "strict")._value + return unicode_from_encoded_object(space, w_other, 'utf8', "strict") + + def convert_to_w_unicode(self, space): + return self + @specialize.argtype(1) def _chr(self, char): assert len(char) == 1 - return unicode(char)[0] + return unichr(ord(char[0])) + + def _multi_chr(self, unichar): + return unichar _builder = UnicodeBuilder def _isupper(self, ch): - return unicodedb.isupper(ord(ch)) + return unicodedb.isupper(ch) def _islower(self, ch): - return unicodedb.islower(ord(ch)) + return unicodedb.islower(ch) def _isnumeric(self, ch): - return unicodedb.isnumeric(ord(ch)) + return unicodedb.isnumeric(ch) def _istitle(self, ch): - return unicodedb.isupper(ord(ch)) or unicodedb.istitle(ord(ch)) + return unicodedb.isupper(ch) or unicodedb.istitle(ch) - def _isspace(self, ch): - return unicodedb.isspace(ord(ch)) + @staticmethod + def _isspace(ch): + return unicodedb.isspace(ch) def _isalpha(self, ch): - return unicodedb.isalpha(ord(ch)) + return unicodedb.isalpha(ch) def _isalnum(self, ch): - return unicodedb.isalnum(ord(ch)) + return unicodedb.isalnum(ch) def _isdigit(self, ch): - return unicodedb.isdigit(ord(ch)) + return unicodedb.isdigit(ch) def _isdecimal(self, ch): - return unicodedb.isdecimal(ord(ch)) + return unicodedb.isdecimal(ch) def _iscased(self, ch): - return unicodedb.iscased(ord(ch)) + return unicodedb.iscased(ch) def _islinebreak(self, ch): - return unicodedb.islinebreak(ord(ch)) - - def _upper(self, ch): - return unichr(unicodedb.toupper(ord(ch))) - - def _lower(self, ch): - return unichr(unicodedb.tolower(ord(ch))) - - def _title(self, ch): - return unichr(unicodedb.totitle(ord(ch))) - - def _newlist_unwrapped(self, space, lst): - return space.newlist_unicode(lst) + return unicodedb.islinebreak(ch) @staticmethod def descr_new(space, w_unicodetype, w_string=None, w_encoding=None, @@ -212,26 +221,35 @@ class W_UnicodeObject(W_Root): assert isinstance(w_value, W_UnicodeObject) w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype) - W_UnicodeObject.__init__(w_newobj, w_value._value) + W_UnicodeObject.__init__(w_newobj, w_value._utf8, w_value._length) + if w_value._index_storage: + # copy the storage if it's there + w_newobj._index_storage = w_value._index_storage return w_newobj def descr_repr(self, space): - chars = self._value - size = len(chars) - s = _repr_function(chars, size, "strict") - return space.newtext(s) + return space.newtext(_repr_function(self._utf8)) def descr_str(self, space): - return encode_object(space, self, None, None) + return encode_object(space, self, 'ascii', 'strict') - def descr_hash(self, space): - x = compute_hash(self._value) + def hash_w(self): + # shortcut for UnicodeDictStrategy + x = compute_hash(self._utf8) x -= (x == -1) # convert -1 to -2 without creating a bridge - return space.newint(x) + return x + + def descr_hash(self, space): + return space.newint(self.hash_w()) + + def eq_w(self, w_other): + # shortcut for UnicodeDictStrategy + assert isinstance(w_other, W_UnicodeObject) + return self._utf8 == w_other._utf8 def descr_eq(self, space, w_other): try: - res = self._val(space) == self._op_val(space, w_other) + res = self._utf8 == 
self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -247,7 +265,7 @@ class W_UnicodeObject(W_Root): def descr_ne(self, space, w_other): try: - res = self._val(space) != self._op_val(space, w_other) + res = self._utf8 != self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -263,7 +281,7 @@ class W_UnicodeObject(W_Root): def descr_lt(self, space, w_other): try: - res = self._val(space) < self._op_val(space, w_other) + res = self._utf8 < self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -272,7 +290,7 @@ class W_UnicodeObject(W_Root): def descr_le(self, space, w_other): try: - res = self._val(space) <= self._op_val(space, w_other) + res = self._utf8 <= self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -281,7 +299,7 @@ class W_UnicodeObject(W_Root): def descr_gt(self, space, w_other): try: - res = self._val(space) > self._op_val(space, w_other) + res = self._utf8 > self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -290,7 +308,7 @@ class W_UnicodeObject(W_Root): def descr_ge(self, space, w_other): try: - res = self._val(space) >= self._op_val(space, w_other) + res = self._utf8 >= self.convert_arg_to_w_unicode(space, w_other)._utf8 except OperationError as e: if e.match(space, space.w_TypeError): return space.w_NotImplemented @@ -303,11 +321,11 @@ class W_UnicodeObject(W_Root): def descr__format__(self, space, w_format_spec): if not space.isinstance_w(w_format_spec, space.w_unicode): w_format_spec = space.call_function(space.w_unicode, w_format_spec) - spec = space.unicode_w(w_format_spec) + spec = space.utf8_w(w_format_spec) formatter = newformat.unicode_formatter(space, spec) self2 = unicode_from_object(space, self) assert isinstance(self2, W_UnicodeObject) - return formatter.format_string(self2._value) + return formatter.format_string(self2) def descr_mod(self, space, w_values): return mod_format(space, self, w_values, do_unicode=True) @@ -315,71 +333,169 @@ class W_UnicodeObject(W_Root): def descr_rmod(self, space, w_values): return mod_format(space, w_values, self, do_unicode=True) + def descr_swapcase(self, space): + input = self._utf8 + builder = rutf8.Utf8StringBuilder(len(input)) + for ch in rutf8.Utf8StringIterator(input): + if unicodedb.isupper(ch): + ch = unicodedb.tolower(ch) + elif unicodedb.islower(ch): + ch = unicodedb.toupper(ch) + builder.append_code(ch) + return self.from_utf8builder(builder) + + def descr_title(self, space): + if len(self._utf8) == 0: + return self + return self.title_unicode(self._utf8) + + @jit.elidable + def title_unicode(self, value): + input = self._utf8 + builder = rutf8.Utf8StringBuilder(len(input)) + previous_is_cased = False + for ch0 in rutf8.Utf8StringIterator(input): + if not previous_is_cased: + ch1 = unicodedb.totitle(ch0) + else: + ch1 = unicodedb.tolower(ch0) + builder.append_code(ch1) + previous_is_cased = unicodedb.iscased(ch0) + return self.from_utf8builder(builder) + def descr_translate(self, space, w_table): - selfvalue = self._value - w_sys = space.getbuiltinmodule('sys') - maxunicode = space.int_w(space.getattr(w_sys, - 
space.newtext("maxunicode"))) - result = [] - for unichar in selfvalue: + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + for codepoint in rutf8.Utf8StringIterator(self._utf8): try: - w_newval = space.getitem(w_table, space.newint(ord(unichar))) + w_newval = space.getitem(w_table, space.newint(codepoint)) except OperationError as e: - if e.match(space, space.w_LookupError): - result.append(unichar) - else: + if not e.match(space, space.w_LookupError): raise else: if space.is_w(w_newval, space.w_None): continue elif space.isinstance_w(w_newval, space.w_int): - newval = space.int_w(w_newval) - if newval < 0 or newval > maxunicode: - raise oefmt(space.w_TypeError, - "character mapping must be in range(%s)", - hex(maxunicode + 1)) - result.append(unichr(newval)) - elif space.isinstance_w(w_newval, space.w_unicode): - result.append(space.unicode_w(w_newval)) + codepoint = space.int_w(w_newval) + elif isinstance(w_newval, W_UnicodeObject): + builder.append_utf8(w_newval._utf8, w_newval._length) + continue else: raise oefmt(space.w_TypeError, "character mapping must return integer, None " "or unicode") - return W_UnicodeObject(u''.join(result)) + try: + builder.append_code(codepoint) + except ValueError: + raise oefmt(space.w_TypeError, + "character mapping must be in range(0x110000)") + return self.from_utf8builder(builder) + + def descr_find(self, space, w_sub, w_start=None, w_end=None): + w_result = self._unwrap_and_search(space, w_sub, w_start, w_end) + if w_result is None: + w_result = space.newint(-1) + return w_result + + def descr_rfind(self, space, w_sub, w_start=None, w_end=None): + w_result = self._unwrap_and_search(space, w_sub, w_start, w_end, + forward=False) + if w_result is None: + w_result = space.newint(-1) + return w_result + + def descr_index(self, space, w_sub, w_start=None, w_end=None): + w_result = self._unwrap_and_search(space, w_sub, w_start, w_end) + if w_result is None: + raise oefmt(space.w_ValueError, + "substring not found in string.index") + return w_result + + def descr_rindex(self, space, w_sub, w_start=None, w_end=None): + w_result = self._unwrap_and_search(space, w_sub, w_start, w_end, + forward=False) + if w_result is None: + raise oefmt(space.w_ValueError, + "substring not found in string.rindex") + return w_result + + @specialize.arg(2) + def _is_generic(self, space, func_name): + func = getattr(self, func_name) + if self._length == 0: + return space.w_False + if self._length == 1: + return space.newbool(func(rutf8.codepoint_at_pos(self._utf8, 0))) + else: + return self._is_generic_loop(space, self._utf8, func_name) + + @specialize.arg(3) + def _is_generic_loop(self, space, v, func_name): + func = getattr(self, func_name) + val = self._utf8 + for uchar in rutf8.Utf8StringIterator(val): + if not func(uchar): + return space.w_False + return space.w_True def descr_encode(self, space, w_encoding=None, w_errors=None): encoding, errors = _get_encoding_and_errors(space, w_encoding, w_errors) return encode_object(space, self, encoding, errors) + @unwrap_spec(tabsize=int) + def descr_expandtabs(self, space, tabsize=8): + value = self._utf8 + if not value: + return self._empty() + + splitted = value.split('\t') + + try: + if tabsize > 0: + ovfcheck(len(splitted) * tabsize) + except OverflowError: + raise oefmt(space.w_OverflowError, "new string is too long") + expanded = oldtoken = splitted.pop(0) + newlen = self._len() - len(splitted) + + for token in splitted: + dist = self._tabindent(oldtoken, tabsize) + expanded += ' ' * dist + token + newlen += dist + 
oldtoken = token + + return W_UnicodeObject(expanded, newlen) + _StringMethods_descr_join = descr_join def descr_join(self, space, w_list): - l = space.listview_unicode(w_list) - if l is not None: + l = space.listview_utf8(w_list) + if l is not None and self.is_ascii(): if len(l) == 1: - return space.newunicode(l[0]) - return space.newunicode(self._val(space).join(l)) + return space.newutf8(l[0], len(l[0])) + s = self._utf8.join(l) + return space.newutf8(s, len(s)) return self._StringMethods_descr_join(space, w_list) def _join_return_one(self, space, w_obj): return space.is_w(space.type(w_obj), space.w_unicode) - def _join_check_item(self, space, w_obj): - if (space.isinstance_w(w_obj, space.w_bytes) or - space.isinstance_w(w_obj, space.w_unicode)): - return 0 - return 1 - def descr_formatter_parser(self, space): from pypy.objspace.std.newformat import unicode_template_formatter - tformat = unicode_template_formatter(space, space.unicode_w(self)) + tformat = unicode_template_formatter(space, space.utf8_w(self)) return tformat.formatter_parser() def descr_formatter_field_name_split(self, space): from pypy.objspace.std.newformat import unicode_template_formatter - tformat = unicode_template_formatter(space, space.unicode_w(self)) + tformat = unicode_template_formatter(space, space.utf8_w(self)) return tformat.formatter_field_name_split() + def descr_lower(self, space): + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + for ch in rutf8.Utf8StringIterator(self._utf8): + lower = unicodedb.tolower(ch) + builder.append_code(lower) + return self.from_utf8builder(builder) + def descr_isdecimal(self, space): return self._is_generic(space, '_isdecimal') @@ -388,24 +504,534 @@ class W_UnicodeObject(W_Root): def descr_islower(self, space): cased = False - for uchar in self._value: - if (unicodedb.isupper(ord(uchar)) or - unicodedb.istitle(ord(uchar))): + for uchar in rutf8.Utf8StringIterator(self._utf8): + if (unicodedb.isupper(uchar) or + unicodedb.istitle(uchar)): return space.w_False - if not cased and unicodedb.islower(ord(uchar)): + if not cased and unicodedb.islower(uchar): + cased = True + return space.newbool(cased) + + def descr_istitle(self, space): + cased = False + previous_is_cased = False + for uchar in rutf8.Utf8StringIterator(self._utf8): + if unicodedb.isupper(uchar) or unicodedb.istitle(uchar): + if previous_is_cased: + return space.w_False + previous_is_cased = True + cased = True + elif unicodedb.islower(uchar): + if not previous_is_cased: + return space.w_False cased = True + else: + previous_is_cased = False return space.newbool(cased) def descr_isupper(self, space): cased = False - for uchar in self._value: - if (unicodedb.islower(ord(uchar)) or - unicodedb.istitle(ord(uchar))): + for uchar in rutf8.Utf8StringIterator(self._utf8): + if (unicodedb.islower(uchar) or + unicodedb.istitle(uchar)): return space.w_False - if not cased and unicodedb.isupper(ord(uchar)): + if not cased and unicodedb.isupper(uchar): cased = True return space.newbool(cased) + def descr_startswith(self, space, w_prefix, w_start=None, w_end=None): + start, end = self._unwrap_and_compute_idx_params(space, w_start, w_end) + value = self._utf8 + if space.isinstance_w(w_prefix, space.w_tuple): + return self._startswith_tuple(space, value, w_prefix, start, end) + return space.newbool(self._startswith(space, value, w_prefix, start, + end)) + + def _startswith(self, space, value, w_prefix, start, end): + prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8 + if len(prefix) == 0: + return True + return 
startswith(value, prefix, start, end) + + def descr_endswith(self, space, w_suffix, w_start=None, w_end=None): + start, end = self._unwrap_and_compute_idx_params(space, w_start, w_end) + value = self._utf8 + if space.isinstance_w(w_suffix, space.w_tuple): + return self._endswith_tuple(space, value, w_suffix, start, end) + return space.newbool(self._endswith(space, value, w_suffix, start, + end)) + + def _endswith(self, space, value, w_prefix, start, end): + prefix = self.convert_arg_to_w_unicode(space, w_prefix)._utf8 + if len(prefix) == 0: + return True + return endswith(value, prefix, start, end) + + def descr_add(self, space, w_other): + try: + w_other = self.convert_arg_to_w_unicode(space, w_other) + except OperationError as e: + if e.match(space, space.w_TypeError): + return space.w_NotImplemented + raise + return W_UnicodeObject(self._utf8 + w_other._utf8, + self._len() + w_other._len()) + + @jit.look_inside_iff(lambda self, space, list_w, size: + jit.loop_unrolling_heuristic(list_w, size)) + def _str_join_many_items(self, space, list_w, size): + value = self._utf8 + lgt = self._len() * (size - 1) + + prealloc_size = len(value) * (size - 1) + unwrapped = newlist_hint(size) + for i in range(size): + w_s = list_w[i] + if not (space.isinstance_w(w_s, space.w_bytes) or + space.isinstance_w(w_s, space.w_unicode)): + raise oefmt(space.w_TypeError, + "sequence item %d: expected string or unicode, %T found", + i, w_s) + # XXX Maybe the extra copy here is okay? It was basically going to + # happen anyway, what with being placed into the builder + w_u = self.convert_arg_to_w_unicode(space, w_s) + unwrapped.append(w_u._utf8) + lgt += w_u._length + prealloc_size += len(unwrapped[i]) + + sb = StringBuilder(prealloc_size) + for i in range(size): + if value and i != 0: + sb.append(value) + sb.append(unwrapped[i]) + return W_UnicodeObject(sb.build(), lgt) + + @unwrap_spec(keepends=bool) + def descr_splitlines(self, space, keepends=False): + value = self._utf8 + length = len(value) + strs_w = [] + pos = 0 + while pos < length: + sol = pos + lgt = 0 + while pos < length and not self._islinebreak(rutf8.codepoint_at_pos(value, pos)): + pos = rutf8.next_codepoint_pos(value, pos) + lgt += 1 + eol = pos + if pos < length: + # read CRLF as one line break + if (value[pos] == '\r' and pos + 1 < length + and value[pos + 1] == '\n'): + pos += 2 + line_end_chars = 2 + else: + pos = rutf8.next_codepoint_pos(value, pos) + line_end_chars = 1 + if keepends: + eol = pos + lgt += line_end_chars + assert eol >= 0 + assert sol >= 0 + strs_w.append(W_UnicodeObject(value[sol:eol], lgt)) + return space.newlist(strs_w) + + def descr_upper(self, space): + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + for ch in rutf8.Utf8StringIterator(self._utf8): + ch = unicodedb.toupper(ch) + builder.append_code(ch) + return self.from_utf8builder(builder) + + @unwrap_spec(width=int) + def descr_zfill(self, space, width): + selfval = self._utf8 + if len(selfval) == 0: + return W_UnicodeObject('0' * width, width) + num_zeros = width - self._len() + if num_zeros <= 0: + # cannot return self, in case it is a subclass of str + return W_UnicodeObject(selfval, self._len()) + builder = StringBuilder(num_zeros + len(selfval)) + if len(selfval) > 0 and (selfval[0] == '+' or selfval[0] == '-'): + # copy sign to first position + builder.append(selfval[0]) + start = 1 + else: + start = 0 + builder.append_multiple_char('0', num_zeros) + builder.append_slice(selfval, start, len(selfval)) + return W_UnicodeObject(builder.build(), width) + + 
@unwrap_spec(maxsplit=int) + def descr_split(self, space, w_sep=None, maxsplit=-1): + res = [] + value = self._utf8 + if space.is_none(w_sep): + res = split(value, maxsplit=maxsplit, isutf8=True) + return space.newlist_utf8(res, self.is_ascii()) + + by = self.convert_arg_to_w_unicode(space, w_sep)._utf8 + if len(by) == 0: + raise oefmt(space.w_ValueError, "empty separator") + res = split(value, by, maxsplit, isutf8=True) + + return space.newlist_utf8(res, self.is_ascii()) + + @unwrap_spec(maxsplit=int) + def descr_rsplit(self, space, w_sep=None, maxsplit=-1): + res = [] + value = self._utf8 + if space.is_none(w_sep): + res = rsplit(value, maxsplit=maxsplit, isutf8=True) + return space.newlist_utf8(res, self.is_ascii()) + + by = self.convert_arg_to_w_unicode(space, w_sep)._utf8 + if len(by) == 0: + raise oefmt(space.w_ValueError, "empty separator") + res = rsplit(value, by, maxsplit, isutf8=True) + + return space.newlist_utf8(res, self.is_ascii()) + + def descr_getitem(self, space, w_index): + if isinstance(w_index, W_SliceObject): + length = self._len() + start, stop, step, sl = w_index.indices4(space, length) + if sl == 0: + return self._empty() + elif step == 1: + assert start >= 0 and stop >= 0 + return self._unicode_sliced(space, start, stop) + else: + return self._getitem_slice_slowpath(space, start, step, sl) + + index = space.getindex_w(w_index, space.w_IndexError, "string index") + return self._getitem_result(space, index) + + def _getitem_slice_slowpath(self, space, start, step, sl): + # XXX same comment as in _unicode_sliced + builder = StringBuilder(step * sl) + byte_pos = self._index_to_byte(start) + i = 0 + while True: + next_pos = rutf8.next_codepoint_pos(self._utf8, byte_pos) + builder.append(self._utf8[byte_pos:next_pos]) + if i == sl - 1: + break + i += 1 + byte_pos = self._index_to_byte(start + i * step) + return W_UnicodeObject(builder.build(), sl) + + def descr_getslice(self, space, w_start, w_stop): + start, stop = normalize_simple_slice( + space, self._len(), w_start, w_stop) + if start == stop: + return self._empty() + else: + return self._unicode_sliced(space, start, stop) + + def _unicode_sliced(self, space, start, stop): + # XXX maybe some heuristic, like first slice does not create + # full index, but second does? 
+ assert start >= 0 + assert stop >= 0 + byte_start = self._index_to_byte(start) + byte_stop = self._index_to_byte(stop) + return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start) + + def descr_capitalize(self, space): + value = self._utf8 + if len(value) == 0: + return self._empty() + + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + it = rutf8.Utf8StringIterator(self._utf8) + uchar = it.next() + ch = unicodedb.toupper(uchar) + builder.append_code(ch) + for ch in it: + ch = unicodedb.tolower(ch) + builder.append_code(ch) + return self.from_utf8builder(builder) + + @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) + def descr_center(self, space, width, w_fillchar): + value = self._utf8 + fillchar = self.convert_arg_to_w_unicode(space, w_fillchar)._utf8 + if len(fillchar) != 1: + raise oefmt(space.w_TypeError, + "center() argument 2 must be a single character") + + d = width - self._len() + if d > 0: + offset = d//2 + (d & width & 1) + fillchar = fillchar[0] + centered = offset * fillchar + value + (d - offset) * fillchar + else: + centered = value + d = 0 + + return W_UnicodeObject(centered, self._len() + d) + + def descr_count(self, space, w_sub, w_start=None, w_end=None): + value = self._utf8 + start_index, end_index = self._unwrap_and_compute_idx_params( + space, w_start, w_end) + sub = self.convert_arg_to_w_unicode(space, w_sub)._utf8 + return space.newint(value.count(sub, start_index, end_index)) + + def descr_contains(self, space, w_sub): + value = self._utf8 + w_other = self.convert_arg_to_w_unicode(space, w_sub) + return space.newbool(value.find(w_other._utf8) >= 0) + + def descr_partition(self, space, w_sub): + value = self._utf8 + sub = self.convert_arg_to_w_unicode(space, w_sub) + sublen = sub._len() + if sublen == 0: + raise oefmt(space.w_ValueError, "empty separator") + + pos = value.find(sub._utf8) + + if pos < 0: + return space.newtuple([self, self._empty(), self._empty()]) + else: + lgt = rutf8.check_utf8(value, True, stop=pos) + return space.newtuple( + [W_UnicodeObject(value[0:pos], lgt), w_sub, + W_UnicodeObject(value[pos + len(sub._utf8):len(value)], + self._len() - lgt - sublen)]) + + def descr_rpartition(self, space, w_sub): + value = self._utf8 + sub = self.convert_arg_to_w_unicode(space, w_sub) + sublen = sub._len() + if sublen == 0: + raise oefmt(space.w_ValueError, "empty separator") + + pos = value.rfind(sub._utf8) + + if pos < 0: + return space.newtuple([self._empty(), self._empty(), self]) + else: + lgt = rutf8.check_utf8(value, True, stop=pos) + return space.newtuple( + [W_UnicodeObject(value[0:pos], lgt), w_sub, + W_UnicodeObject(value[pos + len(sub._utf8):len(value)], + self._len() - lgt - sublen)]) + + @unwrap_spec(count=int) + def descr_replace(self, space, w_old, w_new, count=-1): + input = self._utf8 + + w_sub = self.convert_arg_to_w_unicode(space, w_old) + w_by = self.convert_arg_to_w_unicode(space, w_new) + # the following two lines are for being bug-to-bug compatible + # with CPython: see issue #2448 + if count >= 0 and len(input) == 0: + return self._empty() + try: + res, replacements = replace_count(input, w_sub._utf8, w_by._utf8, + count, isutf8=True) + except OverflowError: + raise oefmt(space.w_OverflowError, "replace string is too long") + + newlength = self._length + replacements * (w_by._length - w_sub._length) + return W_UnicodeObject(res, newlength) + + def descr_mul(self, space, w_times): + try: + times = space.getindex_w(w_times, space.w_OverflowError) + except OperationError as e: + if e.match(space, 
space.w_TypeError): + return space.w_NotImplemented + raise + if times <= 0: + return self._empty() + if len(self._utf8) == 1: + return W_UnicodeObject(self._utf8[0] * times, times) + return W_UnicodeObject(self._utf8 * times, times * self._len()) + + descr_rmul = descr_mul + + def _get_index_storage(self): + return jit.conditional_call_elidable(self._index_storage, + W_UnicodeObject._compute_index_storage, self) + + def _compute_index_storage(self): + storage = rutf8.create_utf8_index_storage(self._utf8, self._length) + self._index_storage = storage + return storage + + def _getitem_result(self, space, index): + if index < 0: + index += self._length + if index < 0 or index >= self._length: + raise oefmt(space.w_IndexError, "string index out of range") + start = self._index_to_byte(index) + end = rutf8.next_codepoint_pos(self._utf8, start) + return W_UnicodeObject(self._utf8[start:end], 1) + + def is_ascii(self): + return self._length == len(self._utf8) + + def _has_surrogates(self): + if self.is_ascii(): + return False + return rutf8.has_surrogates(self._utf8) + + def _index_to_byte(self, index): + if self.is_ascii(): + assert index >= 0 + return index + return rutf8.codepoint_position_at_index( + self._utf8, self._get_index_storage(), index) + + @always_inline + def _unwrap_and_search(self, space, w_sub, w_start, w_end, forward=True): + w_sub = self.convert_arg_to_w_unicode(space, w_sub) + start, end = unwrap_start_stop(space, self._length, w_start, w_end) + if start == 0: + start_index = 0 + elif start > self._length: + return None + else: + start_index = self._index_to_byte(start) + + if end >= self._length: + end = self._length + end_index = len(self._utf8) + else: + end_index = self._index_to_byte(end) + + if forward: + res_index = self._utf8.find(w_sub._utf8, start_index, end_index) + if res_index < 0: + return None + skip = rutf8.codepoints_in_utf8(self._utf8, start_index, res_index) + res = start + skip + assert res >= 0 + return space.newint(res) + else: + res_index = self._utf8.rfind(w_sub._utf8, start_index, end_index) + if res_index < 0: + return None + skip = rutf8.codepoints_in_utf8(self._utf8, res_index, end_index) + res = end - skip + assert res >= 0 + return space.newint(res) + + def _unwrap_and_compute_idx_params(self, space, w_start, w_end): + # unwrap start and stop indices, optimized for the case where + # start == 0 and end == self._length. Note that 'start' and + # 'end' are measured in codepoints whereas 'start_index' and + # 'end_index' are measured in bytes. 
+ start, end = unwrap_start_stop(space, self._length, w_start, w_end) + start_index = 0 + end_index = len(self._utf8) + 1 + if start > 0: + if start > self._length: + start_index = end_index + else: + start_index = self._index_to_byte(start) + if end < self._length: + end_index = self._index_to_byte(end) + return (start_index, end_index) + + @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) + def descr_rjust(self, space, width, w_fillchar): + value = self._utf8 + lgt = self._len() + w_fillchar = self.convert_arg_to_w_unicode(space, w_fillchar) + if w_fillchar._len() != 1: + raise oefmt(space.w_TypeError, + "rjust() argument 2 must be a single character") + d = width - lgt + if d > 0: + if len(w_fillchar._utf8) == 1: + # speedup + value = d * w_fillchar._utf8[0] + value + else: + value = d * w_fillchar._utf8 + value + return W_UnicodeObject(value, width) + + return W_UnicodeObject(value, lgt) + + @unwrap_spec(width=int, w_fillchar=WrappedDefault(' ')) + def descr_ljust(self, space, width, w_fillchar): + value = self._utf8 + w_fillchar = self.convert_arg_to_w_unicode(space, w_fillchar) + if w_fillchar._len() != 1: + raise oefmt(space.w_TypeError, + "ljust() argument 2 must be a single character") + d = width - self._len() + if d > 0: + if len(w_fillchar._utf8) == 1: + # speedup + value = value + d * w_fillchar._utf8[0] + else: + value = value + d * w_fillchar._utf8 + return W_UnicodeObject(value, width) + + return W_UnicodeObject(value, self._len()) + + def _utf8_sliced(self, start, stop, lgt): + assert start >= 0 + assert stop >= 0 + #if start == 0 and stop == len(s) and space.is_w(space.type(orig_obj), + # space.w_bytes): + # return orig_obj + return W_UnicodeObject(self._utf8[start:stop], lgt) + + def _strip_none(self, space, left, right): + "internal function called by str_xstrip methods" + value = self._utf8 + + lpos = 0 + rpos = len(value) + lgt = self._len() + + if left: + while lpos < rpos and rutf8.isspace(value, lpos): + lpos = rutf8.next_codepoint_pos(value, lpos) + lgt -= 1 + + if right: + while rpos > lpos and rutf8.isspace(value, + rutf8.prev_codepoint_pos(value, rpos)): + rpos = rutf8.prev_codepoint_pos(value, rpos) + lgt -= 1 + + assert rpos >= lpos # annotator hint, don't remove + return self._utf8_sliced(lpos, rpos, lgt) + + def _strip(self, space, w_chars, left, right, name='strip'): + "internal function called by str_xstrip methods" + value = self._utf8 + chars = self.convert_arg_to_w_unicode(space, w_chars, strict=name)._utf8 + + lpos = 0 + rpos = len(value) + lgt = self._len() + + if left: + while lpos < rpos and rutf8.utf8_in_chars(value, lpos, chars): + lpos = rutf8.next_codepoint_pos(value, lpos) + lgt -= 1 + + if right: + while rpos > lpos and rutf8.utf8_in_chars(value, + rutf8.prev_codepoint_pos(value, rpos), chars): + rpos = rutf8.prev_codepoint_pos(value, rpos) + lgt -= 1 + + assert rpos >= lpos # annotator hint, don't remove + return self._utf8_sliced(lpos, rpos, lgt) + + def descr_getnewargs(self, space): + return space.newtuple([W_UnicodeObject(self._utf8, self._length)]) + _starts_ends_unicode = True @@ -445,68 +1071,57 @@ def _get_encoding_and_errors(space, w_encoding, w_errors): return encoding, errors -def encode_object(space, w_object, encoding, errors): - if encoding is None: - # Get the encoder functions as a wrapped object. - # This lookup is cached. 
- w_encoder = space.sys.get_w_default_encoder() - else: - if errors is None or errors == 'strict': - if encoding == 'ascii': - u = space.unicode_w(w_object) - eh = unicodehelper.encode_error_handler(space) - return space.newbytes(unicode_encode_ascii( - u, len(u), None, errorhandler=eh)) - if encoding == 'utf-8': - u = space.unicode_w(w_object) - eh = unicodehelper.encode_error_handler(space) - return space.newbytes(unicode_encode_utf_8( - u, len(u), None, errorhandler=eh, - allow_surrogates=True)) - from pypy.module._codecs.interp_codecs import lookup_codec - w_encoder = space.getitem(lookup_codec(space, encoding), space.newint(0)) - if errors is None: - w_errors = space.newtext('strict') - else: - w_errors = space.newtext(errors) - w_restuple = space.call_function(w_encoder, w_object, w_errors) - w_retval = space.getitem(w_restuple, space.newint(0)) - if not space.isinstance_w(w_retval, space.w_bytes): - raise oefmt(space.w_TypeError, - "encoder did not return an string object (type '%T')", - w_retval) - return w_retval +def encode_object(space, w_obj, encoding, errors): + from pypy.module._codecs.interp_codecs import encode + if errors is None or errors == 'strict': + # fast path + if ((encoding is None and space.sys.defaultencoding == 'ascii') or + encoding == 'ascii'): + s = space.utf8_w(w_obj) + try: + rutf8.check_ascii(s) + except rutf8.CheckError as a: + if space.isinstance_w(w_obj, space.w_unicode): + eh = unicodehelper.encode_error_handler(space) + else: + # must be a bytes-like object. In order to encode it, + # first "decode" to unicode. Since we cannot, raise a + # UnicodeDecodeError, not a UnicodeEncodeError + eh = unicodehelper.decode_error_handler(space) + eh(None, "ascii", "ordinal not in range(128)", s, + a.pos, a.pos + 1) + assert False, "always raises" + return space.newbytes(s) + if ((encoding is None and space.sys.defaultencoding == 'utf8') or + encoding == 'utf-8' or encoding == 'utf8' or encoding == 'UTF-8'): + utf8 = space.utf8_w(w_obj) + if rutf8.has_surrogates(utf8): + utf8 = rutf8.reencode_utf8_with_surrogates(utf8) + return space.newbytes(utf8) + return encode(space, w_obj, encoding, errors) def decode_object(space, w_obj, encoding, errors): - if encoding is None: - encoding = getdefaultencoding(space) + from pypy.module._codecs.interp_codecs import lookup_codec, decode if errors is None or errors == 'strict': + # fast paths + if encoding is None: + encoding = getdefaultencoding(space) if encoding == 'ascii': - # XXX error handling s = space.charbuf_w(w_obj) - try: - u = fast_str_decode_ascii(s) - except ValueError: - eh = unicodehelper.decode_error_handler(space) - u = str_decode_ascii( # try again, to get the error right - s, len(s), None, final=True, errorhandler=eh)[0] - return space.newunicode(u) - if encoding == 'utf-8': - s = space.charbuf_w(w_obj) - eh = unicodehelper.decode_error_handler(space) - return space.newunicode(str_decode_utf_8( - s, len(s), None, final=True, errorhandler=eh, - allow_surrogates=True)[0]) - w_codecs = space.getbuiltinmodule("_codecs") - w_decode = space.getattr(w_codecs, space.newtext("decode")) - if errors is None: - w_retval = space.call_function(w_decode, w_obj, space.newtext(encoding)) - else: - w_retval = space.call_function(w_decode, w_obj, space.newtext(encoding), - space.newtext(errors)) - return w_retval - + unicodehelper.check_ascii_or_raise(space, s) + return space.newutf8(s, len(s)) + if encoding == 'utf-8' or encoding == 'utf8': + if (space.isinstance_w(w_obj, space.w_unicode) or + space.isinstance_w(w_obj, 
space.w_bytes)): + s = space.utf8_w(w_obj) + else: + s = space.charbuf_w(w_obj) + lgt = unicodehelper.check_utf8_or_raise(space, s) + return space.newutf8(s, lgt) + if encoding is None: + encoding = space.sys.defaultencoding + return decode(space, w_obj, encoding, errors) def unicode_from_encoded_object(space, w_obj, encoding, errors): # explicitly block bytearray on 2.7 @@ -534,7 +1149,7 @@ def unicode_from_object(space, w_obj): # test_unicode_conversion_with__str__ if w_unicode_method is None: if space.isinstance_w(w_obj, space.w_unicode): - return space.newunicode(space.unicode_w(w_obj)) + return space.convert_arg_to_w_unicode(w_obj) w_unicode_method = space.lookup(w_obj, "__str__") if w_unicode_method is not None: w_res = space.get_and_call_function(w_unicode_method, w_obj) @@ -551,11 +1166,8 @@ def unicode_from_string(space, w_bytes): if encoding != 'ascii': return unicode_from_encoded_object(space, w_bytes, encoding, "strict") s = space.bytes_w(w_bytes) - try: - return W_UnicodeObject(s.decode("ascii")) - except UnicodeDecodeError: - # raising UnicodeDecodeError is messy, "please crash for me" - return unicode_from_encoded_object(space, w_bytes, "ascii", "strict") + unicodehelper.check_ascii_or_raise(space, s) + return W_UnicodeObject(s, len(s)) class UnicodeDocstrings: @@ -1102,38 +1714,42 @@ def _create_list_from_unicode(value): return [s for s in value] -W_UnicodeObject.EMPTY = W_UnicodeObject(u'') +W_UnicodeObject.EMPTY = W_UnicodeObject('', 0) # Helper for converting int/long def unicode_to_decimal_w(space, w_unistr): if not isinstance(w_unistr, W_UnicodeObject): raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr) - unistr = w_unistr._value - result = ['\0'] * len(unistr) + unistr = w_unistr._utf8 + result = ['\0'] * w_unistr._length digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] - for i in xrange(len(unistr)): - uchr = ord(unistr[i]) - if unicodedb.isspace(uchr): - result[i] = ' ' + res_pos = 0 + iter = rutf8.Utf8StringIterator(unistr) + for uchr in iter: + if W_UnicodeObject._isspace(uchr): + result[res_pos] = ' ' + res_pos += 1 continue try: - result[i] = digits[unicodedb.decimal(uchr)] + result[res_pos] = digits[unicodedb.decimal(uchr)] except KeyError: if 0 < uchr < 256: - result[i] = chr(uchr) + result[res_pos] = chr(uchr) else: w_encoding = space.newtext('decimal') - w_start = space.newint(i) - w_end = space.newint(i+1) + pos = iter.get_pos() + w_start = space.newint(pos) + w_end = space.newint(pos+1) w_reason = space.newtext('invalid decimal Unicode string') raise OperationError(space.w_UnicodeEncodeError, space.newtuple([w_encoding, w_unistr, w_start, w_end, w_reason])) + res_pos += 1 return ''.join(result) -_repr_function, _ = make_unicode_escape_function( - pass_printable=False, unicode_output=False, quotes=True, prefix='u') +_repr_function = rutf8.make_utf8_escape_function( + pass_printable=False, quotes=True, prefix='u') |
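
The core change in this merge is that `W_UnicodeObject` now stores a UTF-8 encoded `str` (`_utf8`) plus a codepoint count (`_length`) instead of an RPython `unicode`, so indexing has to translate codepoint indices into byte offsets (see `_index_to_byte` and the `rutf8` helpers in the diff above). Below is a minimal plain-Python sketch of that mapping, assuming valid UTF-8 input; the helper names are illustrative, not the `rutf8` API, and the real code additionally caches an index storage (`_get_index_storage`) so repeated lookups avoid rescanning.

```python
# Illustrative sketch, not the rutf8 API: map a codepoint index to the
# byte offset of that codepoint inside a valid UTF-8 byte string.

def next_codepoint_pos(utf8, pos):
    # The leading byte of a UTF-8 sequence encodes its total length.
    first = ord(utf8[pos:pos + 1])
    if first < 0x80:
        return pos + 1      # 1-byte (ASCII) sequence
    elif first < 0xE0:
        return pos + 2      # 2-byte sequence, leading byte 110xxxxx
    elif first < 0xF0:
        return pos + 3      # 3-byte sequence, leading byte 1110xxxx
    else:
        return pos + 4      # 4-byte sequence, leading byte 11110xxx

def codepoint_index_to_byte(utf8, index):
    # Linear walk from the start; the diff's W_UnicodeObject avoids this
    # for repeated lookups by lazily building an index storage instead.
    pos = 0
    for _ in range(index):
        pos = next_codepoint_pos(utf8, pos)
    return pos

if __name__ == "__main__":
    s = u"caf\xe9 au lait".encode("utf-8")
    # Codepoint 3 is the two-byte e-acute, so codepoint 4 starts at byte 5.
    assert codepoint_index_to_byte(s, 4) == 5
    assert s[5:6] == b" "
```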