some ascii fast paths of latin-1 encoding/decoding

author: Carl Friedrich Bolz-Tereick <cfbolz@gmx.de> 2021-03-02 20:37:27 +0100
committer: Carl Friedrich Bolz-Tereick <cfbolz@gmx.de> 2021-03-02 20:37:27 +0100
commit: df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2 (patch)
tree: f2f76762def2af41963672a4cc06774673d3fe69
parent: add ascii fast paths to the tolower/toupper functions of the unicode dbs too (diff)
download: pypy-df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2.tar.gz
pypy-df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2.tar.bz2
pypy-df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2.zip
4 files changed, 23 insertions, 4 deletions
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
index 4d849cd9cf..34e08da5ac 100644
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -11,6 +11,7 @@ from rpython.rlib import rutf8
 
 from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
+from pypy.interpreter.unicodehelper import utf8_encode_latin_1
 from pypy.interpreter import unicodehelper as uh
 from pypy.module._codecs.interp_codecs import CodecState
 
@@ -91,3 +92,9 @@ def test_encode_decimal(space):
     result = uh.unicode_encode_decimal(
         u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
     assert result == '12&#4660;'
+
+def test_utf8_encode_latin1_ascii_prefix():
+    utf8 = b'abcde\xc3\xa4g'  # ASCII prefix "abcde", then U+00E4 as two UTF-8 bytes, then "g"
+    b = utf8_encode_latin_1(utf8, None, None)
+    assert b == b'abcde\xe4g'  # prefix kept as-is; U+00E4 becomes the single byte 0xE4
+
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
index 34fde1c874..d17ccb767b 100644
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -155,14 +155,15 @@ def utf8_encode_latin_1(s, errors, errorhandler):
     try:
         rutf8.check_ascii(s)
         return s
-    except rutf8.CheckError:
-        return _utf8_encode_latin_1_slowpath(s, errors, errorhandler)
+    except rutf8.CheckError, e:
+        return _utf8_encode_latin_1_slowpath(s, e.pos, errors, errorhandler)
 
-def _utf8_encode_latin_1_slowpath(s, errors, errorhandler):
+def _utf8_encode_latin_1_slowpath(s, first_non_ascii_char, errors, errorhandler):
     size = len(s)
     result = StringBuilder(size)
     index = 0
-    pos = 0
+    result.append_slice(s, 0, first_non_ascii_char)
+    pos = first_non_ascii_char
     while pos < size:
         ch = rutf8.codepoint_at_pos(s, pos)
         if ch <= 0xFF:
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
index e8763dc496..7f88ed9721 100644
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -217,6 +217,14 @@ class TestUnicodeObject:
             uniupper, = unicodedb.toupper_full(ch)
             assert chr(uniupper) == chr(ch).upper()
 
+    def test_latin1_encode_shortcut_ascii(self, monkeypatch):
+        from rpython.rlib import rutf8
+        from pypy.objspace.std.unicodeobject import encode_object
+        monkeypatch.setattr(rutf8, "check_ascii", None)  # any call to check_ascii would now raise; presumably proves the shortcut avoids it
+        w_b = encode_object(self.space, self.space.newutf8("abc", 3), "latin-1", "strict")
+        assert self.space.bytes_w(w_b) == "abc"
+
+
 class AppTestUnicodeStringStdOnly:
     def test_compares(self):
         assert u'a' == 'a'
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
index 0be4a9e55c..16edebfb03 100644
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1216,6 +1216,9 @@ def encode_object(space, w_obj, encoding, errors):
             if rutf8.has_surrogates(utf8):
                 utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
             return space.newbytes(utf8)
+        if ((encoding == "latin1" or encoding == "latin-1") and  # `and` binds tighter than `or`
+                isinstance(w_obj, W_UnicodeObject) and w_obj.is_ascii()):
+            return space.newbytes(w_obj._utf8)  # ASCII-only utf8 is byte-identical in latin-1
     return encode(space, w_obj, encoding, errors)
author	Carl Friedrich Bolz-Tereick <cfbolz@gmx.de>	2021-03-02 20:37:27 +0100
committer	Carl Friedrich Bolz-Tereick <cfbolz@gmx.de>	2021-03-02 20:37:27 +0100
commit	df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2 (patch)
tree	f2f76762def2af41963672a4cc06774673d3fe69
parent	add ascii fast paths to the tolower/toupper functions of the unicode dbs too (diff)
download	pypy-df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2.tar.gz pypy-df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2.tar.bz2 pypy-df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2.zip