1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
|
https://github.com/streamlink/streamlink/commit/9d8156dd794ee0919297cd90d85bcc11b8a28358
From 9d8156dd794ee0919297cd90d85bcc11b8a28358 Mon Sep 17 00:00:00 2001
From: bastimeyer <mail@bastimeyer.de>
Date: Tue, 21 Nov 2023 20:10:47 +0100
Subject: [PATCH] utils.parse: fix libxml2 2.12.0 compatibility
---
src/streamlink/compat.py | 11 ++++
src/streamlink/utils/parse.py | 17 +++++-
tests/utils/test_parse.py | 112 ++++++++++++++++++++++++++--------
3 files changed, 114 insertions(+), 26 deletions(-)
diff --git a/src/streamlink/compat.py b/src/streamlink/compat.py
index c75201544d3..993bce64cfd 100644
--- a/src/streamlink/compat.py
+++ b/src/streamlink/compat.py
@@ -2,11 +2,22 @@
import sys
+# compatibility import of charset_normalizer/chardet via requests<3.0
+try:
+ from requests.compat import chardet as charset_normalizer # type: ignore
+except ImportError: # pragma: no cover
+ import charset_normalizer
+
+
is_darwin = sys.platform == "darwin"
is_win32 = os.name == "nt"
+detect_encoding = charset_normalizer.detect
+
+
__all__ = [
"is_darwin",
"is_win32",
+ "detect_encoding",
]
diff --git a/src/streamlink/utils/parse.py b/src/streamlink/utils/parse.py
index 8c9f79c8b51..17479b81f59 100644
--- a/src/streamlink/utils/parse.py
+++ b/src/streamlink/utils/parse.py
@@ -4,6 +4,7 @@
from lxml.etree import HTML, XML
+from streamlink.compat import detect_encoding
from streamlink.plugin import PluginError
@@ -51,7 +52,21 @@ def parse_html(
- Removes XML declarations of invalid XHTML5 documents
- Wraps errors in custom exception with a snippet of the data in the message
"""
- if isinstance(data, str) and data.lstrip().startswith("<?xml"):
+ # strip XML text declarations from XHTML5 documents which were incorrectly defined as HTML5
+ is_bytes = isinstance(data, bytes)
+ if data and data.lstrip()[:5].lower() == (b"<?xml" if is_bytes else "<?xml"):
+ if is_bytes:
+ # get the document's encoding using the "encoding" attribute value of the XML text declaration
+ match = re.match(rb"^\s*<\?xml\s.*?encoding=(?P<q>[\'\"])(?P<encoding>.+?)(?P=q).*?\?>", data, re.IGNORECASE)
+ if match:
+ encoding_value = detect_encoding(match["encoding"])["encoding"]
+ encoding = match["encoding"].decode(encoding_value)
+ else:
+ # no "encoding" attribute: try to figure out encoding from the document's content
+ encoding = detect_encoding(data)["encoding"]
+
+ data = data.decode(encoding)
+
data = re.sub(r"^\s*<\?xml.+?\?>", "", data)
return _parse(HTML, data, name, exception, schema, *args, **kwargs)
diff --git a/tests/utils/test_parse.py b/tests/utils/test_parse.py
index aedae7d4e8e..69c16f282b9 100644
--- a/tests/utils/test_parse.py
+++ b/tests/utils/test_parse.py
@@ -74,31 +74,93 @@ def test_parse_xml_entities(self):
assert actual.tag == expected.tag
assert actual.attrib == expected.attrib
- def test_parse_xml_encoding(self):
- tree = parse_xml("""<?xml version="1.0" encoding="UTF-8"?><test>ä</test>""")
- assert tree.xpath(".//text()") == ["ä"]
- tree = parse_xml("""<test>ä</test>""")
- assert tree.xpath(".//text()") == ["ä"]
- tree = parse_xml(b"""<?xml version="1.0" encoding="UTF-8"?><test>\xC3\xA4</test>""")
- assert tree.xpath(".//text()") == ["ä"]
- tree = parse_xml(b"""<test>\xC3\xA4</test>""")
- assert tree.xpath(".//text()") == ["ä"]
-
- def test_parse_html_encoding(self):
- tree = parse_html("""<!DOCTYPE html><html><head><meta charset="utf-8"/></head><body>ä</body></html>""")
- assert tree.xpath(".//body/text()") == ["ä"]
- tree = parse_html("""<!DOCTYPE html><html><body>ä</body></html>""")
- assert tree.xpath(".//body/text()") == ["ä"]
- tree = parse_html(b"""<!DOCTYPE html><html><meta charset="utf-8"/><body>\xC3\xA4</body></html>""")
- assert tree.xpath(".//body/text()") == ["ä"]
- tree = parse_html(b"""<!DOCTYPE html><html><body>\xC3\xA4</body></html>""")
- assert tree.xpath(".//body/text()") == ["ä"]
-
- def test_parse_html_xhtml5(self):
- tree = parse_html("""<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html><html><body>ä?></body></html>""")
- assert tree.xpath(".//body/text()") == ["ä?>"]
- tree = parse_html(b"""<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html><html><body>\xC3\xA4?></body></html>""")
- assert tree.xpath(".//body/text()") == ["ä?>"]
+ @pytest.mark.parametrize(("content", "expected"), [
+ pytest.param(
+ """<?xml version="1.0" encoding="UTF-8"?><test>ä</test>""",
+ "ä",
+ id="string-utf-8",
+ ),
+ pytest.param(
+ """<test>ä</test>""",
+ "ä",
+ id="string-unknown",
+ ),
+ pytest.param(
+ b"""<?xml version="1.0" encoding="UTF-8"?><test>\xC3\xA4</test>""",
+ "ä",
+ id="bytes-utf-8",
+ ),
+ pytest.param(
+ b"""<?xml version="1.0" encoding="ISO-8859-1"?><test>\xE4</test>""",
+ "ä",
+ id="bytes-iso-8859-1",
+ ),
+ pytest.param(
+ b"""<test>\xC3\xA4</test>""",
+ "ä",
+ id="bytes-unknown",
+ ),
+ ])
+ def test_parse_xml_encoding(self, content, expected):
+ tree = parse_xml(content)
+ assert tree.xpath(".//text()") == [expected]
+
+ @pytest.mark.parametrize(("content", "expected"), [
+ pytest.param(
+ """<!DOCTYPE html><html><head><meta charset="utf-8"/></head><body>ä</body></html>""",
+ "ä",
+ id="string-utf-8",
+ ),
+ pytest.param(
+ """<!DOCTYPE html><html><body>ä</body></html>""",
+ "ä",
+ id="string-unknown",
+ ),
+ pytest.param(
+ b"""<!DOCTYPE html><html><head><meta charset="utf-8"/></head><body>\xC3\xA4</body></html>""",
+ "ä",
+ id="bytes-utf-8",
+ ),
+ pytest.param(
+ b"""<!DOCTYPE html><html><head><meta charset="ISO-8859-1"/></head><body>\xE4</body></html>""",
+ "ä",
+ id="bytes-iso-8859-1",
+ ),
+ pytest.param(
+ b"""<!DOCTYPE html><html><body>\xC3\xA4</body></html>""",
+ "ä",
+ id="bytes-unknown",
+ ),
+ ])
+ def test_parse_html_encoding(self, content, expected):
+ tree = parse_html(content)
+ assert tree.xpath(".//body/text()") == [expected]
+
+ @pytest.mark.parametrize(("content", "expected"), [
+ pytest.param(
+ """<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html><html><body>ä?></body></html>""",
+ "ä?>",
+ id="string",
+ ),
+ pytest.param(
+ b"""<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html><html><body>\xC3\xA4?></body></html>""",
+ "ä?>",
+ id="bytes-utf-8",
+ ),
+ pytest.param(
+ b"""<?xml version="1.0" encoding="ISO-8859-1"?><!DOCTYPE html><html><body>\xE4?></body></html>""",
+ "ä?>",
+ id="bytes-iso-8859-1",
+ ),
+ pytest.param(
+ b"""<?xml version="1.0"?><!DOCTYPE html><html><body>\xC3\xA4?></body></html>""",
+ "ä?>",
+ id="bytes-unknown",
+ ),
+ ])
+ def test_parse_html_xhtml5(self, content, expected):
+ tree = parse_html(content)
+ assert tree.xpath(".//body/text()") == [expected]
def test_parse_qsd(self):
assert parse_qsd("test=1&foo=bar", schema=validate.Schema({"test": str, "foo": "bar"})) == {"test": "1", "foo": "bar"}
|