You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
67 lines
2.2 KiB
67 lines
2.2 KiB
7 years ago
|
From cc5681e3d07023e8684c5da962c4fb5fcecfd385 Mon Sep 17 00:00:00 2001
|
||
|
From: "Reynaldo H. Verdejo Pinochet" <reynaldo@osg.samsung.com>
|
||
|
Date: Fri, 28 Nov 2014 13:26:13 -0300
|
||
|
Subject: [PATCH] subparse: avoid false negatives dealing with UTF-8
|
||
|
|
||
|
g_utf8_validate() fails on any NUL within the first max_len
|
||
|
bytes so we should avoid passing null character
|
||
|
terminators if present. Additionally, only part of
|
||
|
the available data might be valid UTF-8. For example
|
||
|
a byte at the end might be the start of a valid UTF-8
|
||
|
run (e.g. 0xd0) but not be a valid UTF-8 character by
|
||
|
itself. In this case, we consume only the valid portion
|
||
|
of the run.
|
||
|
|
||
|
https://bugzilla.gnome.org/show_bug.cgi?id=740784
|
||
|
---
|
||
|
gst/subparse/gstsubparse.c | 23 +++++++++++++++++++++--
|
||
|
1 file changed, 21 insertions(+), 2 deletions(-)
|
||
|
|
||
|
diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c
|
||
|
index 11b76c3..2719fd7 100644
|
||
7 years ago
|
--- a/gst/subparse/gstsubparse.c
|
||
|
+++ b/gst/subparse/gstsubparse.c
|
||
7 years ago
|
@@ -437,6 +437,9 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len,
|
||
7 years ago
|
const gchar *encoding;
|
||
|
GError *err = NULL;
|
||
|
gchar *ret = NULL;
|
||
|
+ gsize nuls = 0;
|
||
|
+ gsize valid_utf8_len;
|
||
|
+ const gchar *invalid_utf8_start;
|
||
|
|
||
|
*consumed = 0;
|
||
|
|
||
7 years ago
|
@@ -457,11 +460,27 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len,
|
||
7 years ago
|
|
||
|
/* Otherwise check if it's UTF8 */
|
||
|
if (self->valid_utf8) {
|
||
|
- if (g_utf8_validate (str, len, NULL)) {
|
||
|
+ /* Trim NUL terminator(s) if present */
|
||
|
+ while (len > 0 && str[len - 1] == '\0') {
|
||
|
+ len--;
|
||
|
+ nuls++;
|
||
|
+ }
|
||
|
+
|
||
|
+ /* Consume whole byte run if all valid UTF-8 */
|
||
|
+ if (g_utf8_validate (str, len, &invalid_utf8_start)) {
|
||
|
GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed");
|
||
|
- *consumed = len;
|
||
|
+ *consumed = len + nuls;
|
||
|
return g_strndup (str, len);
|
||
|
}
|
||
|
+
|
||
|
+ /* Otherwise consume only the leading valid UTF-8 portion, if it is non-empty */
|
||
|
+ valid_utf8_len = invalid_utf8_start - str;
|
||
|
+ if (valid_utf8_len) {
|
||
|
+ GST_WARNING_OBJECT (self, "At least some of the data was invalid UTF-8");
|
||
|
+ *consumed = valid_utf8_len;
|
||
|
+ return g_strndup (str, valid_utf8_len);
|
||
|
+ }
|
||
|
+
|
||
|
GST_INFO_OBJECT (self, "invalid UTF-8!");
|
||
|
self->valid_utf8 = FALSE;
|
||
|
}
|
||
7 years ago
|
--
|
||
|
2.1.4
|
||
7 years ago
|
|