From cc5681e3d07023e8684c5da962c4fb5fcecfd385 Mon Sep 17 00:00:00 2001 From: "Reynaldo H. Verdejo Pinochet" Date: Fri, 28 Nov 2014 13:26:13 -0300 Subject: [PATCH] subparse: avoid false negatives dealing with UTF-8 g_utf8_validate() fails on any NUL within the max_len bytes it is given, so we should avoid passing NUL terminators if present. Additionally, only part of the available data might be valid UTF-8. For example, a byte at the end might be the lead byte of a valid multi-byte UTF-8 sequence (e.g. 0xd0) but not be a valid UTF-8 character by itself. In this case, we consume only the valid portion of the run. https://bugzilla.gnome.org/show_bug.cgi?id=740784 --- gst/subparse/gstsubparse.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c index 11b76c3..2719fd7 100644 --- a/gst/subparse/gstsubparse.c +++ b/gst/subparse/gstsubparse.c @@ -437,6 +437,9 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len, const gchar *encoding; GError *err = NULL; gchar *ret = NULL; + gsize nuls = 0; + gsize valid_utf8_len; + const gchar *invalid_utf8_start; *consumed = 0; @@ -457,11 +460,27 @@ convert_encoding (GstSubParse * self, const gchar * str, gsize len, /* Otherwise check if it's UTF8 */ if (self->valid_utf8) { - if (g_utf8_validate (str, len, NULL)) { + /* Trim NUL terminator(s) if present */ + while (len > 0 && str[len - 1] == '\0') { + len--; + nuls++; + } + + /* Consume whole byte run if all valid UTF-8 */ + if (g_utf8_validate (str, len, &invalid_utf8_start)) { GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed"); - *consumed = len; + *consumed = len + nuls; return g_strndup (str, len); } + + /* Consume initial data as far as we have at least 1 valid code point */ + valid_utf8_len = invalid_utf8_start - str; + if (valid_utf8_len) { + GST_WARNING_OBJECT (self, "At least some of the data was invalid UTF-8"); + *consumed = valid_utf8_len; + return g_strndup (str, valid_utf8_len); + } + 
GST_INFO_OBJECT (self, "invalid UTF-8!"); self->valid_utf8 = FALSE; } -- 2.1.4