From 0a7fe7ad57ac815f0f45b69f8f734e402abb6f41 Mon Sep 17 00:00:00 2001
From: Romain Vimont
Date: Thu, 30 May 2019 19:01:08 +0200
Subject: [PATCH] Add helpers to truncate UTF-8 at code points

This will help to avoid truncating a UTF-8 string in the middle of a
code point, producing an invalid UTF-8 result.

The backward scan is bounded at index 0, so malformed input that starts
with continuation bytes yields 0 instead of indexing out of bounds.
---
 app/meson.build                               |  3 +-
 app/src/str_util.c                            | 16 +++++++
 app/src/str_util.h                            |  4 ++
 app/tests/test_strutil.c                      | 32 ++++++++++++++
 .../com/genymobile/scrcpy/StringUtils.java    | 23 ++++++++++
 .../genymobile/scrcpy/StringUtilsTest.java    | 44 +++++++++++++++++++
 6 files changed, 121 insertions(+), 1 deletion(-)
 create mode 100644 server/src/main/java/com/genymobile/scrcpy/StringUtils.java
 create mode 100644 server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java

diff --git a/app/meson.build b/app/meson.build
index 9bcaa9ae..c6c91e71 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -159,7 +159,8 @@ tests = [
         ]],
         ['test_control_event_serialize', [
             'tests/test_control_event_serialize.c',
-            'src/control_event.c'
+            'src/control_event.c',
+            'src/str_util.c'
         ]],
         ['test_strutil', [
             'tests/test_strutil.c',
diff --git a/app/src/str_util.c b/app/src/str_util.c
index d9ae6948..2878bf96 100644
--- a/app/src/str_util.c
+++ b/app/src/str_util.c
@@ -58,6 +58,22 @@ strquote(const char *src) {
     return quoted;
 }
 
+size_t
+utf8_truncation_index(const char *utf8, size_t max_len) {
+    size_t len = strlen(utf8);
+    if (len <= max_len) {
+        return len;
+    }
+    len = max_len;
+    // UTF-8 continuation bytes are 10xxxxxx; never step back below index 0
+    while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
+        // the next byte is not the start of a new UTF-8 codepoint
+        // so if we would cut there, the character would be truncated
+        len--;
+    }
+    return len;
+}
+
 #ifdef _WIN32
 
 wchar_t *
diff --git a/app/src/str_util.h b/app/src/str_util.h
index 9ef06cbf..0d1b9c01 100644
--- a/app/src/str_util.h
+++ b/app/src/str_util.h
@@ -23,6 +23,10 @@ xstrjoin(char *dst, const char *const tokens[], char sep, size_t n);
 char *
 strquote(const char *src);
 
+// return the index to truncate a UTF-8 string at a valid position
+size_t
+utf8_truncation_index(const char *utf8, size_t max_len);
+
 #ifdef _WIN32
 // convert a UTF-8 string to a wchar_t string
 // returns the new allocated string, to be freed by the caller
diff --git a/app/tests/test_strutil.c b/app/tests/test_strutil.c
index 1dd7fbbe..18ac4a7d 100644
--- a/app/tests/test_strutil.c
+++ b/app/tests/test_strutil.c
@@ -126,6 +126,37 @@ static void test_xstrjoin_truncated_after_sep(void) {
     assert(!strcmp("abc de ", s));
 }
 
+static void test_utf8_truncate(void) {
+    const char *s = "aÉbÔc";
+    assert(strlen(s) == 7); // É and Ô are 2 bytes-wide
+
+    size_t count;
+
+    count = utf8_truncation_index(s, 1);
+    assert(count == 1);
+
+    count = utf8_truncation_index(s, 2);
+    assert(count == 1); // É is 2 bytes-wide
+
+    count = utf8_truncation_index(s, 3);
+    assert(count == 3);
+
+    count = utf8_truncation_index(s, 4);
+    assert(count == 4);
+
+    count = utf8_truncation_index(s, 5);
+    assert(count == 4); // Ô is 2 bytes-wide
+
+    count = utf8_truncation_index(s, 6);
+    assert(count == 6);
+
+    count = utf8_truncation_index(s, 7);
+    assert(count == 7);
+
+    count = utf8_truncation_index(s, 8);
+    assert(count == 7); // no more chars
+}
+
 int main(void) {
     test_xstrncpy_simple();
     test_xstrncpy_just_fit();
@@ -135,5 +166,6 @@ int main(void) {
     test_xstrjoin_truncated_in_token();
     test_xstrjoin_truncated_before_sep();
     test_xstrjoin_truncated_after_sep();
+    test_utf8_truncate();
     return 0;
 }
diff --git a/server/src/main/java/com/genymobile/scrcpy/StringUtils.java b/server/src/main/java/com/genymobile/scrcpy/StringUtils.java
new file mode 100644
index 00000000..199fc8c1
--- /dev/null
+++ b/server/src/main/java/com/genymobile/scrcpy/StringUtils.java
@@ -0,0 +1,23 @@
+package com.genymobile.scrcpy;
+
+public final class StringUtils {
+    private StringUtils() {
+        // not instantiable
+    }
+
+    @SuppressWarnings("checkstyle:MagicNumber")
+    public static int getUtf8TruncationIndex(byte[] utf8, int maxLength) {
+        int len = utf8.length;
+        if (len <= maxLength) {
+            return len;
+        }
+        len = Math.max(maxLength, 0); // a negative limit keeps no bytes
+        // UTF-8 continuation bytes are 10xxxxxx; never step back below index 0
+        while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
+            // the next byte is not the start of a new UTF-8 codepoint
+            // so if we would cut there, the character would be truncated
+            len--;
+        }
+        return len;
+    }
+}
diff --git a/server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java b/server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java
new file mode 100644
index 00000000..a2683945
--- /dev/null
+++ b/server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java
@@ -0,0 +1,44 @@
+package com.genymobile.scrcpy;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+import java.nio.charset.StandardCharsets;
+
+public class StringUtilsTest {
+
+    @Test
+    @SuppressWarnings("checkstyle:MagicNumber")
+    public void testUtf8Truncate() {
+        String s = "aÉbÔc";
+        byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
+        Assert.assertEquals(7, utf8.length);
+
+        int count;
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 1);
+        Assert.assertEquals(1, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 2);
+        Assert.assertEquals(1, count); // É is 2 bytes-wide
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 3);
+        Assert.assertEquals(3, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 4);
+        Assert.assertEquals(4, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 5);
+        Assert.assertEquals(4, count); // Ô is 2 bytes-wide
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 6);
+        Assert.assertEquals(6, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 7);
+        Assert.assertEquals(7, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 8);
+        Assert.assertEquals(7, count); // no more chars
+    }
+}