From 0a7fe7ad57ac815f0f45b69f8f734e402abb6f41 Mon Sep 17 00:00:00 2001
From: Romain Vimont <rom@rom1v.com>
Date: Thu, 30 May 2019 19:01:08 +0200
Subject: [PATCH] Add helpers to truncate UTF-8 at code points

This will help to avoid truncating a UTF-8 string in the middle of a
code point, producing an invalid UTF-8 result.
---
 app/meson.build                               |  3 +-
 app/src/str_util.c                            | 16 +++++++
 app/src/str_util.h                            |  4 ++
 app/tests/test_strutil.c                      | 32 ++++++++++++++
 .../com/genymobile/scrcpy/StringUtils.java    | 23 ++++++++++
 .../genymobile/scrcpy/StringUtilsTest.java    | 44 +++++++++++++++++++
 6 files changed, 121 insertions(+), 1 deletion(-)
 create mode 100644 server/src/main/java/com/genymobile/scrcpy/StringUtils.java
 create mode 100644 server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java

diff --git a/app/meson.build b/app/meson.build
index 9bcaa9ae..c6c91e71 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -159,7 +159,8 @@ tests = [
     ]],
     ['test_control_event_serialize', [
         'tests/test_control_event_serialize.c',
-        'src/control_event.c'
+        'src/control_event.c',
+        'src/str_util.c'
     ]],
     ['test_strutil', [
         'tests/test_strutil.c',
diff --git a/app/src/str_util.c b/app/src/str_util.c
index d9ae6948..2878bf96 100644
--- a/app/src/str_util.c
+++ b/app/src/str_util.c
@@ -58,6 +58,22 @@ strquote(const char *src) {
     return quoted;
 }
 
+size_t
+utf8_truncation_index(const char *utf8, size_t max_len) {
+    size_t len = strlen(utf8);
+    if (len <= max_len) {
+        return len;
+    }
+    len = max_len;
+    // see UTF-8 encoding <https://en.wikipedia.org/wiki/UTF-8#Description>
+    while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
+        // utf8[len] is a continuation byte (10xxxxxx), so cutting here would
+        // split a code point; len > 0 avoids underflow on malformed input
+        len--;
+    }
+    return len;
+}
+
 #ifdef _WIN32
 
 wchar_t *
diff --git a/app/src/str_util.h b/app/src/str_util.h
index 9ef06cbf..0d1b9c01 100644
--- a/app/src/str_util.h
+++ b/app/src/str_util.h
@@ -23,6 +23,10 @@ xstrjoin(char *dst, const char *const tokens[], char sep, size_t n);
 char *
 strquote(const char *src);
 
+// return the greatest length <= max_len that does not split a UTF-8 code point
+size_t
+utf8_truncation_index(const char *utf8, size_t max_len);
+
 #ifdef _WIN32
 // convert a UTF-8 string to a wchar_t string
 // returns the new allocated string, to be freed by the caller
diff --git a/app/tests/test_strutil.c b/app/tests/test_strutil.c
index 1dd7fbbe..18ac4a7d 100644
--- a/app/tests/test_strutil.c
+++ b/app/tests/test_strutil.c
@@ -126,6 +126,37 @@ static void test_xstrjoin_truncated_after_sep(void) {
     assert(!strcmp("abc de ", s));
 }
 
+static void test_utf8_truncate(void) {
+    const char *s = "aÉbÔc";
+    assert(strlen(s) == 7); // É and Ô are encoded on 2 bytes each
+
+    size_t count;
+
+    count = utf8_truncation_index(s, 1);
+    assert(count == 1);
+
+    count = utf8_truncation_index(s, 2);
+    assert(count == 1); // index 2 is inside É (bytes 1-2), back up to 1
+
+    count = utf8_truncation_index(s, 3);
+    assert(count == 3);
+
+    count = utf8_truncation_index(s, 4);
+    assert(count == 4);
+
+    count = utf8_truncation_index(s, 5);
+    assert(count == 4); // index 5 is inside Ô (bytes 4-5), back up to 4
+
+    count = utf8_truncation_index(s, 6);
+    assert(count == 6);
+
+    count = utf8_truncation_index(s, 7);
+    assert(count == 7);
+
+    count = utf8_truncation_index(s, 8);
+    assert(count == 7); // max_len exceeds the length: nothing to truncate
+}
+
 int main(void) {
     test_xstrncpy_simple();
     test_xstrncpy_just_fit();
@@ -135,5 +166,6 @@ int main(void) {
     test_xstrjoin_truncated_in_token();
     test_xstrjoin_truncated_before_sep();
     test_xstrjoin_truncated_after_sep();
+    test_utf8_truncate();
     return 0;
 }
diff --git a/server/src/main/java/com/genymobile/scrcpy/StringUtils.java b/server/src/main/java/com/genymobile/scrcpy/StringUtils.java
new file mode 100644
index 00000000..199fc8c1
--- /dev/null
+++ b/server/src/main/java/com/genymobile/scrcpy/StringUtils.java
@@ -0,0 +1,23 @@
+package com.genymobile.scrcpy;
+
+public final class StringUtils {
+    private StringUtils() {
+        // not instantiable
+    }
+
+    @SuppressWarnings("checkstyle:MagicNumber")
+    public static int getUtf8TruncationIndex(byte[] utf8, int maxLength) {
+        int len = utf8.length;
+        if (len <= maxLength) {
+            return len;
+        }
+        len = Math.max(maxLength, 0);
+        // see UTF-8 encoding <https://en.wikipedia.org/wiki/UTF-8#Description>
+        while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
+            // utf8[len] is a continuation byte (10xxxxxx), so cutting here would
+            // split a code point; len > 0 guards malformed or negative input
+            len--;
+        }
+        return len;
+    }
+}
diff --git a/server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java b/server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java
new file mode 100644
index 00000000..a2683945
--- /dev/null
+++ b/server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java
@@ -0,0 +1,44 @@
+package com.genymobile.scrcpy;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+import java.nio.charset.StandardCharsets;
+
+public class StringUtilsTest {
+
+    @Test
+    @SuppressWarnings("checkstyle:MagicNumber")
+    public void testUtf8Trucate() {
+        String s = "aÉbÔc";
+        byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
+        Assert.assertEquals(7, utf8.length);
+
+        int count;
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 1);
+        Assert.assertEquals(1, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 2);
+        Assert.assertEquals(1, count); // index 2 is inside É (bytes 1-2), back up to 1
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 3);
+        Assert.assertEquals(3, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 4);
+        Assert.assertEquals(4, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 5);
+        Assert.assertEquals(4, count); // index 5 is inside Ô (bytes 4-5), back up to 4
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 6);
+        Assert.assertEquals(6, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 7);
+        Assert.assertEquals(7, count);
+
+        count = StringUtils.getUtf8TruncationIndex(utf8, 8);
+        Assert.assertEquals(7, count); // maxLength exceeds the length: nothing to truncate
+    }
+}