Add helpers to truncate UTF-8 at code points

These helpers make it possible to avoid truncating a UTF-8 string in the middle of a code point, which would produce an invalid UTF-8 result.
2019-05-30 19:01:08 +02:00 · 2019-05-30 19:01:08 +02:00 · 0a7fe7ad57
commit 0a7fe7ad57
parent 3aa5426cad
6 changed files with 121 additions and 1 deletions
--- a/app/meson.build
+++ b/app/meson.build
@ -159,7 +159,8 @@ tests = [
    ]],
    ['test_control_event_serialize', [
        'tests/test_control_event_serialize.c',
-        'src/control_event.c'
+        'src/control_event.c',
        'src/str_util.c'
    ]],
    ['test_strutil', [
        'tests/test_strutil.c',
--- a/app/src/str_util.c
+++ b/app/src/str_util.c
@ -58,6 +58,22 @@ strquote(const char *src) {
    return quoted;
 }
 // Return the byte index at which the NUL-terminated string utf8 can be cut so
 // that the result is at most max_len bytes long and does not end in the
 // middle of a UTF-8 code point.
 //
 // If the string is not longer than max_len bytes, its full length is
 // returned. On malformed input that starts with continuation bytes, the scan
 // stops at index 0 instead of running before the start of the buffer.
 size_t
 utf8_truncation_index(const char *utf8, size_t max_len) {
    size_t len = strlen(utf8);
    if (len <= max_len) {
        // the whole string fits, nothing to truncate
        return len;
    }
    len = max_len;
    // see UTF-8 encoding <https://en.wikipedia.org/wiki/UTF-8#Description>
    // a continuation byte has the form 10xxxxxx
    while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
        // the next byte is not the start of a new UTF-8 codepoint
        // so if we would cut there, the character would be truncated
        //
        // the (len > 0) guard is required: len is unsigned, so decrementing
        // it below 0 would wrap around and read out of bounds
        len--;
    }
    return len;
 }
 #ifdef _WIN32
 wchar_t *
--- a/app/src/str_util.h
+++ b/app/src/str_util.h
@ -23,6 +23,10 @@ xstrjoin(char *dst, const char *const tokens[], char sep, size_t n);
 char *
 strquote(const char *src);
 // return the byte index at which to truncate a NUL-terminated UTF-8 string so
 // that the result is at most max_len bytes long and, for well-formed UTF-8,
 // does not end in the middle of a code point
 //
 // if the string is not longer than max_len bytes, its length is returned
 size_t
 utf8_truncation_index(const char *utf8, size_t max_len);
 #ifdef _WIN32
 // convert a UTF-8 string to a wchar_t string
 // returns the new allocated string, to be freed by the caller
--- a/app/tests/test_strutil.c
+++ b/app/tests/test_strutil.c
@ -126,6 +126,37 @@ static void test_xstrjoin_truncated_after_sep(void) {
    assert(!strcmp("abc de ", s));
 }
 // Check utf8_truncation_index() for every limit from 1 to 8 on a string that
 // mixes 1-byte and 2-byte characters.
 static void test_utf8_truncate(void) {
    const char *text = "aÉbÔc";
    assert(strlen(text) == 7); // 'É' and 'Ô' take 2 bytes each

    // limit right after 'a'
    assert(utf8_truncation_index(text, 1) == 1);

    // limit in the middle of 'É': fall back to just after 'a'
    assert(utf8_truncation_index(text, 2) == 1);

    // limits right after 'É' and right after 'b'
    assert(utf8_truncation_index(text, 3) == 3);
    assert(utf8_truncation_index(text, 4) == 4);

    // limit in the middle of 'Ô': fall back to just after 'b'
    assert(utf8_truncation_index(text, 5) == 4);

    // limits right after 'Ô' and at the end of the string
    assert(utf8_truncation_index(text, 6) == 6);
    assert(utf8_truncation_index(text, 7) == 7);

    // limit beyond the end: the whole string length is returned
    assert(utf8_truncation_index(text, 8) == 7);
 }
 int main(void) {
    test_xstrncpy_simple();
    test_xstrncpy_just_fit();
@ -135,5 +166,6 @@ int main(void) {
    test_xstrjoin_truncated_in_token();
    test_xstrjoin_truncated_before_sep();
    test_xstrjoin_truncated_after_sep();
    test_utf8_truncate();
    return 0;
 }
--- a/server/src/main/java/com/genymobile/scrcpy/StringUtils.java
+++ b/server/src/main/java/com/genymobile/scrcpy/StringUtils.java
@ -0,0 +1,23 @@
 package com.genymobile.scrcpy;
 /**
  * String-related helpers.
  */
 public final class StringUtils {
    private StringUtils() {
        // not instantiable
    }

    /**
     * Returns the index at which a UTF-8 encoded byte array can be truncated so that the result
     * is at most {@code maxLength} bytes long and does not end in the middle of a code point.
     *
     * <p>If the array is not longer than {@code maxLength}, its full length is returned. On
     * malformed input that starts with continuation bytes, the scan stops at index 0.
     *
     * @param utf8 the UTF-8 encoded bytes
     * @param maxLength the maximum number of bytes to keep, must not be negative
     * @return the truncation index, between 0 and {@code min(utf8.length, maxLength)}
     * @throws IllegalArgumentException if {@code maxLength} is negative
     */
    @SuppressWarnings("checkstyle:MagicNumber")
    public static int getUtf8TruncationIndex(byte[] utf8, int maxLength) {
        int len = utf8.length;
        if (maxLength < 0) {
            throw new IllegalArgumentException("maxLength must not be negative: " + maxLength);
        }
        if (len <= maxLength) {
            // the whole array fits, nothing to truncate
            return len;
        }
        len = maxLength;
        // see UTF-8 encoding <https://en.wikipedia.org/wiki/UTF-8#Description>
        // a continuation byte has the form 10xxxxxx
        while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
            // the next byte is not the start of a new UTF-8 codepoint
            // so if we would cut there, the character would be truncated
            //
            // the (len > 0) guard keeps the scan inside the array on malformed input
            len--;
        }
        return len;
    }
 }
--- a/server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java
+++ b/server/src/test/java/com/genymobile/scrcpy/StringUtilsTest.java
@ -0,0 +1,44 @@
 package com.genymobile.scrcpy;
 import junit.framework.Assert;
 import org.junit.Test;
 import java.nio.charset.StandardCharsets;
 /**
  * Tests for {@link StringUtils}.
  */
 public class StringUtilsTest {

    /**
     * Checks the truncation index for every limit from 1 to 8 on a string that mixes 1-byte and
     * 2-byte characters.
     */
    @Test
    @SuppressWarnings("checkstyle:MagicNumber")
    public void testUtf8Trucate() {
        String text = "aÉbÔc";
        byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
        // 'É' and 'Ô' take 2 bytes each
        Assert.assertEquals(7, bytes.length);

        // limit right after 'a'
        Assert.assertEquals(1, StringUtils.getUtf8TruncationIndex(bytes, 1));

        // limit in the middle of 'É': fall back to just after 'a'
        Assert.assertEquals(1, StringUtils.getUtf8TruncationIndex(bytes, 2));

        // limits right after 'É' and right after 'b'
        Assert.assertEquals(3, StringUtils.getUtf8TruncationIndex(bytes, 3));
        Assert.assertEquals(4, StringUtils.getUtf8TruncationIndex(bytes, 4));

        // limit in the middle of 'Ô': fall back to just after 'b'
        Assert.assertEquals(4, StringUtils.getUtf8TruncationIndex(bytes, 5));

        // limits right after 'Ô' and at the end of the array
        Assert.assertEquals(6, StringUtils.getUtf8TruncationIndex(bytes, 6));
        Assert.assertEquals(7, StringUtils.getUtf8TruncationIndex(bytes, 7));

        // limit beyond the end: the whole length is returned
        Assert.assertEquals(7, StringUtils.getUtf8TruncationIndex(bytes, 8));
    }
 }