Add helpers to truncate UTF-8 at code points
This helps to avoid truncating a UTF-8 string in the middle of a code point, which would produce an invalid UTF-8 result.
This commit is contained in:
parent
3aa5426cad
commit
0a7fe7ad57
6 changed files with 121 additions and 1 deletions
|
@ -159,7 +159,8 @@ tests = [
|
||||||
]],
|
]],
|
||||||
['test_control_event_serialize', [
|
['test_control_event_serialize', [
|
||||||
'tests/test_control_event_serialize.c',
|
'tests/test_control_event_serialize.c',
|
||||||
'src/control_event.c'
|
'src/control_event.c',
|
||||||
|
'src/str_util.c'
|
||||||
]],
|
]],
|
||||||
['test_strutil', [
|
['test_strutil', [
|
||||||
'tests/test_strutil.c',
|
'tests/test_strutil.c',
|
||||||
|
|
|
@ -58,6 +58,22 @@ strquote(const char *src) {
|
||||||
return quoted;
|
return quoted;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the number of leading bytes of the NUL-terminated string utf8 to
// keep so that the result is at most max_len bytes long and does not end in
// the middle of a UTF-8 code point.
//
// If the string already fits in max_len bytes, its full length is returned.
// If no code point boundary exists at or before max_len (malformed input
// starting with continuation bytes), 0 is returned.
size_t
utf8_truncation_index(const char *utf8, size_t max_len) {
    size_t len = strlen(utf8);
    if (len <= max_len) {
        return len;
    }
    len = max_len;
    // see UTF-8 encoding <https://en.wikipedia.org/wiki/UTF-8#Description>
    // A continuation byte has the form 10xxxxxx. While the byte at the cut
    // position is a continuation byte, cutting there would split a code
    // point, so step back. The len > 0 guard prevents the unsigned index
    // from wrapping around when the input starts with continuation bytes.
    while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
        len--;
    }
    return len;
}
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
|
|
||||||
wchar_t *
|
wchar_t *
|
||||||
|
|
|
@ -23,6 +23,10 @@ xstrjoin(char *dst, const char *const tokens[], char sep, size_t n);
|
||||||
char *
|
char *
|
||||||
strquote(const char *src);
|
strquote(const char *src);
|
||||||
|
|
||||||
|
// return the byte index (at most max_len) at which a UTF-8 string can be truncated without splitting a code point
|
||||||
|
size_t
|
||||||
|
utf8_truncation_index(const char *utf8, size_t max_len);
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
// convert a UTF-8 string to a wchar_t string
|
// convert a UTF-8 string to a wchar_t string
|
||||||
// returns the new allocated string, to be freed by the caller
|
// returns the new allocated string, to be freed by the caller
|
||||||
|
|
|
@ -126,6 +126,37 @@ static void test_xstrjoin_truncated_after_sep(void) {
|
||||||
assert(!strcmp("abc de ", s));
|
assert(!strcmp("abc de ", s));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check utf8_truncation_index() for every limit from 1 to 8 bytes on a
// 7-byte string mixing 1-byte and 2-byte code points.
static void test_utf8_truncate(void) {
    const char *s = "aÉbÔc";
    assert(strlen(s) == 7); // É and Ô are 2 bytes-wide

    // expected[n - 1] is the expected result for a limit of n bytes
    static const size_t expected[] = {
        1, // limit 1
        1, // limit 2: É is 2 bytes-wide
        3, // limit 3
        4, // limit 4
        4, // limit 5: Ô is 2 bytes-wide
        6, // limit 6
        7, // limit 7
        7, // limit 8: no more chars
    };
    const size_t limits = sizeof(expected) / sizeof(expected[0]);

    for (size_t max_len = 1; max_len <= limits; ++max_len) {
        size_t index = utf8_truncation_index(s, max_len);
        assert(index == expected[max_len - 1]);
    }
}
|
||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
test_xstrncpy_simple();
|
test_xstrncpy_simple();
|
||||||
test_xstrncpy_just_fit();
|
test_xstrncpy_just_fit();
|
||||||
|
@ -135,5 +166,6 @@ int main(void) {
|
||||||
test_xstrjoin_truncated_in_token();
|
test_xstrjoin_truncated_in_token();
|
||||||
test_xstrjoin_truncated_before_sep();
|
test_xstrjoin_truncated_before_sep();
|
||||||
test_xstrjoin_truncated_after_sep();
|
test_xstrjoin_truncated_after_sep();
|
||||||
|
test_utf8_truncate();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
23
server/src/main/java/com/genymobile/scrcpy/StringUtils.java
Normal file
23
server/src/main/java/com/genymobile/scrcpy/StringUtils.java
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
package com.genymobile.scrcpy;
|
||||||
|
|
||||||
|
/**
 * Static helpers for manipulating strings and their encoded forms.
 */
public final class StringUtils {
    private StringUtils() {
        // not instantiable
    }

    /**
     * Returns the number of leading bytes of {@code utf8} to keep so that the result is at most
     * {@code maxLength} bytes long and does not end in the middle of a UTF-8 code point.
     *
     * <p>If the array already fits in {@code maxLength} bytes, its full length is returned. If no
     * code point boundary exists at or before {@code maxLength} (malformed input starting with
     * continuation bytes), {@code 0} is returned.
     *
     * @param utf8 the UTF-8 encoded bytes
     * @param maxLength the maximum number of bytes to keep, must not be negative
     * @return the truncation index, between {@code 0} and {@code min(utf8.length, maxLength)}
     * @throws IllegalArgumentException if {@code maxLength} is negative
     */
    @SuppressWarnings("checkstyle:MagicNumber")
    public static int getUtf8TruncationIndex(byte[] utf8, int maxLength) {
        if (maxLength < 0) {
            throw new IllegalArgumentException("maxLength must not be negative: " + maxLength);
        }
        int len = utf8.length;
        if (len <= maxLength) {
            return len;
        }
        len = maxLength;
        // see UTF-8 encoding <https://en.wikipedia.org/wiki/UTF-8#Description>
        // A continuation byte has the form 10xxxxxx. While the byte at the cut position is a
        // continuation byte, cutting there would split a code point, so step back. The len > 0
        // guard keeps the index in bounds when the input starts with continuation bytes.
        while (len > 0 && (utf8[len] & 0xc0) == 0x80) {
            len--;
        }
        return len;
    }
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
package com.genymobile.scrcpy;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
|
||||||
|
public class StringUtilsTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SuppressWarnings("checkstyle:MagicNumber")
|
||||||
|
public void testUtf8Trucate() {
|
||||||
|
String s = "aÉbÔc";
|
||||||
|
byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
|
||||||
|
Assert.assertEquals(7, utf8.length);
|
||||||
|
|
||||||
|
int count;
|
||||||
|
|
||||||
|
count = StringUtils.getUtf8TruncationIndex(utf8, 1);
|
||||||
|
Assert.assertEquals(1, count);
|
||||||
|
|
||||||
|
count = StringUtils.getUtf8TruncationIndex(utf8, 2);
|
||||||
|
Assert.assertEquals(1, count); // É is 2 bytes-wide
|
||||||
|
|
||||||
|
count = StringUtils.getUtf8TruncationIndex(utf8, 3);
|
||||||
|
Assert.assertEquals(3, count);
|
||||||
|
|
||||||
|
count = StringUtils.getUtf8TruncationIndex(utf8, 4);
|
||||||
|
Assert.assertEquals(4, count);
|
||||||
|
|
||||||
|
count = StringUtils.getUtf8TruncationIndex(utf8, 5);
|
||||||
|
Assert.assertEquals(4, count); // Ô is 2 bytes-wide
|
||||||
|
|
||||||
|
count = StringUtils.getUtf8TruncationIndex(utf8, 6);
|
||||||
|
Assert.assertEquals(6, count);
|
||||||
|
|
||||||
|
count = StringUtils.getUtf8TruncationIndex(utf8, 7);
|
||||||
|
Assert.assertEquals(7, count);
|
||||||
|
|
||||||
|
count = StringUtils.getUtf8TruncationIndex(utf8, 8);
|
||||||
|
Assert.assertEquals(7, count); // no more chars
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue