stdlib: Improve Unicode support and consistency in string comparison functions.

SDL_strcasecmp (even when calling into a C runtime) does not work with
Unicode chars, and depending on the user's locale, might not work with
even basic ASCII strings.

This implements the function from scratch, using "case-folding,"
which is a more robust method that deals with various languages. It
involves a hashtable of a few hundred codepoints that are "uppercase" and
how to map them to lowercase equivalents (possibly increasing the size of
the string in the process). The vast majority of human languages (and
Unicode) do not have letters with different cases, but still, this static
table takes about 10 kilobytes on a 64-bit machine.

Even this will fail in one known case: the Turkish 'i' folds differently
if you're writing in Turkish vs other languages. Generally this is seen as
unfortunate collateral damage in cases where you can't specify the language
in use.

In addition to case-folding the codepoints, the new functions also know how
to decode the various formats to turn them into codepoints in the first
place, instead of blindly stepping by one byte (or one wchar_t) per
character.

Also included is casefolding.txt from the Unicode Consortium and a perl
script to generate the hashtable from that text file, so we can trivially
update this if new languages are added in the future.

A simple test using the new function:

```c
 #include <SDL3/SDL.h>

 int main(void)
 {
     const char *a = "α ε η";
     const char *b = "Α Ε Η";
     SDL_Log("    strcasecmp(\"%s\", \"%s\") == %d\n", a, b, strcasecmp(a, b));
     SDL_Log("SDL_strcasecmp(\"%s\", \"%s\") == %d\n", a, b, SDL_strcasecmp(a, b));
     return 0;
 }
```

Produces:

```
INFO:     strcasecmp("α ε η", "Α Ε Η") == 32
INFO: SDL_strcasecmp("α ε η", "Α Ε Η") == 0
```

glibc strcasecmp() fails to compare a Greek lowercase string to its uppercase
equivalent, even with a UTF-8 locale, but SDL_strcasecmp() works.

Other SDL_stdinc.h functions are changed to be more consistent, which is to
say they now ignore any C runtime and often dictate that only English-based
low-ASCII works with them.

Fixes Issue #9313.
This commit is contained in:
Ryan C. Gordon
2024-03-26 13:22:38 -04:00
parent 4659a84bd1
commit a5c892d2c3
17 changed files with 4971 additions and 210 deletions

View File

@@ -50,7 +50,6 @@
/* Useful headers */
#cmakedefine HAVE_ALLOCA_H 1
#cmakedefine HAVE_CTYPE_H 1
#cmakedefine HAVE_FLOAT_H 1
#cmakedefine HAVE_ICONV_H 1
#cmakedefine HAVE_INTTYPES_H 1
@@ -97,10 +96,6 @@
#cmakedefine HAVE_WCSSTR 1
#cmakedefine HAVE_WCSCMP 1
#cmakedefine HAVE_WCSNCMP 1
#cmakedefine HAVE_WCSCASECMP 1
#cmakedefine HAVE__WCSICMP 1
#cmakedefine HAVE_WCSNCASECMP 1
#cmakedefine HAVE__WCSNICMP 1
#cmakedefine HAVE_WCSTOL 1
#cmakedefine HAVE_STRLEN 1
#cmakedefine HAVE_STRNLEN 1
@@ -131,10 +126,6 @@
#cmakedefine HAVE_ATOF 1
#cmakedefine HAVE_STRCMP 1
#cmakedefine HAVE_STRNCMP 1
#cmakedefine HAVE__STRICMP 1
#cmakedefine HAVE_STRCASECMP 1
#cmakedefine HAVE__STRNICMP 1
#cmakedefine HAVE_STRNCASECMP 1
#cmakedefine HAVE_STRCASESTR 1
#cmakedefine HAVE_SSCANF 1
#cmakedefine HAVE_VSSCANF 1

View File

@@ -36,7 +36,6 @@
#define HAVE_GCC_ATOMICS 1
#define HAVE_ALLOCA_H 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_INTTYPES_H 1
#define HAVE_LIMITS_H 1
@@ -85,8 +84,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_STRCASESTR 1
#define HAVE_VSSCANF 1
#define HAVE_VSNPRINTF 1

View File

@@ -35,7 +35,6 @@
/* Useful headers */
#define HAVE_ALLOCA_H 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_ICONV_H 1
#define HAVE_INTTYPES_H 1
@@ -76,8 +75,6 @@
#define HAVE_WCSSTR 1
#define HAVE_WCSCMP 1
#define HAVE_WCSNCMP 1
#define HAVE_WCSCASECMP 1
#define HAVE_WCSNCASECMP 1
#define HAVE_STRLEN 1
#define HAVE_STRLCPY 1
#define HAVE_STRLCAT 1
@@ -94,8 +91,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_SSCANF 1
#define HAVE_VSSCANF 1
#define HAVE_VSNPRINTF 1

View File

@@ -28,7 +28,6 @@
#define HAVE_GCC_ATOMICS 1
#define HAVE_ALLOCA_H 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_INTTYPES_H 1
#define HAVE_LIMITS_H 1
@@ -77,8 +76,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_STRCASESTR 1
#define HAVE_VSSCANF 1
#define HAVE_VSNPRINTF 1

View File

@@ -32,7 +32,6 @@
/* Useful headers */
#define HAVE_ALLOCA_H 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_INTTYPES_H 1
#define HAVE_LIBUNWIND_H 1
@@ -81,8 +80,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE_STRCASECMP 1
#define HAVE_STRNCASECMP 1
#define HAVE_STRCASESTR 1
#define HAVE_VSSCANF 1
#define HAVE_VSNPRINTF 1

View File

@@ -118,7 +118,6 @@ typedef unsigned int uintptr_t;
#if HAVE_LIBC
/* Useful headers */
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_LIMITS_H 1
#define HAVE_MATH_H 1
@@ -159,10 +158,6 @@ typedef unsigned int uintptr_t;
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE__STRICMP 1
#define HAVE__STRNICMP 1
#define HAVE__WCSICMP 1
#define HAVE__WCSNICMP 1
#define HAVE__WCSDUP 1
#define HAVE_SSCANF 1
#define HAVE_VSSCANF 1

View File

@@ -57,7 +57,6 @@
#endif
/* Useful headers */
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_LIMITS_H 1
#define HAVE_MATH_H 1
@@ -100,10 +99,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE__STRICMP 1
#define HAVE__STRNICMP 1
#define HAVE__WCSICMP 1
#define HAVE__WCSNICMP 1
#define HAVE__WCSDUP 1
#define HAVE_ACOS 1
#define HAVE_ASIN 1

View File

@@ -57,7 +57,6 @@
#define HAVE_TPCSHRD_H 1
#define HAVE_LIBC 1
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_LIMITS_H 1
#define HAVE_MATH_H 1
@@ -96,8 +95,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE__STRICMP 1
#define HAVE__STRNICMP 1
#define HAVE_VSNPRINTF 1
/* TODO, WinRT: consider using ??_s versions of the following */
/* #undef HAVE__STRLWR */

View File

@@ -57,7 +57,6 @@
#endif
/* Useful headers */
#define HAVE_CTYPE_H 1
#define HAVE_FLOAT_H 1
#define HAVE_LIMITS_H 1
#define HAVE_MATH_H 1
@@ -100,10 +99,6 @@
#define HAVE_ATOF 1
#define HAVE_STRCMP 1
#define HAVE_STRNCMP 1
#define HAVE__STRICMP 1
#define HAVE__STRNICMP 1
#define HAVE__WCSICMP 1
#define HAVE__WCSNICMP 1
#define HAVE__WCSDUP 1
#define HAVE_ACOS 1
#define HAVE_ASIN 1