Merge branch 'ci-utf8'

Fix UTF-8 build under MSW and improve its coverage in the CI jobs. See #23313. Closes #23328.
2023-03-30 19:44:37 +02:00 · 2023-03-30 19:44:37 +02:00 · dee816bb5b
commit dee816bb5b
parent 3ac39970b1 f1f612ea1a
28 changed files with 887 additions and 544 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -108,10 +108,11 @@ jobs:
            skip_samples: true
            use_asan: true
            use_xvfb: true
-          - name: Ubuntu 22.04 wxGTK C++20
+          - name: Ubuntu 22.04 wxGTK UTF-8 C++20
            runner: ubuntu-22.04
-            configure_flags: --with-cxx=20
-            skip_samples: true
+            configure_flags: --with-cxx=20 --enable-utf8 --enable-utf8only
+            use_asan: true
+            use_xvfb: true
          - name: Ubuntu 18.04 wxX11
            runner: ubuntu-latest
            container: ubuntu:18.04
--- a/.github/workflows/ci_msw.yml
+++ b/.github/workflows/ci_msw.yml
@ -67,6 +67,7 @@ jobs:
          - configuration: 'DLL Debug'
            platform: 'x64'
            vsversion: 2022
+            use_utf8: true
          - configuration: 'Debug'
            platform: 'Win32'
            vsversion: 2019
@ -81,14 +82,18 @@ jobs:
        with:
          submodules: 'recursive'

-      - name: Configure to use STL
-        if: matrix.use_stl
+      - name: Configure build options
        working-directory: include/wx/msw
        run: |
-            $txt = Get-Content setup.h
-            Write-Output $txt |
-            %{$_ -replace "define wxUSE_STL 0", "define wxUSE_STL 1"} |
-            Set-Content setup.h
+            $use_stl = "${{ matrix.use_stl }}" ? 1 : 0
+            $use_utf8 = "${{ matrix.use_utf8 }}" ? 1 : 0
+            if ( $use_stl -or $use_utf8 ) {
+              $txt = Get-Content setup.h
+              Write-Output $txt |
+              %{$_ -replace "define wxUSE_STL 0", "define wxUSE_STL $use_stl"} |
+              %{$_ -replace "define wxUSE_UNICODE_UTF8 0", "define wxUSE_UNICODE_UTF8 $use_utf8"} |
+              Set-Content setup.h
+            }

      - name: Add MSBuild to PATH
        uses: microsoft/setup-msbuild@v1.1.3
--- a/appveyor.yml
+++ b/appveyor.yml
@ -34,6 +34,11 @@ environment:
    ARCH: x64
    wxUSE_STL: 1
    APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2022
+  - TOOLSET: msbuild
+    CONFIGURATION: Debug
+    ARCH: x64
+    wxUSE_UNICODE_UTF8: 1
+    APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2022
  - TOOLSET: nmake
    VS: '14.0'
    BUILD: debug
@ -67,11 +72,13 @@ before_build:
 - ps: |
    $env:PATH = $env:PATH -replace "C:\\Program Files\\Git\\usr\\bin",""
    if (-not (Test-Path env:wxUSE_STL)) { $env:wxUSE_STL = '0' }
+    if (-not (Test-Path env:wxUSE_UNICODE_UTF8)) { $env:wxUSE_UNICODE_UTF8 = '0' }
    if (-not (Test-Path env:wxUSE_WEBVIEW_EDGE)) { $env:wxUSE_WEBVIEW_EDGE = '0' }
    if (($env:TOOLSET -ne "msys2") -and ($env:TOOLSET -ne "cygwin")) {
      $txt = gc include\wx\msw\setup.h
      Write-Output $txt |
      %{$_ -replace "define wxUSE_STL 0", "define wxUSE_STL $env:wxUSE_STL"} |
+      %{$_ -replace "define wxUSE_UNICODE_UTF8 0", "define wxUSE_UNICODE_UTF8 $env:wxUSE_UNICODE_UTF8"} |
      %{$_ -replace "define wxUSE_WEBVIEW_EDGE 0", "define wxUSE_WEBVIEW_EDGE $env:wxUSE_WEBVIEW_EDGE"} |
      sc include\wx\msw\setup.h
    }
--- a/build/cmake/options.cmake
+++ b/build/cmake/options.cmake
@ -86,10 +86,8 @@ wx_option(wxUSE_STL "use standard C++ classes for everything" OFF)
 set(wxTHIRD_PARTY_LIBRARIES ${wxTHIRD_PARTY_LIBRARIES} wxUSE_STL "use C++ STL classes")
 wx_dependent_option(wxUSE_STD_CONTAINERS "use standard C++ container classes" ON "wxUSE_STL" OFF)

-if(NOT WIN32)
-    wx_option(wxUSE_UNICODE_UTF8 "use UTF-8 representation for strings (Unix only)" OFF)
-    wx_dependent_option(wxUSE_UTF8_LOCALE_ONLY "only support UTF-8 locales in UTF-8 build (Unix only)" ON "wxUSE_UNICODE_UTF8" OFF)
-endif()
+wx_option(wxUSE_UNICODE_UTF8 "use UTF-8 representation for strings" OFF)
+wx_dependent_option(wxUSE_UTF8_LOCALE_ONLY "only support UTF-8 locales in UTF-8 build" ON "wxUSE_UNICODE_UTF8" OFF)

 if(NOT WIN32)
    wx_option(wxUSE_VISIBILITY "use of ELF symbols visibility")
--- a/build/cmake/setup.h.in
+++ b/build/cmake/setup.h.in
@ -119,6 +119,11 @@
 #cmakedefine01 wxUSE_REPRODUCIBLE_BUILD


+#cmakedefine01 wxUSE_UNICODE_UTF8
+
+#cmakedefine01 wxUSE_UTF8_LOCALE_ONLY
+
+

 #cmakedefine01 wxUSE_ON_FATAL_EXCEPTION

@ -591,9 +596,6 @@
 #cmakedefine01 wxUSE_SELECT_DISPATCHER
 #cmakedefine01 wxUSE_EPOLL_DISPATCHER

-#cmakedefine01 wxUSE_UNICODE_UTF8
-#cmakedefine01 wxUSE_UTF8_LOCALE_ONLY
-
 /*
   Use GStreamer for Unix.

--- a/build/tools/appveyor-test.bat
+++ b/build/tools/appveyor-test.bat
@ -8,9 +8,12 @@ goto %TOOLSET%

 :msbuild
 PATH=C:\projects\wxwidgets\lib\vc_x64_dll;%PATH%
-.\vc_x64_mswudll\test.exe
+if "%CONFIGURATION%"=="DLL Release" set suffix=dll
+if "%CONFIGURATION%"=="DLL Debug" set suffix=ddll
+if "%CONFIGURATION%"=="Debug" set suffix=d
+.\vc_x64_mswu%suffix%\test.exe
 if %errorlevel% NEQ 0 goto :error
-.\vc_x64_mswudll\test_gui.exe
+.\vc_x64_mswu%suffix%\test_gui.exe
 goto :eof

 :nmake
--- a/4
+++ b/4
@ -2086,8 +2086,8 @@ Optional Features:
  --disable-std_iostreams  disable use of standard C++ stream classes
 --enable-std_string_conv_in_wxstring     provide implicit conversion to std::string in wxString
 --disable-unsafe_conv_in_wxstring        disable unsafe implicit conversions in wxString
-  --enable-utf8           use UTF-8 representation for strings (Unix only)
-  --enable-utf8only       only support UTF-8 locales in UTF-8 build (Unix only)
+  --enable-utf8           use UTF-8 representation for strings
+  --enable-utf8only       only support UTF-8 locales in UTF-8 build
  --enable-extended_rtti  use extended RTTI (XTI)
  --disable-optimise      compile without optimisations
  --enable-profile        create code with profiling information
--- a/configure.in
+++ b/configure.in
@ -649,8 +649,8 @@ WX_ARG_ENABLE(std_containers,[  --enable-std_containers use standard C++ contain
 WX_ARG_DISABLE(std_iostreams,[  --disable-std_iostreams  disable use of standard C++ stream classes], wxUSE_STD_IOSTREAM)
 WX_ARG_ENABLE(std_string_conv_in_wxstring, [ --enable-std_string_conv_in_wxstring     provide implicit conversion to std::string in wxString], wxUSE_STD_STRING_CONV_IN_WXSTRING)
 WX_ARG_DISABLE(unsafe_conv_in_wxstring,     [ --disable-unsafe_conv_in_wxstring        disable unsafe implicit conversions in wxString], wxUSE_UNSAFE_WXSTRING_CONV)
-WX_ARG_ENABLE_PARAM(utf8,    [  --enable-utf8           use UTF-8 representation for strings (Unix only)], wxUSE_UNICODE_UTF8)
-WX_ARG_ENABLE(utf8only,      [  --enable-utf8only       only support UTF-8 locales in UTF-8 build (Unix only)], wxUSE_UNICODE_UTF8_LOCALE)
+WX_ARG_ENABLE_PARAM(utf8,    [  --enable-utf8           use UTF-8 representation for strings], wxUSE_UNICODE_UTF8)
+WX_ARG_ENABLE(utf8only,      [  --enable-utf8only       only support UTF-8 locales in UTF-8 build], wxUSE_UNICODE_UTF8_LOCALE)
 WX_ARG_ENABLE(extended_rtti, [  --enable-extended_rtti  use extended RTTI (XTI)], wxUSE_EXTENDED_RTTI)

 WX_ARG_DISABLE(optimise,   [  --disable-optimise      compile without optimisations], wxUSE_OPTIMISE)
--- a/docs/doxygen/overviews/string.h
+++ b/docs/doxygen/overviews/string.h
@ -296,6 +296,18 @@ for (i = s.begin(); i != s.end(); ++i)
 }
@endcode

+or, even simpler, using a range-based for loop:
+@code
+wxString s = "hello";
+for ( auto c : s )
+{
+    // do something with "c"
+}
+@endcode
+
+@note wxString iterators have unusual proxy-like semantics and can be used to
+    modify the string even when @e not using references, i.e. with just @c
+    auto, as in the example above.


@section overview_string_related String Related Functions and Classes
--- a/include/wx/android/setup.h
+++ b/include/wx/android/setup.h
@ -77,6 +77,54 @@
 // Recommended setting: 0
 #define wxUSE_REPRODUCIBLE_BUILD 0

+// ----------------------------------------------------------------------------
+// wxString encoding settings
+// ----------------------------------------------------------------------------
+
+// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
+// UTF-16 (MSW).
+//
+// This option can be set to 1 if you want to avoid the overhead of converting
+// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
+// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
+// much more efficient and constant time, as they don't perform any conversion
+// any longer, which is especially interesting in wxGTK where these functions
+// are used every time a GTK function is called. But this is compensated by
+// making all the non-UTF-8 functions less efficient, notably requiring a
+// conversion when passing any string to Win32 API.
+//
+// Moreover, accessing strings by character index becomes, in general, an O(N)
+// iteration, where N is the index, so only enable this option if you don't use
+// index access for arbitrary characters (unless it is done inside a loop
+// consecutively for all characters as this special access pattern is optimized
+// by caching the last accessed index -- but using iterators, or a range for
+// loop, is still better even in this case), as otherwise you may observe
+// significant slowdown in your program performance.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 for optimization purposes and if
+// you're sure that you're not using loops using indices to iterate over
+// strings in your code.
+#define wxUSE_UNICODE_UTF8 0
+
+// If set to 1, assume that all narrow strings use UTF-8.
+//
+// By default, wxWidgets assumes that all "char*" strings use the encoding of
+// the current locale, which is commonly, but not always, UTF-8 under Unix but
+// rarely UTF-8 under MSW. This option tells the library that all strings
+// always use UTF-8, avoiding the need to perform any conversions between them
+// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
+//
+// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
+// it must not be enabled without the other option.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 if your program is always run in
+// a UTF-8 locale.
+#define wxUSE_UTF8_LOCALE_ONLY 0
+
 // ----------------------------------------------------------------------------
 // debugging settings
 // ----------------------------------------------------------------------------
--- a/include/wx/catch_cppunit.h
+++ b/include/wx/catch_cppunit.h
@ -89,16 +89,16 @@ namespace Catch
        static std::string convert(const wxString& wxs)
        {
            std::string s;
-            s.reserve(wxs.length());
-            for ( wxString::const_iterator i = wxs.begin();
-                  i != wxs.end();
-                  ++i )
+            s.reserve(wxs.length() + 2);
+            s += '"';
+            for ( auto c : wxs )
            {
-                if ( !iswprint(*i) )
-                    s += wxString::Format(wxASCII_STR("\\u%04X"), *i).ToAscii();
+                if ( c >= 128 || !iswprint(c) )
+                    s += wxString::Format(wxASCII_STR("\\u%04X"), c).ToAscii();
                else
-                    s += *i;
+                    s += c;
            }
+            s += '"';

            return s;
        }
--- a/include/wx/gtk/setup.h
+++ b/include/wx/gtk/setup.h
@ -78,6 +78,54 @@
 // Recommended setting: 0
 #define wxUSE_REPRODUCIBLE_BUILD 0

+// ----------------------------------------------------------------------------
+// wxString encoding settings
+// ----------------------------------------------------------------------------
+
+// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
+// UTF-16 (MSW).
+//
+// This option can be set to 1 if you want to avoid the overhead of converting
+// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
+// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
+// much more efficient and constant time, as they don't perform any conversion
+// any longer, which is especially interesting in wxGTK where these functions
+// are used every time a GTK function is called. But this is compensated by
+// making all the non-UTF-8 functions less efficient, notably requiring a
+// conversion when passing any string to Win32 API.
+//
+// Moreover, accessing strings by character index becomes, in general, an O(N)
+// iteration, where N is the index, so only enable this option if you don't use
+// index access for arbitrary characters (unless it is done inside a loop
+// consecutively for all characters as this special access pattern is optimized
+// by caching the last accessed index -- but using iterators, or a range for
+// loop, is still better even in this case), as otherwise you may observe
+// significant slowdown in your program performance.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 for optimization purposes and if
+// you're sure that you're not using loops using indices to iterate over
+// strings in your code.
+#define wxUSE_UNICODE_UTF8 0
+
+// If set to 1, assume that all narrow strings use UTF-8.
+//
+// By default, wxWidgets assumes that all "char*" strings use the encoding of
+// the current locale, which is commonly, but not always, UTF-8 under Unix but
+// rarely UTF-8 under MSW. This option tells the library that all strings
+// always use UTF-8, avoiding the need to perform any conversions between them
+// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
+//
+// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
+// it must not be enabled without the other option.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 if your program is always run in
+// a UTF-8 locale.
+#define wxUSE_UTF8_LOCALE_ONLY 0
+
 // ----------------------------------------------------------------------------
 // debugging settings
 // ----------------------------------------------------------------------------
--- a/include/wx/msw/setup.h
+++ b/include/wx/msw/setup.h
@ -78,6 +78,54 @@
 // Recommended setting: 0
 #define wxUSE_REPRODUCIBLE_BUILD 0

+// ----------------------------------------------------------------------------
+// wxString encoding settings
+// ----------------------------------------------------------------------------
+
+// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
+// UTF-16 (MSW).
+//
+// This option can be set to 1 if you want to avoid the overhead of converting
+// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
+// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
+// much more efficient and constant time, as they don't perform any conversion
+// any longer, which is especially interesting in wxGTK where these functions
+// are used every time a GTK function is called. But this is compensated by
+// making all the non-UTF-8 functions less efficient, notably requiring a
+// conversion when passing any string to Win32 API.
+//
+// Moreover, accessing strings by character index becomes, in general, an O(N)
+// iteration, where N is the index, so only enable this option if you don't use
+// index access for arbitrary characters (unless it is done inside a loop
+// consecutively for all characters as this special access pattern is optimized
+// by caching the last accessed index -- but using iterators, or a range for
+// loop, is still better even in this case), as otherwise you may observe
+// significant slowdown in your program performance.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 for optimization purposes and if
+// you're sure that you're not using loops using indices to iterate over
+// strings in your code.
+#define wxUSE_UNICODE_UTF8 0
+
+// If set to 1, assume that all narrow strings use UTF-8.
+//
+// By default, wxWidgets assumes that all "char*" strings use the encoding of
+// the current locale, which is commonly, but not always, UTF-8 under Unix but
+// rarely UTF-8 under MSW. This option tells the library that all strings
+// always use UTF-8, avoiding the need to perform any conversions between them
+// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
+//
+// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
+// it must not be enabled without the other option.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 if your program is always run in
+// a UTF-8 locale.
+#define wxUSE_UTF8_LOCALE_ONLY 0
+
 // ----------------------------------------------------------------------------
 // debugging settings
 // ----------------------------------------------------------------------------
--- a/include/wx/osx/setup.h
+++ b/include/wx/osx/setup.h
@ -84,6 +84,54 @@
 // Recommended setting: 0
 #define wxUSE_REPRODUCIBLE_BUILD 0

+// ----------------------------------------------------------------------------
+// wxString encoding settings
+// ----------------------------------------------------------------------------
+
+// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
+// UTF-16 (MSW).
+//
+// This option can be set to 1 if you want to avoid the overhead of converting
+// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
+// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
+// much more efficient and constant time, as they don't perform any conversion
+// any longer, which is especially interesting in wxGTK where these functions
+// are used every time a GTK function is called. But this is compensated by
+// making all the non-UTF-8 functions less efficient, notably requiring a
+// conversion when passing any string to Win32 API.
+//
+// Moreover, accessing strings by character index becomes, in general, an O(N)
+// iteration, where N is the index, so only enable this option if you don't use
+// index access for arbitrary characters (unless it is done inside a loop
+// consecutively for all characters as this special access pattern is optimized
+// by caching the last accessed index -- but using iterators, or a range for
+// loop, is still better even in this case), as otherwise you may observe
+// significant slowdown in your program performance.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 for optimization purposes and if
+// you're sure that you're not using loops using indices to iterate over
+// strings in your code.
+#define wxUSE_UNICODE_UTF8 0
+
+// If set to 1, assume that all narrow strings use UTF-8.
+//
+// By default, wxWidgets assumes that all "char*" strings use the encoding of
+// the current locale, which is commonly, but not always, UTF-8 under Unix but
+// rarely UTF-8 under MSW. This option tells the library that all strings
+// always use UTF-8, avoiding the need to perform any conversions between them
+// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
+//
+// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
+// it must not be enabled without the other option.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 if your program is always run in
+// a UTF-8 locale.
+#define wxUSE_UTF8_LOCALE_ONLY 0
+
 // ----------------------------------------------------------------------------
 // debugging settings
 // ----------------------------------------------------------------------------
--- a/include/wx/setup_inc.h
+++ b/include/wx/setup_inc.h
@ -74,6 +74,54 @@
 // Recommended setting: 0
 #define wxUSE_REPRODUCIBLE_BUILD 0

+// ----------------------------------------------------------------------------
+// wxString encoding settings
+// ----------------------------------------------------------------------------
+
+// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
+// UTF-16 (MSW).
+//
+// This option can be set to 1 if you want to avoid the overhead of converting
+// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
+// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
+// much more efficient and constant time, as they don't perform any conversion
+// any longer, which is especially interesting in wxGTK where these functions
+// are used every time a GTK function is called. But this is compensated by
+// making all the non-UTF-8 functions less efficient, notably requiring a
+// conversion when passing any string to Win32 API.
+//
+// Moreover, accessing strings by character index becomes, in general, an O(N)
+// iteration, where N is the index, so only enable this option if you don't use
+// index access for arbitrary characters (unless it is done inside a loop
+// consecutively for all characters as this special access pattern is optimized
+// by caching the last accessed index -- but using iterators, or a range for
+// loop, is still better even in this case), as otherwise you may observe
+// significant slowdown in your program performance.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 for optimization purposes and if
+// you're sure that you're not using loops using indices to iterate over
+// strings in your code.
+#define wxUSE_UNICODE_UTF8 0
+
+// If set to 1, assume that all narrow strings use UTF-8.
+//
+// By default, wxWidgets assumes that all "char*" strings use the encoding of
+// the current locale, which is commonly, but not always, UTF-8 under Unix but
+// rarely UTF-8 under MSW. This option tells the library that all strings
+// always use UTF-8, avoiding the need to perform any conversions between them
+// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
+//
+// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
+// it must not be enabled without the other option.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 if your program is always run in
+// a UTF-8 locale.
+#define wxUSE_UTF8_LOCALE_ONLY 0
+
 // ----------------------------------------------------------------------------
 // debugging settings
 // ----------------------------------------------------------------------------
--- a/include/wx/string.h
+++ b/include/wx/string.h
@ -274,6 +274,7 @@ public:
    ~wxStringIteratorNode()
        { clear(); }

+    inline void clear();
    inline void set(const wxString *str, wxStringImpl::const_iterator *citer)
        { clear(); DoSet(str, citer, nullptr); }
    inline void set(const wxString *str, wxStringImpl::iterator *iter)
@ -285,7 +286,6 @@ public:
    wxStringIteratorNode *m_prev{nullptr}, *m_next{nullptr};

 private:
-    inline void clear();
    inline void DoSet(const wxString *str,
                      wxStringImpl::const_iterator *citer,
                      wxStringImpl::iterator *iter);
@ -447,8 +447,8 @@ private:
      unsigned lastUsed;
  };

-  static wxTHREAD_SPECIFIC_DECL Cache ms_cache;
-  static Cache& GetCache() { return ms_cache; }
+  // Implemented out of line because per-thread variable can't be DLL exported.
+  static Cache& GetCache();

  static Cache::Element *GetCacheBegin() { return GetCache().cached; }
  static Cache::Element *GetCacheEnd() { return GetCacheBegin() + Cache::SIZE; }
@ -987,7 +987,17 @@ public:
      // This is logically equivalent to strlen(str.mb_str()) but avoids
      // actually converting the string to multibyte and just computes the
      // length that it would have after conversion.
+
+      // Note that in UTF-8 build we need to use the actual wide character
+      // buffer length and not the string length, as it may be different when
+      // using surrogates, but in wchar_t build they're the same by definition
+      // and we can avoid creating an extra buffer.
+#if wxUSE_UNICODE_UTF8
+      const wxScopedWCharBuffer wbuf(str.wc_str());
+      const size_t ofs = wxConvLibc.FromWChar(nullptr, 0, wbuf.data(), wbuf.length());
+#else // wxUSE_UNICODE_WCHAR
      const size_t ofs = wxConvLibc.FromWChar(nullptr, 0, str.wc_str(), str.length());
+#endif
      return ofs == wxCONV_FAILED ? 0 : static_cast<ptrdiff_t>(ofs);
  }

@ -1203,16 +1213,24 @@ public:
    { assign(std::move(str), nLength); }


-#if wxUSE_STRING_POS_CACHE
+#if wxUSE_UNICODE_UTF8
  ~wxString()
  {
+#if wxUSE_STRING_POS_CACHE
      // we need to invalidate our cache entry as another string could be
      // recreated at the same address (unlikely, but still possible, with the
      // heap-allocated strings but perfectly common with stack-allocated ones)
      InvalidateCache();
-  }
 #endif // wxUSE_STRING_POS_CACHE

+      // We also need to clear any still existing iterators pointing into this
+      // string, as otherwise clearing them later, when they're destroyed,
+      // would try to use a dangling string pointer stored in them.
+      while ( m_iterators.ptr )
+          m_iterators.ptr->clear();
+  }
+#endif // wxUSE_UNICODE_UTF8
+
  #if wxUSE_UNICODE_WCHAR
    wxString(const std::wstring& str) : m_impl(str) {}
    wxString(std::wstring&& str) noexcept : m_impl(std::move(str)) {}
@ -1350,7 +1368,7 @@ public:
  size_type capacity() const { return m_impl.capacity(); }
  void reserve(size_t sz) { m_impl.reserve(sz); }

-  void shrink_to_fit() { Shrink(); }
+  void shrink_to_fit() { m_impl.shrink_to_fit(); }

  void resize(size_t nSize, wxUniChar ch = wxT('\0'))
  {
@ -1685,7 +1703,7 @@ public:
    // conversions with (possible) format conversions: have to return a
    // buffer with temporary data
    //
-    // the functions defined (in either Unicode or ANSI) mode are mb_str() to
+    // All builds of the library define the same functions: mb_str() to
    // return an ANSI (multibyte) string, wc_str() to return a wide string and
    // fn_str() to return a string which should be used with the OS APIs
    // accepting the file names. The return value is always the same, but the
@ -2223,8 +2241,7 @@ public:
    // only works if the data of this string is not shared
  bool Alloc(size_t nLen) { reserve(nLen); return capacity() >= nLen; }
    // minimize the string's memory
-    // only works if the data of this string is not shared
-  bool Shrink();
+  bool Shrink() { shrink_to_fit(); return true; }

  // wxWidgets version 1 compatibility functions

@ -3540,6 +3557,8 @@ private:

  friend class WXDLLIMPEXP_FWD_BASE wxStringIteratorNode;
  friend class WXDLLIMPEXP_FWD_BASE wxUniCharRef;
+  friend class wxUTF8StringBuffer;
+  friend class wxUTF8StringBufferLength;
 #endif // wxUSE_UNICODE_UTF8

  friend class WXDLLIMPEXP_FWD_BASE wxCStrData;
@ -3645,7 +3664,7 @@ struct wxStringAsBufHelper<wchar_t>
    {
        wxScopedWCharBuffer wbuf(s.wc_str());
        if ( len )
-            *len = wxWcslen(wbuf);
+            *len = wbuf.length();
        return wbuf;
    }
 };
@ -3785,8 +3804,89 @@ typedef wxStringInternalBufferLength          wxStringBufferLength;
 #endif // wxUSE_UNICODE_UTF8/wxUSE_UNICODE_WCHAR

 #if wxUSE_UNICODE_UTF8
-typedef wxStringInternalBuffer                wxUTF8StringBuffer;
-typedef wxStringInternalBufferLength          wxUTF8StringBufferLength;
+
+// Special implementation of buffer classes for UTF-8 build which exploit the
+// fact that we can write directly to std::string used by wxString, avoiding an
+// extra copy which could be significant for long strings.
+
+namespace wxPrivate
+{
+
+// Common base of the UTF-8 buffer classes: it resizes the std::string passed
+// to it, gives write access to its storage as "char*" and, on destruction,
+// discards the contents if they are not valid UTF-8.
+class wxUTF8StringBufferBase
+{
+public:
+    using CharType = char;
+
+    ~wxUTF8StringBufferBase()
+    {
+        // This class works only with UTF-8 strings, so we need to check if the
+        // string has valid contents. Note that it isn't an error if it
+        // doesn't, as it can happen that the function we use this buffer with
+        // (e.g. vsnprintf()) writes something invalid into the provided buffer
+        // in some cases.
+        //
+        // NOTE(review): only the pointer is passed here, so presumably the
+        // contents are validated only up to the first NUL -- confirm that this
+        // is sufficient for strings with embedded NULs.
+        if ( !wxStringOperations::IsValidUtf8String(m_str.c_str()) )
+            m_str.clear();
+    }
+
+    // Return the pointer that the caller can write to.
+    //
+    // Use non-const element access instead of casting away the constness of
+    // c_str(): the array returned by c_str() must not be modified by the
+    // program, while writing to the elements of the string itself is allowed.
+    operator char*() const { return &m_str[0]; }
+
+protected:
+    // Bind to the given string and make it "size" bytes long so that up to
+    // that many bytes can be written into it.
+    explicit wxUTF8StringBufferBase(std::string& str, size_t size)
+        : m_str{str}
+    {
+        m_str.resize(size);
+    }
+
+    // The string being written to: it is not owned by this object.
+    std::string& m_str;
+
+    wxDECLARE_NO_COPY_CLASS(wxUTF8StringBufferBase);
+};
+
+} // wxPrivate
+
+// Writable char buffer for a wxString in UTF-8 build: the length of the string
+// is taken from the position of the first NUL in the buffer when this object
+// is destroyed.
+class wxUTF8StringBuffer : public wxPrivate::wxUTF8StringBufferBase
+{
+public:
+    // Prepare the internal storage of "str" for writing "size" bytes to it.
+    wxUTF8StringBuffer(wxString& str, size_t size)
+        : wxPrivate::wxUTF8StringBufferBase{str.m_impl, size}
+    {
+    }
+
+    ~wxUTF8StringBuffer()
+    {
+        // Only the part up to the first NUL counts, as this class is used
+        // with NUL-terminated strings only, so cut the storage down to
+        // exactly this many bytes.
+        const size_t lenWritten = strlen(m_str.c_str());
+        m_str.resize(lenWritten);
+    }
+
+private:
+    wxDECLARE_NO_COPY_CLASS(wxUTF8StringBuffer);
+};
+
+// Writable char buffer for a wxString in UTF-8 build for which the final
+// length is given explicitly by calling SetLength(): the string is resized to
+// this length when this object is destroyed.
+class wxUTF8StringBufferLength : public wxPrivate::wxUTF8StringBufferBase
+{
+public:
+    // Prepare the internal storage of "str" for writing "size" bytes to it.
+    wxUTF8StringBufferLength(wxString& str, size_t size)
+        : wxPrivate::wxUTF8StringBufferBase{str.m_impl, size}
+    {
+    }
+
+    ~wxUTF8StringBufferLength()
+    {
+        wxASSERT_MSG( m_lenSet, "forgot to call SetLength()" );
+
+        // If SetLength() was never called, m_len still has its initial value
+        // of 0 here.
+        m_str.resize(m_len);
+    }
+
+    // Remember the length the string should have after the buffer is
+    // destroyed.
+    void SetLength(size_t length)
+    {
+        m_lenSet = true;
+        m_len = length;
+    }
+
+protected:
+    size_t m_len = 0;
+    bool m_lenSet = false;
+
+    wxDECLARE_NO_COPY_CLASS(wxUTF8StringBufferLength);
+};
+
 #else // wxUSE_UNICODE_WCHAR

 // Note about inlined dtors in the classes below: this is done not for
--- a/include/wx/strvararg.h
+++ b/include/wx/strvararg.h
@ -824,11 +824,19 @@ struct wxArgNormalizerUtf8<const std::string&>
 #ifdef __cpp_lib_string_view
 template<>
 struct wxArgNormalizerUtf8<const std::string_view&>
-    : public wxArgNormalizerUtf8<const char*>
 {
    wxArgNormalizerUtf8(const std::string_view& v,
                        const wxFormatString *fmt, unsigned index)
-        : wxArgNormalizerUtf8<const char*>(v.data(), fmt, index) {}
+        : m_str{v}
+    {
+        wxASSERT_ARG_TYPE( fmt, index, wxFormatString::Arg_String );
+    }
+
+    const char* get() const { return m_str.c_str(); }
+
+    // We need to store this string to ensure that we use a NUL-terminated
+    // buffer, i.e. we can't use string_view data directly.
+    const std::string m_str;
 };
 #endif // __cpp_lib_string_view

--- a/include/wx/univ/setup.h
+++ b/include/wx/univ/setup.h
@ -77,6 +77,54 @@
 // Recommended setting: 0
 #define wxUSE_REPRODUCIBLE_BUILD 0

+// ----------------------------------------------------------------------------
+// wxString encoding settings
+// ----------------------------------------------------------------------------
+
+// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
+// UTF-16 (MSW).
+//
+// This option can be set to 1 if you want to avoid the overhead of converting
+// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
+// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
+// much more efficient and constant time, as they don't perform any conversion
+// any longer, which is especially interesting in wxGTK where these functions
+// are used every time a GTK function is called. But this is compensated by
+// making all the non-UTF-8 functions less efficient, notably requiring a
+// conversion when passing any string to Win32 API.
+//
+// Moreover, accessing strings by character index becomes, in general, an O(N)
+// iteration, where N is the index, so only enable this option if you don't use
+// index access for arbitrary characters (unless it is done inside a loop
+// consecutively for all characters as this special access pattern is optimized
+// by caching the last accessed index -- but using iterators, or a range for
+// loop, is still better even in this case), as otherwise you may observe
+// significant slowdown in your program performance.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 for optimization purposes and if
+// you're sure that you're not using loops using indices to iterate over
+// strings in your code.
+#define wxUSE_UNICODE_UTF8 0
+
+// If set to 1, assume that all narrow strings use UTF-8.
+//
+// By default, wxWidgets assumes that all "char*" strings use the encoding of
+// the current locale, which is commonly, but not always, UTF-8 under Unix but
+// rarely UTF-8 under MSW. This option tells the library that all strings
+// always use UTF-8, avoiding the need to perform any conversions between them
+// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
+//
+// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
+// it must not be enabled without the other option.
+//
+// Default is 0
+//
+// Recommended setting: 0 but can be set to 1 if your program is always run in
+// a UTF-8 locale.
+#define wxUSE_UTF8_LOCALE_ONLY 0
+
 // ----------------------------------------------------------------------------
 // debugging settings
 // ----------------------------------------------------------------------------
--- a/interface/wx/string.h
+++ b/interface/wx/string.h
@ -1422,7 +1422,7 @@ public:
        wxStringBuffer and wxStringBufferLength classes may be very useful when working
        with some external API which requires the caller to provide a writable buffer.

-        See also the reserve() and resize() STL-like functions.
+        See also the reserve(), resize() and shrink_to_fit() STL-like functions.
    */
    ///@{

@ -1468,6 +1468,9 @@ public:
    /**
        Minimizes the string's memory.

+        Please note that this method does the same thing as the standard
+        shrink_to_fit() one and shouldn't be used in new code.
+
        This can be useful after a call to Alloc() if too much memory were
        preallocated.

--- a/setup.h.in
+++ b/setup.h.in
@ -122,6 +122,11 @@
 #define wxUSE_REPRODUCIBLE_BUILD 0


+#define wxUSE_UNICODE_UTF8 0
+
+#define wxUSE_UTF8_LOCALE_ONLY 0
+
+

 #define wxUSE_ON_FATAL_EXCEPTION 0

@ -594,9 +599,6 @@
 #define wxUSE_SELECT_DISPATCHER 0
 #define wxUSE_EPOLL_DISPATCHER 0

-#define wxUSE_UNICODE_UTF8 0
-#define wxUSE_UTF8_LOCALE_ONLY 0
-
 /*
   Use GStreamer for Unix.

--- a/setup.h_vms
+++ b/setup.h_vms
@ -169,6 +169,10 @@ typedef pid_t GPid;

 #define wxUSE_REPRODUCIBLE_BUILD 1

+#define wxUSE_UNICODE_UTF8 0
+
+#define wxUSE_UTF8_LOCALE_ONLY 0
+
 #define wxUSE_EXCEPTIONS 1

 #define wxUSE_EXTENDED_RTTI 0
@ -652,9 +656,6 @@ typedef pid_t GPid;
 #define wxUSE_SELECT_DISPATCHER 1
 #define wxUSE_EPOLL_DISPATCHER 0

-#define wxUSE_UNICODE_UTF8 0
-#define wxUSE_UTF8_LOCALE_ONLY 0
-
 /*
   Use GStreamer for Unix.

--- a/src/common/sstream.cpp
+++ b/src/common/sstream.cpp
@ -127,10 +127,23 @@ wxStringOutputStream::wxStringOutputStream(wxString *pString, wxMBConv& conv)
    // length anyhow in this case.
 #if wxUSE_UNICODE_UTF8
    if ( conv.IsUTF8() )
+    {
        m_pos = m_str->utf8_length();
+    }
    else
-#endif // wxUSE_UNICODE_UTF8
-        m_pos = m_conv.FromWChar(nullptr, 0, m_str->wc_str(), m_str->length());
+    {
+        // Note that we can't just use wxString::length() because it may return
+        // a different value from the buffer length when wchar_t uses UTF-16
+        // (i.e. MSW) and the string contains any surrogates.
+        const wxScopedWCharBuffer wbuf(m_str->wc_str());
+        m_pos = m_conv.FromWChar(nullptr, 0, wbuf.data(), wbuf.length());
+    }
+#else // !wxUSE_UNICODE_UTF8
+    // When using wchar_t for internal representation, the string length and
+    // the length of the buffer returned by wc_str() are one and the same, so
+    // we can avoid creating a temporary buffer, unlike in UTF-8 case above.
+    m_pos = m_conv.FromWChar(nullptr, 0, m_str->wc_str(), m_str->length());
+#endif // wxUSE_UNICODE_UTF8/!wxUSE_UNICODE_UTF8
 }

 // ----------------------------------------------------------------------------
--- a/src/common/string.cpp
+++ b/src/common/string.cpp
@ -75,7 +75,12 @@ const wxStringCharType WXDLLIMPEXP_BASE *wxEmptyStringImpl = "";
 const wxChar WXDLLIMPEXP_BASE *wxEmptyString = wxT("");
 #if wxUSE_STRING_POS_CACHE

-wxTHREAD_SPECIFIC_DECL wxString::Cache wxString::ms_cache;
+/* static */ // Returns the (thread-specific) string position cache.
+wxString::Cache& wxString::GetCache()
+{
+    static wxTHREAD_SPECIFIC_DECL Cache s_cache;
+    return s_cache;
+}

 // gdb seems to be unable to display thread-local variables correctly, at least
 // not my 6.4.98 version under amd64, so provide this debugging helper to do it
@ -229,7 +234,7 @@ void wxString::PosLenToImpl(size_t pos, size_t len,
            // going beyond the end of the string, just as std::string does
            const const_iterator e(end());
            const_iterator i(b);
-            while ( len && i <= e )
+            while ( len && i < e )
            {
                ++i;
                --len;
@ -509,14 +514,6 @@ const char *wxString::AsChar(const wxMBConv& conv) const
    return m_convertedToChar.m_str;
 }

-// shrink to minimal size (releasing extra memory)
-bool wxString::Shrink()
-{
-  wxString tmp(begin(), end());
-  swap(tmp);
-  return true;
-}
-
 // ---------------------------------------------------------------------------
 // data access
 // ---------------------------------------------------------------------------
@ -1756,7 +1753,7 @@ int wxString::DoPrintfUtf8(const char *format, ...)
    va_list argptr;
    va_start(argptr, format);

-    int iLen = PrintfV(format, argptr);
+    int iLen = PrintfV(wxString::FromUTF8(format), argptr);

    va_end(argptr);

@ -1847,6 +1844,13 @@ static int DoStringPrintfV(wxString& str,
        // options.
        if ( len < 0 )
        {
+            // When vswprintf() returns an error, it can leave invalid bytes in
+            // the buffer, e.g. using "%c" with an invalid character results in
+            // U+FFFFFFFF in the buffer, which would trigger an assert when we
+            // try to copy it back to wxString as UTF-8 in "tmp" buffer dtor,
+            // so ensure we don't try to do it.
+            buf[0] = L'\0';
+
            // assume it only returns error if there is not enough space, but
            // as we don't know how much we need, double the current size of
            // the buffer
@ -1895,16 +1899,12 @@ static int DoStringPrintfV(wxString& str,

 int wxString::PrintfV(const wxString& format, va_list argptr)
 {
-#if wxUSE_UNICODE_UTF8
-    typedef wxStringTypeBuffer<char> Utf8Buffer;
-#endif
-
 #if wxUSE_UTF8_LOCALE_ONLY
-    return DoStringPrintfV<Utf8Buffer>(*this, format, argptr);
+    return DoStringPrintfV<wxUTF8StringBuffer>(*this, format, argptr);
 #else
    #if wxUSE_UNICODE_UTF8
    if ( wxLocaleIsUtf8 )
-        return DoStringPrintfV<Utf8Buffer>(*this, format, argptr);
+        return DoStringPrintfV<wxUTF8StringBuffer>(*this, format, argptr);
    else
        // wxChar* version
        return DoStringPrintfV<wxStringBuffer>(*this, format, argptr);
--- a/src/common/wxcrt.cpp
+++ b/src/common/wxcrt.cpp
@ -1041,6 +1041,19 @@ char *strdup(const char *s)
 bool wxLocaleIsUtf8 = false; // the safer setting if not known
 #endif

+static bool wxIsCharsetUtf8(const char* charset)
+{
+    // Accept only the known spellings of the UTF-8 charset name.
+    static const char* const names[] = { "UTF-8", "utf-8", "UTF8", "utf8" };
+    for ( const char* name : names )
+    {
+        if ( strcmp(charset, name) == 0 )
+            return true;
+    }
+
+    return false;
+}
+
 static bool wxIsLocaleUtf8()
 {
    // NB: we intentionally don't use wxLocale::GetSystemEncodingName(),
@ -1051,31 +1064,28 @@ static bool wxIsLocaleUtf8()
    // GNU libc provides current character set this way (this conforms to
    // Unix98)
    const char *charset = nl_langinfo(CODESET);
-    if ( charset )
-    {
-        // "UTF-8" is used by modern glibc versions, but test other variants
-        // as well, just in case:
-        if ( strcmp(charset, "UTF-8") == 0 ||
-             strcmp(charset, "utf-8") == 0 ||
-             strcmp(charset, "UTF8") == 0 ||
-             strcmp(charset, "utf8") == 0 )
-        {
-            return true;
-        }
-    }
+    if ( charset && wxIsCharsetUtf8(charset) )
+        return true;
 #endif // HAVE_LANGINFO_H

-    // check if we're running under the "C" locale: it is 7bit subset
-    // of UTF-8, so it can be safely used with the UTF-8 build:
+    // check LC_CTYPE string: this also works with (sufficiently recent) MSVC
+    // and on any other system without nl_langinfo()
    const char *lc_ctype = setlocale(LC_CTYPE, nullptr);
-    if ( lc_ctype &&
-         (strcmp(lc_ctype, "C") == 0 || strcmp(lc_ctype, "POSIX") == 0) )
+    if ( lc_ctype )
    {
-        return true;
+        // check if we're running under the "C" locale: it is 7bit subset
+        // of UTF-8, so it can be safely used with the UTF-8 build:
+        if ( (strcmp(lc_ctype, "C") == 0 || strcmp(lc_ctype, "POSIX") == 0) )
+            return true;
+
+        // any other locale can also use UTF-8 encoding if it's explicitly
+        // specified
+        const char* charset = strrchr(lc_ctype, '.');
+        if ( charset && wxIsCharsetUtf8(charset + 1) )
+            return true;
    }

-    // we don't know what charset libc is using, so assume the worst
-    // to be safe:
+    // by default assume that we don't use UTF-8
    return false;
 }

--- a/src/msw/registry.cpp
+++ b/src/msw/registry.cpp
@ -108,10 +108,13 @@ GetMSWAccessFlags(wxRegKey::AccessMode mode, wxRegKey::WOW64ViewMode viewMode);
 static wxString GetFullName(const wxRegKey *pKey);
 static wxString GetFullName(const wxRegKey *pKey, const wxString& szValue);

-// returns "value" argument of wxRegKey methods converted into a value that can
-// be passed to win32 registry functions; specifically, converts empty string
-// to nullptr
-static inline const wxChar *RegValueStr(const wxString& szValue);
+// Returns a (wide char) pointer to the string contents or null for an empty
+// string. The argument is evaluated twice, so it must have no side effects.
+// This has to be a macro to ensure that the temporary buffer returned by
+// t_str() in UTF-8 build lives long enough.
+#define RegValueStr(szValue) \
+    ((szValue).empty() ? nullptr \
+                       : static_cast<const wchar_t*>((szValue).t_str()))

 // Return the user-readable name of the given REG_XXX type constant.
 static wxString GetTypeString(DWORD dwType)
@ -1589,9 +1592,4 @@ inline void RemoveTrailingSeparator(wxString& str)
    str.Truncate(str.Len() - 1);
 }

-inline const wxChar *RegValueStr(const wxString& szValue)
-{
-    return szValue.empty() ? nullptr : szValue.t_str();
-}
-
 #endif // wxUSE_REGKEY
--- a/tests/strings/strings.cpp
+++ b/tests/strings/strings.cpp
--- a/tests/strings/unicode.cpp
+++ b/tests/strings/unicode.cpp
@ -441,16 +441,16 @@ void UnicodeTestCase::Iteration()
    // verify the string was decoded correctly:
    {
        size_t idx = 0;
-        for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
+        for ( auto c : text )
        {
-            CPPUNIT_ASSERT( *i == textUTF16[idx] );
+            CPPUNIT_ASSERT( c == textUTF16[idx++] );
        }
    }

    // overwrite the string with something that is shorter in UTF-8:
    {
-        for ( wxString::iterator i = text.begin(); i != text.end(); ++i )
-            *i = 'x';
+        for ( auto c : text )
+            c = 'x';
    }

    // restore the original text now:
@ -459,9 +459,9 @@ void UnicodeTestCase::Iteration()
        wxString::const_iterator end2 = text.end();

        size_t idx = 0;
-        for ( wxString::iterator i = text.begin(); i != text.end(); ++i, ++idx )
+        for ( auto c : text )
        {
-            *i = textUTF16[idx];
+            c = textUTF16[idx++];

            CPPUNIT_ASSERT( end1 == text.end() );
            CPPUNIT_ASSERT( end2 == text.end() );
@ -474,9 +474,9 @@ void UnicodeTestCase::Iteration()
    // and verify it again:
    {
        size_t idx = 0;
-        for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
+        for ( auto c : text )
        {
-            CPPUNIT_ASSERT( *i == textUTF16[idx] );
+            CPPUNIT_ASSERT( c == textUTF16[idx++] );
        }
    }
 }
--- a/tests/test.cpp
+++ b/tests/test.cpp
@ -633,6 +633,13 @@ bool TestApp::OnInit()
    cout << " as " << wxGetUserId()
         << std::endl;

+    // Optionally allow executing the tests in the locale specified by the
+    // WX_TEST_LOCALE environment variable: this is especially useful to use
+    // UTF-8 for all tests by just setting WX_TEST_LOCALE=C.
+    wxString testLoc;
+    if ( wxGetEnv(wxASCII_STR("WX_TEST_LOCALE"), &testLoc) )
+        wxSetlocale(LC_ALL, testLoc);
+
 #if wxUSE_GUI
    // create a parent window to be used as parent for the GUI controls
    new wxTestableFrame();