Merge branch 'ci-utf8'

Fix UTF-8 build under MSW and improve its coverage in the CI jobs.

See #23313.

Closes #23328.
This commit is contained in:
Vadim Zeitlin 2023-03-30 19:44:37 +02:00
commit dee816bb5b
28 changed files with 887 additions and 544 deletions

View file

@ -108,10 +108,11 @@ jobs:
skip_samples: true
use_asan: true
use_xvfb: true
- name: Ubuntu 22.04 wxGTK C++20
- name: Ubuntu 22.04 wxGTK UTF-8 C++20
runner: ubuntu-22.04
configure_flags: --with-cxx=20
skip_samples: true
configure_flags: --with-cxx=20 --enable-utf8 --enable-utf8only
use_asan: true
use_xvfb: true
- name: Ubuntu 18.04 wxX11
runner: ubuntu-latest
container: ubuntu:18.04

View file

@ -67,6 +67,7 @@ jobs:
- configuration: 'DLL Debug'
platform: 'x64'
vsversion: 2022
use_utf8: true
- configuration: 'Debug'
platform: 'Win32'
vsversion: 2019
@ -81,14 +82,18 @@ jobs:
with:
submodules: 'recursive'
- name: Configure to use STL
if: matrix.use_stl
- name: Configure build options
working-directory: include/wx/msw
run: |
$txt = Get-Content setup.h
Write-Output $txt |
%{$_ -replace "define wxUSE_STL 0", "define wxUSE_STL 1"} |
Set-Content setup.h
$use_stl = "${{ matrix.use_stl }}" ? 1 : 0
$use_utf8 = "${{ matrix.use_utf8 }}" ? 1 : 0
if ( $use_stl -or $use_utf8 ) {
$txt = Get-Content setup.h
Write-Output $txt |
%{$_ -replace "define wxUSE_STL 0", "define wxUSE_STL $use_stl"} |
%{$_ -replace "define wxUSE_UNICODE_UTF8 0", "define wxUSE_UNICODE_UTF8 $use_utf8"} |
Set-Content setup.h
}
- name: Add MSBuild to PATH
uses: microsoft/setup-msbuild@v1.1.3

View file

@ -34,6 +34,11 @@ environment:
ARCH: x64
wxUSE_STL: 1
APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2022
- TOOLSET: msbuild
CONFIGURATION: Debug
ARCH: x64
wxUSE_UNICODE_UTF8: 1
APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2022
- TOOLSET: nmake
VS: '14.0'
BUILD: debug
@ -67,11 +72,13 @@ before_build:
- ps: |
$env:PATH = $env:PATH -replace "C:\\Program Files\\Git\\usr\\bin",""
if (-not (Test-Path env:wxUSE_STL)) { $env:wxUSE_STL = '0' }
if (-not (Test-Path env:wxUSE_UNICODE_UTF8)) { $env:wxUSE_UNICODE_UTF8 = '0' }
if (-not (Test-Path env:wxUSE_WEBVIEW_EDGE)) { $env:wxUSE_WEBVIEW_EDGE = '0' }
if (($env:TOOLSET -ne "msys2") -and ($env:TOOLSET -ne "cygwin")) {
$txt = gc include\wx\msw\setup.h
Write-Output $txt |
%{$_ -replace "define wxUSE_STL 0", "define wxUSE_STL $env:wxUSE_STL"} |
%{$_ -replace "define wxUSE_UNICODE_UTF8 0", "define wxUSE_UNICODE_UTF8 $env:wxUSE_UNICODE_UTF8"} |
%{$_ -replace "define wxUSE_WEBVIEW_EDGE 0", "define wxUSE_WEBVIEW_EDGE $env:wxUSE_WEBVIEW_EDGE"} |
sc include\wx\msw\setup.h
}

View file

@ -86,10 +86,8 @@ wx_option(wxUSE_STL "use standard C++ classes for everything" OFF)
set(wxTHIRD_PARTY_LIBRARIES ${wxTHIRD_PARTY_LIBRARIES} wxUSE_STL "use C++ STL classes")
wx_dependent_option(wxUSE_STD_CONTAINERS "use standard C++ container classes" ON "wxUSE_STL" OFF)
if(NOT WIN32)
wx_option(wxUSE_UNICODE_UTF8 "use UTF-8 representation for strings (Unix only)" OFF)
wx_dependent_option(wxUSE_UTF8_LOCALE_ONLY "only support UTF-8 locales in UTF-8 build (Unix only)" ON "wxUSE_UNICODE_UTF8" OFF)
endif()
wx_option(wxUSE_UNICODE_UTF8 "use UTF-8 representation for strings" OFF)
wx_dependent_option(wxUSE_UTF8_LOCALE_ONLY "only support UTF-8 locales in UTF-8 build" ON "wxUSE_UNICODE_UTF8" OFF)
if(NOT WIN32)
wx_option(wxUSE_VISIBILITY "use of ELF symbols visibility")

View file

@ -119,6 +119,11 @@
#cmakedefine01 wxUSE_REPRODUCIBLE_BUILD
#cmakedefine01 wxUSE_UNICODE_UTF8
#cmakedefine01 wxUSE_UTF8_LOCALE_ONLY
#cmakedefine01 wxUSE_ON_FATAL_EXCEPTION
@ -591,9 +596,6 @@
#cmakedefine01 wxUSE_SELECT_DISPATCHER
#cmakedefine01 wxUSE_EPOLL_DISPATCHER
#cmakedefine01 wxUSE_UNICODE_UTF8
#cmakedefine01 wxUSE_UTF8_LOCALE_ONLY
/*
Use GStreamer for Unix.

View file

@ -8,9 +8,12 @@ goto %TOOLSET%
:msbuild
PATH=C:\projects\wxwidgets\lib\vc_x64_dll;%PATH%
.\vc_x64_mswudll\test.exe
if "%CONFIGURATION%"=="DLL Release" set suffix=dll
if "%CONFIGURATION%"=="DLL Debug" set suffix=ddll
if "%CONFIGURATION%"=="Debug" set suffix=d
.\vc_x64_mswu%suffix%\test.exe
if %errorlevel% NEQ 0 goto :error
.\vc_x64_mswudll\test_gui.exe
.\vc_x64_mswu%suffix%\test_gui.exe
goto :eof
:nmake

4
configure vendored
View file

@ -2086,8 +2086,8 @@ Optional Features:
--disable-std_iostreams disable use of standard C++ stream classes
--enable-std_string_conv_in_wxstring provide implicit conversion to std::string in wxString
--disable-unsafe_conv_in_wxstring disable unsafe implicit conversions in wxString
--enable-utf8 use UTF-8 representation for strings (Unix only)
--enable-utf8only only support UTF-8 locales in UTF-8 build (Unix only)
--enable-utf8 use UTF-8 representation for strings
--enable-utf8only only support UTF-8 locales in UTF-8 build
--enable-extended_rtti use extended RTTI (XTI)
--disable-optimise compile without optimisations
--enable-profile create code with profiling information

View file

@ -649,8 +649,8 @@ WX_ARG_ENABLE(std_containers,[ --enable-std_containers use standard C++ contain
WX_ARG_DISABLE(std_iostreams,[ --disable-std_iostreams disable use of standard C++ stream classes], wxUSE_STD_IOSTREAM)
WX_ARG_ENABLE(std_string_conv_in_wxstring, [ --enable-std_string_conv_in_wxstring provide implicit conversion to std::string in wxString], wxUSE_STD_STRING_CONV_IN_WXSTRING)
WX_ARG_DISABLE(unsafe_conv_in_wxstring, [ --disable-unsafe_conv_in_wxstring disable unsafe implicit conversions in wxString], wxUSE_UNSAFE_WXSTRING_CONV)
WX_ARG_ENABLE_PARAM(utf8, [ --enable-utf8 use UTF-8 representation for strings (Unix only)], wxUSE_UNICODE_UTF8)
WX_ARG_ENABLE(utf8only, [ --enable-utf8only only support UTF-8 locales in UTF-8 build (Unix only)], wxUSE_UNICODE_UTF8_LOCALE)
WX_ARG_ENABLE_PARAM(utf8, [ --enable-utf8 use UTF-8 representation for strings], wxUSE_UNICODE_UTF8)
WX_ARG_ENABLE(utf8only, [ --enable-utf8only only support UTF-8 locales in UTF-8 build], wxUSE_UNICODE_UTF8_LOCALE)
WX_ARG_ENABLE(extended_rtti, [ --enable-extended_rtti use extended RTTI (XTI)], wxUSE_EXTENDED_RTTI)
WX_ARG_DISABLE(optimise, [ --disable-optimise compile without optimisations], wxUSE_OPTIMISE)

View file

@ -296,6 +296,18 @@ for (i = s.begin(); i != s.end(); ++i)
}
@endcode
or, even simpler, using a range-based for loop:
@code
wxString s = "hello";
for ( auto c : s )
{
// do something with "c"
}
@endcode
@note wxString iterators have unusual proxy-like semantics and can be used to
modify the string even when @e not using references, i.e. with just @c
auto, as in the example above.
@section overview_string_related String Related Functions and Classes

View file

@ -77,6 +77,54 @@
// Recommended setting: 0
#define wxUSE_REPRODUCIBLE_BUILD 0
// ----------------------------------------------------------------------------
// wxString encoding settings
// ----------------------------------------------------------------------------
// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
// UTF-16 (MSW).
//
// This option can be set to 1 if you want to avoid the overhead of converting
// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
// much more efficient and constant time, as they don't perform any conversion
// any longer, which is especially interesting in wxGTK where these functions
// are used every time a GTK function is called. But this is compensated by
// making all the non-UTF-8 functions less efficient, notably requiring a
// conversion when passing any string to Win32 API.
//
// Moreover, accessing strings by character index becomes, in general, an O(N)
// iteration, where N is the index, so only enable this option if you don't use
// index access for arbitrary characters (unless it is done inside a loop
// consecutively for all characters as this special access pattern is optimized
// by caching the last accessed index -- but using iterators, or a range for
// loop, is still better even in this case), as otherwise you may observe a
// significant slowdown in your program's performance.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 for optimization purposes and if
// you're sure that you're not using loops using indices to iterate over
// strings in your code.
#define wxUSE_UNICODE_UTF8 0
// If set to 1, assume that all narrow strings use UTF-8.
//
// By default, wxWidgets assumes that all "char*" strings use the encoding of
// the current locale, which is commonly, but not always, UTF-8 under Unix but
// rarely UTF-8 under MSW. This option tells the library that all strings
// always use UTF-8, avoiding the need to perform any conversions between them
// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
//
// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
// it must not be enabled without the other option.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 if your program is always run in
// an UTF-8 locale.
#define wxUSE_UTF8_LOCALE_ONLY 0
// ----------------------------------------------------------------------------
// debugging settings
// ----------------------------------------------------------------------------

View file

@ -89,16 +89,16 @@ namespace Catch
static std::string convert(const wxString& wxs)
{
std::string s;
s.reserve(wxs.length());
for ( wxString::const_iterator i = wxs.begin();
i != wxs.end();
++i )
s.reserve(wxs.length() + 2);
s += '"';
for ( auto c : wxs )
{
if ( !iswprint(*i) )
s += wxString::Format(wxASCII_STR("\\u%04X"), *i).ToAscii();
if ( c >= 128 || !iswprint(c) )
s += wxString::Format(wxASCII_STR("\\u%04X"), c).ToAscii();
else
s += *i;
s += c;
}
s += '"';
return s;
}

View file

@ -78,6 +78,54 @@
// Recommended setting: 0
#define wxUSE_REPRODUCIBLE_BUILD 0
// ----------------------------------------------------------------------------
// wxString encoding settings
// ----------------------------------------------------------------------------
// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
// UTF-16 (MSW).
//
// This option can be set to 1 if you want to avoid the overhead of converting
// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
// much more efficient and constant time, as they don't perform any conversion
// any longer, which is especially interesting in wxGTK where these functions
// are used every time a GTK function is called. But this is compensated by
// making all the non-UTF-8 functions less efficient, notably requiring a
// conversion when passing any string to Win32 API.
//
// Moreover, accessing strings by character index becomes, in general, an O(N)
// iteration, where N is the index, so only enable this option if you don't use
// index access for arbitrary characters (unless it is done inside a loop
// consecutively for all characters as this special access pattern is optimized
// by caching the last accessed index -- but using iterators, or a range for
// loop, is still better even in this case), as otherwise you may observe a
// significant slowdown in your program's performance.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 for optimization purposes and if
// you're sure that you're not using loops using indices to iterate over
// strings in your code.
#define wxUSE_UNICODE_UTF8 0
// If set to 1, assume that all narrow strings use UTF-8.
//
// By default, wxWidgets assumes that all "char*" strings use the encoding of
// the current locale, which is commonly, but not always, UTF-8 under Unix but
// rarely UTF-8 under MSW. This option tells the library that all strings
// always use UTF-8, avoiding the need to perform any conversions between them
// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
//
// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
// it must not be enabled without the other option.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 if your program is always run in
// an UTF-8 locale.
#define wxUSE_UTF8_LOCALE_ONLY 0
// ----------------------------------------------------------------------------
// debugging settings
// ----------------------------------------------------------------------------

View file

@ -78,6 +78,54 @@
// Recommended setting: 0
#define wxUSE_REPRODUCIBLE_BUILD 0
// ----------------------------------------------------------------------------
// wxString encoding settings
// ----------------------------------------------------------------------------
// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
// UTF-16 (MSW).
//
// This option can be set to 1 if you want to avoid the overhead of converting
// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
// much more efficient and constant time, as they don't perform any conversion
// any longer, which is especially interesting in wxGTK where these functions
// are used every time a GTK function is called. But this is compensated by
// making all the non-UTF-8 functions less efficient, notably requiring a
// conversion when passing any string to Win32 API.
//
// Moreover, accessing strings by character index becomes, in general, an O(N)
// iteration, where N is the index, so only enable this option if you don't use
// index access for arbitrary characters (unless it is done inside a loop
// consecutively for all characters as this special access pattern is optimized
// by caching the last accessed index -- but using iterators, or a range for
// loop, is still better even in this case), as otherwise you may observe a
// significant slowdown in your program's performance.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 for optimization purposes and if
// you're sure that you're not using loops using indices to iterate over
// strings in your code.
#define wxUSE_UNICODE_UTF8 0
// If set to 1, assume that all narrow strings use UTF-8.
//
// By default, wxWidgets assumes that all "char*" strings use the encoding of
// the current locale, which is commonly, but not always, UTF-8 under Unix but
// rarely UTF-8 under MSW. This option tells the library that all strings
// always use UTF-8, avoiding the need to perform any conversions between them
// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
//
// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
// it must not be enabled without the other option.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 if your program is always run in
// an UTF-8 locale.
#define wxUSE_UTF8_LOCALE_ONLY 0
// ----------------------------------------------------------------------------
// debugging settings
// ----------------------------------------------------------------------------

View file

@ -84,6 +84,54 @@
// Recommended setting: 0
#define wxUSE_REPRODUCIBLE_BUILD 0
// ----------------------------------------------------------------------------
// wxString encoding settings
// ----------------------------------------------------------------------------
// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
// UTF-16 (MSW).
//
// This option can be set to 1 if you want to avoid the overhead of converting
// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
// much more efficient and constant time, as they don't perform any conversion
// any longer, which is especially interesting in wxGTK where these functions
// are used every time a GTK function is called. But this is compensated by
// making all the non-UTF-8 functions less efficient, notably requiring a
// conversion when passing any string to Win32 API.
//
// Moreover, accessing strings by character index becomes, in general, an O(N)
// iteration, where N is the index, so only enable this option if you don't use
// index access for arbitrary characters (unless it is done inside a loop
// consecutively for all characters as this special access pattern is optimized
// by caching the last accessed index -- but using iterators, or a range for
// loop, is still better even in this case), as otherwise you may observe a
// significant slowdown in your program's performance.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 for optimization purposes and if
// you're sure that you're not using loops using indices to iterate over
// strings in your code.
#define wxUSE_UNICODE_UTF8 0
// If set to 1, assume that all narrow strings use UTF-8.
//
// By default, wxWidgets assumes that all "char*" strings use the encoding of
// the current locale, which is commonly, but not always, UTF-8 under Unix but
// rarely UTF-8 under MSW. This option tells the library that all strings
// always use UTF-8, avoiding the need to perform any conversions between them
// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
//
// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
// it must not be enabled without the other option.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 if your program is always run in
// an UTF-8 locale.
#define wxUSE_UTF8_LOCALE_ONLY 0
// ----------------------------------------------------------------------------
// debugging settings
// ----------------------------------------------------------------------------

View file

@ -74,6 +74,54 @@
// Recommended setting: 0
#define wxUSE_REPRODUCIBLE_BUILD 0
// ----------------------------------------------------------------------------
// wxString encoding settings
// ----------------------------------------------------------------------------
// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
// UTF-16 (MSW).
//
// This option can be set to 1 if you want to avoid the overhead of converting
// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
// much more efficient and constant time, as they don't perform any conversion
// any longer, which is especially interesting in wxGTK where these functions
// are used every time a GTK function is called. But this is compensated by
// making all the non-UTF-8 functions less efficient, notably requiring a
// conversion when passing any string to Win32 API.
//
// Moreover, accessing strings by character index becomes, in general, an O(N)
// iteration, where N is the index, so only enable this option if you don't use
// index access for arbitrary characters (unless it is done inside a loop
// consecutively for all characters as this special access pattern is optimized
// by caching the last accessed index -- but using iterators, or a range for
// loop, is still better even in this case), as otherwise you may observe a
// significant slowdown in your program's performance.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 for optimization purposes and if
// you're sure that you're not using loops using indices to iterate over
// strings in your code.
#define wxUSE_UNICODE_UTF8 0
// If set to 1, assume that all narrow strings use UTF-8.
//
// By default, wxWidgets assumes that all "char*" strings use the encoding of
// the current locale, which is commonly, but not always, UTF-8 under Unix but
// rarely UTF-8 under MSW. This option tells the library that all strings
// always use UTF-8, avoiding the need to perform any conversions between them
// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
//
// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
// it must not be enabled without the other option.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 if your program is always run in
// an UTF-8 locale.
#define wxUSE_UTF8_LOCALE_ONLY 0
// ----------------------------------------------------------------------------
// debugging settings
// ----------------------------------------------------------------------------

View file

@ -274,6 +274,7 @@ public:
~wxStringIteratorNode()
{ clear(); }
inline void clear();
inline void set(const wxString *str, wxStringImpl::const_iterator *citer)
{ clear(); DoSet(str, citer, nullptr); }
inline void set(const wxString *str, wxStringImpl::iterator *iter)
@ -285,7 +286,6 @@ public:
wxStringIteratorNode *m_prev{nullptr}, *m_next{nullptr};
private:
inline void clear();
inline void DoSet(const wxString *str,
wxStringImpl::const_iterator *citer,
wxStringImpl::iterator *iter);
@ -447,8 +447,8 @@ private:
unsigned lastUsed;
};
static wxTHREAD_SPECIFIC_DECL Cache ms_cache;
static Cache& GetCache() { return ms_cache; }
// Implemented out of line because per-thread variable can't be DLL exported.
static Cache& GetCache();
static Cache::Element *GetCacheBegin() { return GetCache().cached; }
static Cache::Element *GetCacheEnd() { return GetCacheBegin() + Cache::SIZE; }
@ -987,7 +987,17 @@ public:
// This is logically equivalent to strlen(str.mb_str()) but avoids
// actually converting the string to multibyte and just computes the
// length that it would have after conversion.
// Note that in UTF-8 build we need to use the actual wide character
// buffer length and not the string length, as it may be different when
// using surrogates, but in wchar_t build they're the same by definition
// and we can avoid creating an extra buffer.
#if wxUSE_UNICODE_UTF8
const wxScopedWCharBuffer wbuf(str.wc_str());
const size_t ofs = wxConvLibc.FromWChar(nullptr, 0, wbuf.data(), wbuf.length());
#else // wxUSE_UNICODE_WCHAR
const size_t ofs = wxConvLibc.FromWChar(nullptr, 0, str.wc_str(), str.length());
#endif
return ofs == wxCONV_FAILED ? 0 : static_cast<ptrdiff_t>(ofs);
}
@ -1203,16 +1213,24 @@ public:
{ assign(std::move(str), nLength); }
#if wxUSE_STRING_POS_CACHE
#if wxUSE_UNICODE_UTF8
~wxString()
{
#if wxUSE_STRING_POS_CACHE
// we need to invalidate our cache entry as another string could be
// recreated at the same address (unlikely, but still possible, with the
// heap-allocated strings but perfectly common with stack-allocated ones)
InvalidateCache();
}
#endif // wxUSE_STRING_POS_CACHE
// We also need to clear any still existing iterators pointing into this
// string, as otherwise clearing them later, when they're destroyed,
// would try to use a dangling string pointer stored in them.
while ( m_iterators.ptr )
m_iterators.ptr->clear();
}
#endif // wxUSE_UNICODE_UTF8
#if wxUSE_UNICODE_WCHAR
wxString(const std::wstring& str) : m_impl(str) {}
wxString(std::wstring&& str) noexcept : m_impl(std::move(str)) {}
@ -1350,7 +1368,7 @@ public:
size_type capacity() const { return m_impl.capacity(); }
void reserve(size_t sz) { m_impl.reserve(sz); }
void shrink_to_fit() { Shrink(); }
void shrink_to_fit() { m_impl.shrink_to_fit(); }
void resize(size_t nSize, wxUniChar ch = wxT('\0'))
{
@ -1685,7 +1703,7 @@ public:
// conversions with (possible) format conversions: have to return a
// buffer with temporary data
//
// the functions defined (in either Unicode or ANSI) mode are mb_str() to
// All builds of the library define the same functions: mb_str() to
// return an ANSI (multibyte) string, wc_str() to return a wide string and
// fn_str() to return a string which should be used with the OS APIs
// accepting the file names. The return value is always the same, but the
@ -2223,8 +2241,7 @@ public:
// only works if the data of this string is not shared
bool Alloc(size_t nLen) { reserve(nLen); return capacity() >= nLen; }
// minimize the string's memory
// only works if the data of this string is not shared
bool Shrink();
bool Shrink() { shrink_to_fit(); return true; }
// wxWidgets version 1 compatibility functions
@ -3540,6 +3557,8 @@ private:
friend class WXDLLIMPEXP_FWD_BASE wxStringIteratorNode;
friend class WXDLLIMPEXP_FWD_BASE wxUniCharRef;
friend class wxUTF8StringBuffer;
friend class wxUTF8StringBufferLength;
#endif // wxUSE_UNICODE_UTF8
friend class WXDLLIMPEXP_FWD_BASE wxCStrData;
@ -3645,7 +3664,7 @@ struct wxStringAsBufHelper<wchar_t>
{
wxScopedWCharBuffer wbuf(s.wc_str());
if ( len )
*len = wxWcslen(wbuf);
*len = wbuf.length();
return wbuf;
}
};
@ -3785,8 +3804,89 @@ typedef wxStringInternalBufferLength wxStringBufferLength;
#endif // wxUSE_UNICODE_UTF8/wxUSE_UNICODE_WCHAR
#if wxUSE_UNICODE_UTF8
typedef wxStringInternalBuffer wxUTF8StringBuffer;
typedef wxStringInternalBufferLength wxUTF8StringBufferLength;
// Special implementation of buffer classes for UTF-8 build which exploit the
// fact that we can write directly to std::string used by wxString, avoiding an
// extra copy which could be significant for long strings.
namespace wxPrivate
{
// Common base of the string buffer classes used in UTF-8 build.
//
// It resizes the std::string passed to its constructor to the requested size,
// exposes its contents as a writable char buffer and, on destruction, clears
// the string if what was written into it is not valid UTF-8.
class wxUTF8StringBufferBase
{
public:
    using CharType = char;

    ~wxUTF8StringBufferBase()
    {
        // This class works only with UTF-8 strings, so we need to check if the
        // string has valid contents. Note that it isn't an error if it
        // doesn't, as it can happen that the function we use this buffer with
        // (e.g. vsnprintf()) writes something invalid into the provided buffer
        // in some cases.
        if ( !wxStringOperations::IsValidUtf8String(m_str.c_str()) )
            m_str.clear();
    }

    // Give write access to the string contents.
    //
    // Use the non-const operator[] instead of casting away the constness of
    // c_str(): modifying the array returned by c_str() is undefined behaviour,
    // while writing to the characters of the string itself is allowed. This
    // works in a const member function because m_str is a reference and the
    // string it refers to is not const.
    operator char*() const { return &m_str[0]; }

protected:
    // Bind to the given string and resize it to "size" characters so that
    // there is room to write into it.
    explicit wxUTF8StringBufferBase(std::string& str, size_t size)
        : m_str{str}
    {
        m_str.resize(size);
    }

    // The string being written to: it is only referenced, not owned, by this
    // object.
    std::string& m_str;

    wxDECLARE_NO_COPY_CLASS(wxUTF8StringBufferBase);
};
} // wxPrivate
// Buffer writing directly into the internal representation of a wxString and
// relying on the written data being NUL-terminated to determine its length.
class wxUTF8StringBuffer : public wxPrivate::wxUTF8StringBufferBase
{
public:
    // Associate the buffer with the implementation string of "str", passing
    // the requested size on to the base class.
    wxUTF8StringBuffer(wxString& str, size_t size)
        : wxPrivate::wxUTF8StringBufferBase(str.m_impl, size)
    {
    }

    ~wxUTF8StringBuffer()
    {
        // Only NUL-terminated data is supported here, so cut the string down
        // to the part preceding the first NUL to give it its real length.
        const size_t actualLen = strlen(m_str.c_str());
        m_str.resize(actualLen);
    }

private:
    wxDECLARE_NO_COPY_CLASS(wxUTF8StringBuffer);
};
// Buffer writing directly into the internal representation of a wxString
// whose final length must be given explicitly by calling SetLength().
class wxUTF8StringBufferLength : public wxPrivate::wxUTF8StringBufferBase
{
public:
    // Associate the buffer with the implementation string of "str", passing
    // the requested size on to the base class.
    wxUTF8StringBufferLength(wxString& str, size_t size)
        : wxPrivate::wxUTF8StringBufferBase(str.m_impl, size)
    {
    }

    ~wxUTF8StringBufferLength()
    {
        // The length is not deduced from the contents, so it is an error not
        // to provide it before the buffer is destroyed.
        wxASSERT_MSG( m_lenSet, "forgot to call SetLength()" );

        m_str.resize(m_len);
    }

    // Record the length the string will be resized to when this object is
    // destroyed.
    void SetLength(size_t length)
    {
        m_lenSet = true;
        m_len = length;
    }

protected:
    // Length passed to SetLength(), 0 until it is called.
    size_t m_len = 0;

    // Becomes true once SetLength() has been called.
    bool m_lenSet = false;

    wxDECLARE_NO_COPY_CLASS(wxUTF8StringBufferLength);
};
#else // wxUSE_UNICODE_WCHAR
// Note about inlined dtors in the classes below: this is done not for

View file

@ -824,11 +824,19 @@ struct wxArgNormalizerUtf8<const std::string&>
#ifdef __cpp_lib_string_view
template<>
struct wxArgNormalizerUtf8<const std::string_view&>
: public wxArgNormalizerUtf8<const char*>
{
wxArgNormalizerUtf8(const std::string_view& v,
const wxFormatString *fmt, unsigned index)
: wxArgNormalizerUtf8<const char*>(v.data(), fmt, index) {}
: m_str{v}
{
wxASSERT_ARG_TYPE( fmt, index, wxFormatString::Arg_String );
}
const char* get() const { return m_str.c_str(); }
// We need to store this string to ensure that we use a NUL-terminated
// buffer, i.e. we can't use string_view data directly.
const std::string m_str;
};
#endif // __cpp_lib_string_view

View file

@ -77,6 +77,54 @@
// Recommended setting: 0
#define wxUSE_REPRODUCIBLE_BUILD 0
// ----------------------------------------------------------------------------
// wxString encoding settings
// ----------------------------------------------------------------------------
// If set to 1, wxString uses UTF-8 internally instead of UTF-32 (Unix) or
// UTF-16 (MSW).
//
// This option can be set to 1 if you want to avoid the overhead of converting
// between wchar_t encoding (UTF-32 or UTF-16) used by wxString by default and
// UTF-8, i.e. it makes functions such as wxString::FromUTF8() and utf8_str()
// much more efficient and constant time, as they don't perform any conversion
// any longer, which is especially interesting in wxGTK where these functions
// are used every time a GTK function is called. But this is compensated by
// making all the non-UTF-8 functions less efficient, notably requiring a
// conversion when passing any string to Win32 API.
//
// Moreover, accessing strings by character index becomes, in general, an O(N)
// iteration, where N is the index, so only enable this option if you don't use
// index access for arbitrary characters (unless it is done inside a loop
// consecutively for all characters as this special access pattern is optimized
// by caching the last accessed index -- but using iterators, or a range for
// loop, is still better even in this case), as otherwise you may observe a
// significant slowdown in your program's performance.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 for optimization purposes and if
// you're sure that you're not using loops using indices to iterate over
// strings in your code.
#define wxUSE_UNICODE_UTF8 0
// If set to 1, assume that all narrow strings use UTF-8.
//
// By default, wxWidgets assumes that all "char*" strings use the encoding of
// the current locale, which is commonly, but not always, UTF-8 under Unix but
// rarely UTF-8 under MSW. This option tells the library that all strings
// always use UTF-8, avoiding the need to perform any conversions between them
// and wxString internal representation when wxUSE_UNICODE_UTF8 is set to 1.
//
// In fact, using this option only makes sense when wxUSE_UNICODE_UTF8==1 and
// it must not be enabled without the other option.
//
// Default is 0
//
// Recommended setting: 0 but can be set to 1 if your program is always run in
// an UTF-8 locale.
#define wxUSE_UTF8_LOCALE_ONLY 0
// ----------------------------------------------------------------------------
// debugging settings
// ----------------------------------------------------------------------------

View file

@ -1422,7 +1422,7 @@ public:
wxStringBuffer and wxStringBufferLength classes may be very useful when working
with some external API which requires the caller to provide a writable buffer.
See also the reserve() and resize() STL-like functions.
See also the reserve(), resize() and shrink_to_fit() STL-like functions.
*/
///@{
@ -1468,6 +1468,9 @@ public:
/**
Minimizes the string's memory.
Please note that this method does the same thing as the standard
shrink_to_fit() one and shouldn't be used in new code.
This can be useful after a call to Alloc() if too much memory was
preallocated.

View file

@ -122,6 +122,11 @@
#define wxUSE_REPRODUCIBLE_BUILD 0
#define wxUSE_UNICODE_UTF8 0
#define wxUSE_UTF8_LOCALE_ONLY 0
#define wxUSE_ON_FATAL_EXCEPTION 0
@ -594,9 +599,6 @@
#define wxUSE_SELECT_DISPATCHER 0
#define wxUSE_EPOLL_DISPATCHER 0
#define wxUSE_UNICODE_UTF8 0
#define wxUSE_UTF8_LOCALE_ONLY 0
/*
Use GStreamer for Unix.

View file

@ -169,6 +169,10 @@ typedef pid_t GPid;
#define wxUSE_REPRODUCIBLE_BUILD 1
#define wxUSE_UNICODE_UTF8 0
#define wxUSE_UTF8_LOCALE_ONLY 0
#define wxUSE_EXCEPTIONS 1
#define wxUSE_EXTENDED_RTTI 0
@ -652,9 +656,6 @@ typedef pid_t GPid;
#define wxUSE_SELECT_DISPATCHER 1
#define wxUSE_EPOLL_DISPATCHER 0
#define wxUSE_UNICODE_UTF8 0
#define wxUSE_UTF8_LOCALE_ONLY 0
/*
Use GStreamer for Unix.

View file

@ -127,10 +127,23 @@ wxStringOutputStream::wxStringOutputStream(wxString *pString, wxMBConv& conv)
// length anyhow in this case.
#if wxUSE_UNICODE_UTF8
if ( conv.IsUTF8() )
{
m_pos = m_str->utf8_length();
}
else
#endif // wxUSE_UNICODE_UTF8
m_pos = m_conv.FromWChar(nullptr, 0, m_str->wc_str(), m_str->length());
{
// Note that we can't just use wxString::length() because it may return
// a different value from the buffer length when wchar_t uses UTF-16
// (i.e. MSW) and the string contains any surrogates.
const wxScopedWCharBuffer wbuf(m_str->wc_str());
m_pos = m_conv.FromWChar(nullptr, 0, wbuf.data(), wbuf.length());
}
#else // !wxUSE_UNICODE_UTF8
// When using wchar_t for internal representation, the string length and
// the length of the buffer returned by wc_str() are one and the same, so
// we can avoid creating a temporary buffer, unlike in UTF-8 case above.
m_pos = m_conv.FromWChar(nullptr, 0, m_str->wc_str(), m_str->length());
#endif // wxUSE_UNICODE_UTF8/!wxUSE_UNICODE_UTF8
}
// ----------------------------------------------------------------------------

View file

@ -75,7 +75,12 @@ const wxStringCharType WXDLLIMPEXP_BASE *wxEmptyStringImpl = "";
const wxChar WXDLLIMPEXP_BASE *wxEmptyString = wxT("");
#if wxUSE_STRING_POS_CACHE
wxTHREAD_SPECIFIC_DECL wxString::Cache wxString::ms_cache;
// Accessor for the string position cache object.
//
// The cache is a function-local static declared with wxTHREAD_SPECIFIC_DECL
// (so, judging by the macro name, presumably one instance per thread --
// confirm against the macro definition) and is handed out by reference.
/* static */
wxString::Cache& wxString::GetCache()
{
    static wxTHREAD_SPECIFIC_DECL Cache s_perThreadCache;

    return s_perThreadCache;
}
// gdb seems to be unable to display thread-local variables correctly, at least
// not my 6.4.98 version under amd64, so provide this debugging helper to do it
@ -229,7 +234,7 @@ void wxString::PosLenToImpl(size_t pos, size_t len,
// going beyond the end of the string, just as std::string does
const const_iterator e(end());
const_iterator i(b);
while ( len && i <= e )
while ( len && i < e )
{
++i;
--len;
@ -509,14 +514,6 @@ const char *wxString::AsChar(const wxMBConv& conv) const
return m_convertedToChar.m_str;
}
// shrink to minimal size (releasing extra memory)
bool wxString::Shrink()
{
wxString tmp(begin(), end());
swap(tmp);
return true;
}
// ---------------------------------------------------------------------------
// data access
// ---------------------------------------------------------------------------
@ -1756,7 +1753,7 @@ int wxString::DoPrintfUtf8(const char *format, ...)
va_list argptr;
va_start(argptr, format);
int iLen = PrintfV(format, argptr);
int iLen = PrintfV(wxString::FromUTF8(format), argptr);
va_end(argptr);
@ -1847,6 +1844,13 @@ static int DoStringPrintfV(wxString& str,
// options.
if ( len < 0 )
{
// When vswprintf() returns an error, it can leave invalid bytes in
// the buffer, e.g. using "%c" with an invalid character results in
// U+FFFFFFFF in the buffer, which would trigger an assert when we
// try to copy it back to wxString as UTF-8 in "tmp" buffer dtor,
// so ensure we don't try to do it.
buf[0] = L'\0';
// assume it only returns error if there is not enough space, but
// as we don't know how much we need, double the current size of
// the buffer
@ -1895,16 +1899,12 @@ static int DoStringPrintfV(wxString& str,
int wxString::PrintfV(const wxString& format, va_list argptr)
{
#if wxUSE_UNICODE_UTF8
typedef wxStringTypeBuffer<char> Utf8Buffer;
#endif
#if wxUSE_UTF8_LOCALE_ONLY
return DoStringPrintfV<Utf8Buffer>(*this, format, argptr);
return DoStringPrintfV<wxUTF8StringBuffer>(*this, format, argptr);
#else
#if wxUSE_UNICODE_UTF8
if ( wxLocaleIsUtf8 )
return DoStringPrintfV<Utf8Buffer>(*this, format, argptr);
return DoStringPrintfV<wxUTF8StringBuffer>(*this, format, argptr);
else
// wxChar* version
return DoStringPrintfV<wxStringBuffer>(*this, format, argptr);

View file

@ -1041,6 +1041,19 @@ char *strdup(const char *s)
bool wxLocaleIsUtf8 = false; // the safer setting if not known
#endif
// Check whether the given charset name denotes UTF-8.
//
// Only the exact spellings "UTF-8", "utf-8", "UTF8" and "utf8" are accepted:
// the comparison is case-sensitive, so any other spelling yields false.
//
// The charset pointer is passed directly to strcmp() and so must not be null.
static bool wxIsCharsetUtf8(const char* charset)
{
    static const char* const utf8Names[] = { "UTF-8", "utf-8", "UTF8", "utf8" };

    for ( const char* const name : utf8Names )
    {
        if ( strcmp(charset, name) == 0 )
            return true;
    }

    return false;
}
static bool wxIsLocaleUtf8()
{
// NB: we intentionally don't use wxLocale::GetSystemEncodingName(),
@ -1051,31 +1064,28 @@ static bool wxIsLocaleUtf8()
// GNU libc provides current character set this way (this conforms to
// Unix98)
const char *charset = nl_langinfo(CODESET);
if ( charset )
{
// "UTF-8" is used by modern glibc versions, but test other variants
// as well, just in case:
if ( strcmp(charset, "UTF-8") == 0 ||
strcmp(charset, "utf-8") == 0 ||
strcmp(charset, "UTF8") == 0 ||
strcmp(charset, "utf8") == 0 )
{
return true;
}
}
if ( charset && wxIsCharsetUtf8(charset) )
return true;
#endif // HAVE_LANGINFO_H
// check if we're running under the "C" locale: it is 7bit subset
// of UTF-8, so it can be safely used with the UTF-8 build:
// check LC_CTYPE string: this also works with (sufficiently recent) MSVC
// and on any other system without nl_langinfo()
const char *lc_ctype = setlocale(LC_CTYPE, nullptr);
if ( lc_ctype &&
(strcmp(lc_ctype, "C") == 0 || strcmp(lc_ctype, "POSIX") == 0) )
if ( lc_ctype )
{
return true;
// check if we're running under the "C" locale: it is 7bit subset
// of UTF-8, so it can be safely used with the UTF-8 build:
if ( (strcmp(lc_ctype, "C") == 0 || strcmp(lc_ctype, "POSIX") == 0) )
return true;
// any other locale can also use UTF-8 encoding if it's explicitly
// specified
const char* charset = strrchr(lc_ctype, '.');
if ( charset && wxIsCharsetUtf8(charset + 1) )
return true;
}
// we don't know what charset libc is using, so assume the worst
// to be safe:
// by default assume that we don't use UTF-8
return false;
}

View file

@ -108,10 +108,13 @@ GetMSWAccessFlags(wxRegKey::AccessMode mode, wxRegKey::WOW64ViewMode viewMode);
static wxString GetFullName(const wxRegKey *pKey);
static wxString GetFullName(const wxRegKey *pKey, const wxString& szValue);
// returns "value" argument of wxRegKey methods converted into a value that can
// be passed to win32 registry functions; specifically, converts empty string
// to nullptr
static inline const wxChar *RegValueStr(const wxString& szValue);
// Returns a (wide char) pointer to the string contents or null for an empty
// string.
//
// Unfortunately this needs to be a macro to ensure that the temporary buffer
// returned by t_str() in UTF-8 build lives long enough.
//
// Being a macro, it evaluates its argument twice, so the argument must not
// have side effects. Both uses of the argument are parenthesized so that any
// expression yielding a string (e.g. "*pStr" or "cond ? a : b") expands
// correctly.
#define RegValueStr(szValue) \
    ((szValue).empty() \
        ? nullptr \
        : static_cast<const wchar_t*>((szValue).t_str()))
// Return the user-readable name of the given REG_XXX type constant.
static wxString GetTypeString(DWORD dwType)
@ -1589,9 +1592,4 @@ inline void RemoveTrailingSeparator(wxString& str)
str.Truncate(str.Len() - 1);
}
// Function form of the helper converting a value name for the registry API:
// returns nullptr for an empty string and the result of szValue.t_str()
// otherwise.
//
// NOTE(review): a function-like macro with the same name is defined earlier
// in this file, and its comment says that t_str() returns a temporary buffer
// in UTF-8 build. If so, the pointer returned from here would presumably not
// outlive the return statement -- confirm which of the two definitions is
// meant to remain.
inline const wxChar *RegValueStr(const wxString& szValue)
{
return szValue.empty() ? nullptr : szValue.t_str();
}
#endif // wxUSE_REGKEY

File diff suppressed because it is too large Load diff

View file

@ -441,16 +441,16 @@ void UnicodeTestCase::Iteration()
// verify the string was decoded correctly:
{
size_t idx = 0;
for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
for ( auto c : text )
{
CPPUNIT_ASSERT( *i == textUTF16[idx] );
CPPUNIT_ASSERT( c == textUTF16[idx++] );
}
}
// overwrite the string with something that is shorter in UTF-8:
{
for ( wxString::iterator i = text.begin(); i != text.end(); ++i )
*i = 'x';
for ( auto c : text )
c = 'x';
}
// restore the original text now:
@ -459,9 +459,9 @@ void UnicodeTestCase::Iteration()
wxString::const_iterator end2 = text.end();
size_t idx = 0;
for ( wxString::iterator i = text.begin(); i != text.end(); ++i, ++idx )
for ( auto c : text )
{
*i = textUTF16[idx];
c = textUTF16[idx++];
CPPUNIT_ASSERT( end1 == text.end() );
CPPUNIT_ASSERT( end2 == text.end() );
@ -474,9 +474,9 @@ void UnicodeTestCase::Iteration()
// and verify it again:
{
size_t idx = 0;
for ( wxString::const_iterator i = text.begin(); i != text.end(); ++i, ++idx )
for ( auto c : text )
{
CPPUNIT_ASSERT( *i == textUTF16[idx] );
CPPUNIT_ASSERT( c == textUTF16[idx++] );
}
}
}

View file

@ -633,6 +633,13 @@ bool TestApp::OnInit()
cout << " as " << wxGetUserId()
<< std::endl;
// Optionally allow executing the tests in the locale specified by the
// standard environment variable, this is especially useful to use UTF-8
// for all tests by just setting WX_TEST_LOCALE=C.
wxString testLoc;
if ( wxGetEnv(wxASCII_STR("WX_TEST_LOCALE"), &testLoc) )
wxSetlocale(LC_ALL, testLoc);
#if wxUSE_GUI
// create a parent window to be used as parent for the GUI controls
new wxTestableFrame();