diff options
author | Lasse Collin <lasse.collin@tukaani.org> | 2010-09-10 10:30:33 +0300 |
---|---|---|
committer | Lasse Collin <lasse.collin@tukaani.org> | 2010-09-10 10:30:33 +0300 |
commit | bb0b1004f83cdc4d309e1471c2ecaf9f95ce60c5 (patch) | |
tree | d21a6ca975dbd1ed4d6181281be92eb53c82423c /src/common | |
parent | Update the Czech translation. (diff) | |
download | xz-bb0b1004f83cdc4d309e1471c2ecaf9f95ce60c5.tar.xz |
xz: Multiple fixes.
The code assumed that printing numbers with thousand separators
and decimal points would always produce only US-ASCII characters.
This was used for buffer sizes (with snprintf(), no overflows)
and aligning columns of the progress indicator and --list. That
assumption was wrong (e.g. LC_ALL=fi_FI.UTF-8 with glibc), so
multibyte character support was added in this commit. The old
way is used if the operating system doesn't have enough multibyte
support (e.g. lacks wcwidth()).
The sizes of buffers were increased to accommodate multibyte
characters. I don't know how big they should be exactly, but
they aren't used for anything critical, so it's not too bad.
If they still aren't big enough, I hopefully get a bug report.
snprintf() takes care of avoiding buffer overflows.
Some static buffers were replaced with buffers allocated on
stack. double_to_str() was removed. uint64_to_str() and
uint64_to_nicestr() now share the static buffer and test
for thousand separator support.
Integrity check names "None" and "Unknown-N" (2 <= N <= 15)
were marked to be translated. I had forgotten these, plus they
wouldn't have worked correctly anyway before this commit,
because printing tables with multibyte strings didn't work.
Thanks to Marek Černocký for reporting the bug about
misaligned table columns in --list output.
Diffstat (limited to '')
-rw-r--r-- | src/common/tuklib_mbstr.h | 66 | ||||
-rw-r--r-- | src/common/tuklib_mbstr_fw.c | 31 | ||||
-rw-r--r-- | src/common/tuklib_mbstr_width.c | 64 |
3 files changed, 161 insertions, 0 deletions
diff --git a/src/common/tuklib_mbstr.h b/src/common/tuklib_mbstr.h new file mode 100644 index 00000000..9f358355 --- /dev/null +++ b/src/common/tuklib_mbstr.h @@ -0,0 +1,66 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file tuklib_mbstr.h +/// \brief Utility functions for handling multibyte strings +/// +/// If not enough multibyte string support is available in the C library, +/// these functions keep working with the assumption that all strings +/// are in a single-byte character set without combining characters, e.g. +/// US-ASCII or ISO-8859-*. +// +// Author: Lasse Collin +// +// This file has been put into the public domain. +// You can do whatever you want with this file. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef TUKLIB_MBSTR_H +#define TUKLIB_MBSTR_H + +#include "tuklib_common.h" +TUKLIB_DECLS_BEGIN + +#define tuklib_mbstr_width TUKLIB_SYMBOL(tuklib_mbstr_width) +extern size_t tuklib_mbstr_width(const char *str, size_t *bytes); +///< +/// \brief Get the number of columns needed for the multibyte string +/// +/// This is somewhat similar to wcswidth() but works on multibyte strings. +/// +/// \param str String whose width is to be calculated. If the +/// current locale uses a multibyte character set +/// that has shift states, the string must begin +/// and end in the initial shift state. +/// \param bytes If this is not NULL, *bytes is set to the +/// value returned by strlen(str) (even if an +/// error occurs when calculating the width). +/// +/// \return On success, the number of columns needed to display the +/// string e.g. in a terminal emulator is returned. On error, +/// (size_t)-1 is returned. Possible errors include invalid, +/// partial, or non-printable multibyte character in str, or +/// that str doesn't end in the initial shift state. 
+ +#define tuklib_mbstr_fw TUKLIB_SYMBOL(tuklib_mbstr_fw) +extern int tuklib_mbstr_fw(const char *str, int columns_min); +///< +/// \brief Get the field width for printf() e.g. to align table columns +/// +/// Printing simple tables to a terminal can be done using the field width +/// feature in the printf() format string, but it works only with single-byte +/// character sets. To do the same with multibyte strings, tuklib_mbstr_fw() +/// can be used to calculate appropriate field width. +/// +/// The behavior of this function is undefined, if +/// - str is NULL or not terminated with '\0'; +/// - columns_min <= 0; or +/// - the calculated field width exceeds INT_MAX. +/// +/// \return If tuklib_mbstr_width(str, NULL) fails, -1 is returned. +/// If str needs more columns than columns_min, zero is returned. +/// Otherwise a positive integer is returned, which can be +/// used as the field width, e.g. printf("%*s", fw, str). + +TUKLIB_DECLS_END +#endif diff --git a/src/common/tuklib_mbstr_fw.c b/src/common/tuklib_mbstr_fw.c new file mode 100644 index 00000000..978a3fe1 --- /dev/null +++ b/src/common/tuklib_mbstr_fw.c @@ -0,0 +1,31 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file tuklib_mbstr_fw.c +/// \brief Get the field width for printf() e.g. to align table columns +// +// Author: Lasse Collin +// +// This file has been put into the public domain. +// You can do whatever you want with this file. 
+// +/////////////////////////////////////////////////////////////////////////////// + +#include "tuklib_mbstr.h" + + +extern int +tuklib_mbstr_fw(const char *str, int columns_min) +{ + size_t len; + const size_t width = tuklib_mbstr_width(str, &len); + if (width == (size_t)-1) + return -1; + + if (width > (size_t)columns_min) + return 0; + + if (width < (size_t)columns_min) + len += (size_t)columns_min - width; + + return len; +} diff --git a/src/common/tuklib_mbstr_width.c b/src/common/tuklib_mbstr_width.c new file mode 100644 index 00000000..3c38990f --- /dev/null +++ b/src/common/tuklib_mbstr_width.c @@ -0,0 +1,64 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file tuklib_mbstr_width.c +/// \brief Calculate width of a multibyte string +// +// Author: Lasse Collin +// +// This file has been put into the public domain. +// You can do whatever you want with this file. +// +/////////////////////////////////////////////////////////////////////////////// + +#include "tuklib_mbstr.h" + +#if defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH) +# include <wchar.h> +#endif + + +extern size_t +tuklib_mbstr_width(const char *str, size_t *bytes) +{ + const size_t len = strlen(str); + if (bytes != NULL) + *bytes = len; + +#if !(defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH)) + // In single-byte mode, the width of the string is the same + // as its length. + return len; + +#else + mbstate_t state; + memset(&state, 0, sizeof(state)); + + size_t width = 0; + size_t i = 0; + + // Convert one multibyte character at a time to wchar_t + // and get its width using wcwidth(). + while (i < len) { + wchar_t wc; + const size_t ret = mbrtowc(&wc, str + i, len - i, &state); + if (ret < 1 || ret > len) + return (size_t)-1; + + i += ret; + + const int wc_width = wcwidth(wc); + if (wc_width < 0) + return (size_t)-1; + + width += wc_width; + } + + // Require that the string ends in the initial shift state. 
+ // This way the caller can combine the string with other + // strings without needing to worry about the shift states. + if (!mbsinit(&state)) + return (size_t)-1; + + return width; +#endif +} |