xz: Multiple fixes.

The code assumed that printing numbers with thousand separators and decimal points would always produce only US-ASCII characters. This was used for buffer sizes (with snprintf(), no overflows) and aligning columns of the progress indicator and --list. That assumption was wrong (e.g. LC_ALL=fi_FI.UTF-8 with glibc), so multibyte character support was added in this commit. The old way is used if the operating system doesn't have enough multibyte support (e.g. lacks wcwidth()). The sizes of buffers were increased to accommodate multibyte characters. I don't know how big they should be exactly, but they aren't used for anything critical, so it's not too bad. If they still aren't big enough, I hopefully get a bug report. snprintf() takes care of avoiding buffer overflows. Some static buffers were replaced with buffers allocated on the stack. double_to_str() was removed. uint64_to_str() and uint64_to_nicestr() now share the static buffer and test for thousand separator support. Integrity check names "None" and "Unknown-N" (2 <= N <= 15) were marked to be translated. I had forgotten these, plus they wouldn't have worked correctly anyway before this commit, because printing tables with multibyte strings didn't work. Thanks to Marek Černocký for reporting the bug about misaligned table columns in --list output.
author: Lasse Collin <lasse.collin@tukaani.org> 2010-09-10 10:30:33 +0300
committer: Lasse Collin <lasse.collin@tukaani.org> 2010-09-10 10:30:33 +0300
commit: bb0b1004f83cdc4d309e1471c2ecaf9f95ce60c5 (patch)
tree: d21a6ca975dbd1ed4d6181281be92eb53c82423c /src/common/tuklib_mbstr_width.c
parent: Update the Czech translation. (diff)
download: xz-bb0b1004f83cdc4d309e1471c2ecaf9f95ce60c5.tar.xz
1 files changed, 64 insertions, 0 deletions
diff --git a/src/common/tuklib_mbstr_width.c b/src/common/tuklib_mbstr_width.c
new file mode 100644
index 00000000..3c38990f
--- /dev/null
+++ b/src/common/tuklib_mbstr_width.c
@@ -0,0 +1,64 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       tuklib_mbstr_width.c
+/// \brief      Calculate width of a multibyte string
+//
+//  Author:     Lasse Collin
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "tuklib_mbstr.h"
+
+#if defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH)
+#	include <wchar.h>
+#endif
+
+
+extern size_t
+tuklib_mbstr_width(const char *str, size_t *bytes)
+{
+	const size_t len = strlen(str);
+	if (bytes != NULL)
+		*bytes = len;
+
+#if !(defined(HAVE_MBRTOWC) && defined(HAVE_WCWIDTH))
+	// Without both mbrtowc() and wcwidth() we work in single-byte mode,
+	// where the width of the string is the same as its length in bytes.
+	return len;
+
+#else
+	mbstate_t state;
+	memset(&state, 0, sizeof(state));
+
+	size_t width = 0;
+	size_t i = 0;
+
+	// Convert one multibyte character at a time to wchar_t and add up
+	// the wcwidth() results. Any failure makes us return (size_t)-1.
+	while (i < len) {
+		wchar_t wc;
+		const size_t ret = mbrtowc(&wc, str + i, len - i, &state);
+		if (ret < 1 || ret > len)
+			return (size_t)-1;
+
+		i += ret;
+
+		const int wc_width = wcwidth(wc);
+		if (wc_width < 0)
+			return (size_t)-1;
+
+		width += wc_width;
+	}
+
+	// Require that the string ends in the initial shift state.
+	// This way the caller can combine the string with other
+	// strings without needing to worry about the shift states.
+	if (!mbsinit(&state))
+		return (size_t)-1;
+
+	return width;
+#endif
+}
author	Lasse Collin <lasse.collin@tukaani.org>	2010-09-10 10:30:33 +0300
committer	Lasse Collin <lasse.collin@tukaani.org>	2010-09-10 10:30:33 +0300
commit	bb0b1004f83cdc4d309e1471c2ecaf9f95ce60c5 (patch)
tree	d21a6ca975dbd1ed4d6181281be92eb53c82423c /src/common/tuklib_mbstr_width.c
parent	Update the Czech translation. (diff)
download	xz-bb0b1004f83cdc4d309e1471c2ecaf9f95ce60c5.tar.xz