diff options
author | moneromooo-monero <moneromooo-monero@users.noreply.github.com> | 2019-04-30 19:16:11 +0000 |
---|---|---|
committer | moneromooo-monero <moneromooo-monero@users.noreply.github.com> | 2019-08-16 17:06:03 +0000 |
commit | eeca5ca0c8d7275069d45adf40546ea83edf46cc (patch) | |
tree | 71024c1a6dc4747e83f29bd4e1eeeb80094a19bd | |
parent | Merge pull request #5756 (diff) | |
download | monero-eeca5ca0c8d7275069d45adf40546ea83edf46cc.tar.xz |
epee: support unicode in parsed strings
-rw-r--r-- | contrib/epee/include/storages/parserse_base_utils.h | 59 | ||||
-rw-r--r-- | contrib/epee/include/string_tools.h | 24 | ||||
-rw-r--r-- | tests/unit_tests/epee_utils.cpp | 17 |
3 files changed, 78 insertions, 22 deletions
diff --git a/contrib/epee/include/storages/parserse_base_utils.h b/contrib/epee/include/storages/parserse_base_utils.h
index b5c4138c5..fe53628a5 100644
--- a/contrib/epee/include/storages/parserse_base_utils.h
+++ b/contrib/epee/include/storages/parserse_base_utils.h
@@ -31,6 +31,9 @@
 #include <algorithm>
 #include <boost/utility/string_ref.hpp>
 
+#undef MONERO_DEFAULT_LOG_CATEGORY
+#define MONERO_DEFAULT_LOG_CATEGORY "serialization"
+
 namespace epee
 {
 namespace misc_utils
@@ -62,6 +65,26 @@ namespace misc_utils
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     };
 
+    static const constexpr unsigned char isx[256] =
+    {
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    };
+
     inline bool isspace(char c)
     {
       return lut[(uint8_t)c] & 8;
@@ -162,6 +185,42 @@ namespace misc_utils
             val.push_back('\\');break;
           case '/': //Slash character
             val.push_back('/');break;
+          case 'u': //Unicode code point: \uXXXX, exactly four hex digits, emitted as UTF-8
+            if (buf_end - it < 5) // 'it' is on the 'u'; the four digits at it+1..it+4 must all lie before buf_end
+            {
+              ASSERT_MES_AND_THROW("Invalid Unicode escape sequence");
+            }
+            else
+            {
+              uint32_t dst = 0;
+              for (int i = 0; i < 4; ++i)
+              {
+                const unsigned char tmp = isx[(unsigned char)*++it]; // index as unsigned: a signed char >= 0x80 would be a negative index
+                CHECK_AND_ASSERT_THROW_MES(tmp != 0xff, "Bad Unicode encoding");
+                dst = dst << 4 | tmp;
+              }
+              // encode as UTF-8
+              if (dst <= 0x7f)
+              {
+                val.push_back(dst);
+              }
+              else if (dst <= 0x7ff)
+              {
+                val.push_back(0xc0 | (dst >> 6));
+                val.push_back(0x80 | (dst & 0x3f));
+              }
+              else if (dst <= 0xffff)
+              {
+                val.push_back(0xe0 | (dst >> 12));
+                val.push_back(0x80 | ((dst >> 6) & 0x3f));
+                val.push_back(0x80 | (dst & 0x3f));
+              }
+              else // defensive only: four hex digits cannot produce a value above 0xffff
+              {
+                ASSERT_MES_AND_THROW("Unicode code point is out or range");
+              }
+            }
+            break;
           default:
             val.push_back(*it);
             LOG_PRINT_L0("Unknown escape sequence :\"\\" << *it << "\"");
diff --git a/contrib/epee/include/string_tools.h b/contrib/epee/include/string_tools.h
index da47b7d55..1be5eb5e1 100644
--- a/contrib/epee/include/string_tools.h
+++ b/contrib/epee/include/string_tools.h
@@ -59,26 +59,6 @@
 #pragma comment (lib, "Rpcrt4.lib")
 #endif
 
-static const constexpr unsigned char isx[256] =
-{
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-};
-
 namespace epee
 {
 namespace string_tools
@@ -99,10 +79,10 @@ namespace string_tools
     for(size_t i = 0; i < s.size(); i += 2)
     {
       int tmp = *src++;
-      tmp = isx[tmp];
+      tmp = epee::misc_utils::parse::isx[tmp];
       if (tmp == 0xff) return false;
       int t2 = *src++;
-      t2 = isx[t2];
+      t2 = epee::misc_utils::parse::isx[t2];
       if (t2 == 0xff) return false;
       *dst++ = (tmp << 4) | t2;
     }
diff --git a/tests/unit_tests/epee_utils.cpp b/tests/unit_tests/epee_utils.cpp
index 946731826..c5f9a42d0 100644
--- a/tests/unit_tests/epee_utils.cpp
+++ b/tests/unit_tests/epee_utils.cpp
@@ -946,3 +946,20 @@ TEST(parsing, number)
   epee::misc_utils::parse::match_number(i, s.end(), val);
   ASSERT_EQ(val, "+9.34e+03");
 }
+
+TEST(parsing, unicode)
+{
+  std::string bs;
+  std::string s;
+  std::string::const_iterator si;
+
+  s = "\"\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "");
+  s = "\"\\u0000\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, std::string(1, '\0'));
+  s = "\"\\u0020\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, " ");
+  s = "\"\\u1\""; si = s.begin(); ASSERT_FALSE(epee::misc_utils::parse::match_string(si, s.end(), bs));
+  s = "\"\\u12\""; si = s.begin(); ASSERT_FALSE(epee::misc_utils::parse::match_string(si, s.end(), bs));
+  s = "\"\\u123\""; si = s.begin(); ASSERT_FALSE(epee::misc_utils::parse::match_string(si, s.end(), bs));
+  s = "\"\\u1234\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "ሴ");
+  s = "\"foo\\u1234bar\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "fooሴbar");
+  s = "\"\\u3042\\u307e\\u3084\\u304b\\u3059\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "あまやかす");
+}