JsonEncoder: let the serializer replace invalid UTF-8 characters

Replacing invalid UTF-8 characters beforehand by ourselves doesn't make
any sense; the serializer can literally perform the same replacement ops
with the exact same Unicode replacement character (U+FFFD) on its own.
So, why not just use it directly? Instead of wasting memory on a temporary
`String` object to always UTF-8 validate each and every value, we just
use the serializer to directly dump the replaced char (if any) into
the output writer. No memory waste, no fuss!
This commit is contained in:
Yonas Habteab 2025-07-04 16:51:26 +02:00
parent dad4c0889f
commit 89418f38ee
2 changed files with 9 additions and 8 deletions

View File

@ -5,6 +5,7 @@
#include "base/dictionary.hpp" #include "base/dictionary.hpp"
#include "base/namespace.hpp" #include "base/namespace.hpp"
#include "base/objectlock.hpp" #include "base/objectlock.hpp"
#include "base/utility.hpp"
#include <boost/numeric/conversion/cast.hpp> #include <boost/numeric/conversion/cast.hpp>
#include <stack> #include <stack>
#include <utility> #include <utility>
@ -56,7 +57,7 @@ void JsonEncoder::Encode(const Value& value, boost::asio::yield_context* yc)
Write(value.ToBool() ? "true" : "false"); Write(value.ToBool() ? "true" : "false");
break; break;
case ValueString: case ValueString:
EncodeNlohmannJson(Utility::ValidateUTF8(value.Get<String>())); EncodeNlohmannJson(value.Get<String>());
break; break;
case ValueNumber: case ValueNumber:
EncodeNumber(value.Get<double>()); EncodeNumber(value.Get<double>());
@ -76,7 +77,7 @@ void JsonEncoder::Encode(const Value& value, boost::asio::yield_context* yc)
EncodeValueGenerator(gen, yc); EncodeValueGenerator(gen, yc);
} else { } else {
// Some other non-serializable object type! // Some other non-serializable object type!
EncodeNlohmannJson(Utility::ValidateUTF8(obj->ToString())); EncodeNlohmannJson(obj->ToString());
} }
break; break;
} }
@ -166,7 +167,7 @@ void JsonEncoder::EncodeObject(const Iterable& container, const ValExtractor& ex
WriteSeparatorAndIndentStrIfNeeded(!isEmpty); WriteSeparatorAndIndentStrIfNeeded(!isEmpty);
isEmpty = false; isEmpty = false;
EncodeNlohmannJson(Utility::ValidateUTF8(key)); EncodeNlohmannJson(key);
Write(m_Pretty ? ": " : ":"); Write(m_Pretty ? ": " : ":");
Encode(extractor(val), yc); Encode(extractor(val), yc);
@ -179,13 +180,15 @@ void JsonEncoder::EncodeObject(const Iterable& container, const ValExtractor& ex
* Dumps a nlohmann::json object to the output stream using the serializer. * Dumps a nlohmann::json object to the output stream using the serializer.
* *
* This function uses the @c nlohmann::detail::serializer to dump the provided @c nlohmann::json * This function uses the @c nlohmann::detail::serializer to dump the provided @c nlohmann::json
* object to the output stream managed by the @c JsonEncoder. * object to the output stream managed by the @c JsonEncoder. Strings will be properly escaped, and
* if any invalid UTF-8 sequences are encountered, it will replace them with the Unicode replacement
* character (U+FFFD).
* *
* @param json The nlohmann::json object to encode. * @param json The nlohmann::json object to encode.
*/ */
void JsonEncoder::EncodeNlohmannJson(const nlohmann::json& json) const void JsonEncoder::EncodeNlohmannJson(const nlohmann::json& json) const
{ {
nlohmann::detail::serializer<nlohmann::json> s(m_Writer, ' ', nlohmann::json::error_handler_t::strict); nlohmann::detail::serializer<nlohmann::json> s(m_Writer, ' ', nlohmann::json::error_handler_t::replace);
s.dump(json, m_Pretty, true, 0, 0); s.dump(json, m_Pretty, true, 0, 0);
} }

View File

@ -6,7 +6,6 @@
#include "base/i2-base.hpp" #include "base/i2-base.hpp"
#include "base/array.hpp" #include "base/array.hpp"
#include "base/generator.hpp" #include "base/generator.hpp"
#include "base/utility.hpp"
#include <boost/asio/spawn.hpp> #include <boost/asio/spawn.hpp>
#include <json.hpp> #include <json.hpp>
@ -58,8 +57,7 @@ class Value;
* The JSON encoder generates most of the low level JSON tokens, but it still relies on the already existing * The JSON encoder generates most of the low level JSON tokens, but it still relies on the already existing
* @c nlohmann::detail::serializer<> class to dump numbers and ASCII validated JSON strings. This means that the * @c nlohmann::detail::serializer<> class to dump numbers and ASCII validated JSON strings. This means that the
* encoder doesn't perform any kind of JSON validation or escaping on its own, but simply delegates all this kind * encoder doesn't perform any kind of JSON validation or escaping on its own, but simply delegates all this kind
* of work to serializer<>. However, Strings are UTF-8 validated beforehand using the @c Utility::ValidateUTF8() * of work to serializer<>.
* function and only the validated (copy of the original) String is passed to the serializer.
* *
* The generated JSON can be either prettified or compact, depending on your needs. The prettified JSON object * The generated JSON can be either prettified or compact, depending on your needs. The prettified JSON object
* is indented with 4 spaces and grows linearly with the depth of the object tree. * is indented with 4 spaces and grows linearly with the depth of the object tree.