Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 116 additions & 11 deletions include/nlohmann/detail/output/serializer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ namespace detail
// serialization //
///////////////////

/// how to treat decoding errors
enum class error_handler_t
{
strict, ///< throw a type_error exception in case of invalid UTF-8
replace, ///< replace invalid UTF-8 sequences with U+FFFD
ignore ///< ignore invalid UTF-8 sequences
};

template<typename BasicJsonType>
class serializer
{
Expand All @@ -42,12 +50,17 @@ class serializer
/*!
@param[in] s output stream to serialize to
@param[in] ichar indentation character to use
@param[in] error_handler_ how to react on decoding errors
*/
serializer(output_adapter_t<char> s, const char ichar)
: o(std::move(s)), loc(std::localeconv()),
thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)),
decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)),
indent_char(ichar), indent_string(512, indent_char)
serializer(output_adapter_t<char> s, const char ichar,
error_handler_t error_handler_ = error_handler_t::strict)
: o(std::move(s))
, loc(std::localeconv())
, thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep))
, decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point))
, indent_char(ichar)
, indent_string(512, indent_char)
, error_handler(error_handler_)
{}

// delete because of pointer members
Expand Down Expand Up @@ -287,6 +300,10 @@ class serializer
uint8_t state = UTF8_ACCEPT;
std::size_t bytes = 0; // number of bytes written to string_buffer

// number of bytes written at the point of the last valid byte
std::size_t bytes_after_last_accept = 0;
std::size_t undumped_chars = 0;

for (std::size_t i = 0; i < s.size(); ++i)
{
const auto byte = static_cast<uint8_t>(s[i]);
Expand Down Expand Up @@ -384,14 +401,68 @@ class serializer
o->write_characters(string_buffer.data(), bytes);
bytes = 0;
}

// remember the byte position of this accept
bytes_after_last_accept = bytes;
undumped_chars = 0;
break;
}

case UTF8_REJECT: // decode found invalid UTF-8 byte
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", byte);
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
switch (error_handler)
{
case error_handler_t::strict:
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", byte);
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
}

case error_handler_t::ignore:
case error_handler_t::replace:
{
// in case we saw this character the first time, we
// would like to read it again, because the byte
// may be OK for itself, but just not OK for the
// previous sequence
if (undumped_chars > 0)
{
--i;
}

// reset length buffer to the last accepted index;
// thus removing/ignoring the invalid characters
bytes = bytes_after_last_accept;

if (error_handler == error_handler_t::replace)
{
// add a replacement character
if (ensure_ascii)
{
string_buffer[bytes++] = '\\';
string_buffer[bytes++] = 'u';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'd';
}
else
{
string_buffer[bytes++] = '\xEF';
string_buffer[bytes++] = '\xBF';
string_buffer[bytes++] = '\xBD';
}
bytes_after_last_accept = bytes;
}

undumped_chars = 0;

// continue processing the string
state = UTF8_ACCEPT;
continue;
}
}
}

default: // decode found yet incomplete multi-byte code point
Expand All @@ -401,11 +472,13 @@ class serializer
// code point will not be escaped - copy byte to buffer
string_buffer[bytes++] = s[i];
}
++undumped_chars;
break;
}
}
}

// we finished processing the string
if (JSON_LIKELY(state == UTF8_ACCEPT))
{
// write buffer
Expand All @@ -417,9 +490,38 @@ class serializer
else
{
// we finish reading, but do not accept: string was incomplete
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
switch (error_handler)
{
case error_handler_t::strict:
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
}

case error_handler_t::ignore:
{
// write all accepted bytes
o->write_characters(string_buffer.data(), bytes_after_last_accept);
break;
}

case error_handler_t::replace:
{
// write all accepted bytes
o->write_characters(string_buffer.data(), bytes_after_last_accept);
// add a replacement character
if (ensure_ascii)
{
o->write_characters("\\ufffd", 6);
}
else
{
o->write_characters("\xEF\xBF\xBD", 3);
}
break;
}
}
}
}

Expand Down Expand Up @@ -629,6 +731,9 @@ class serializer
const char indent_char;
/// the indentation string
string_t indent_string;

/// error_handler how to react on decoding errors
const error_handler_t error_handler;
};
} // namespace detail
} // namespace nlohmann
17 changes: 13 additions & 4 deletions include/nlohmann/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ class basic_json
using json_pointer = ::nlohmann::json_pointer<basic_json>;
template<typename T, typename SFINAE>
using json_serializer = JSONSerializer<T, SFINAE>;
/// how to treat decoding errors
using error_handler_t = detail::error_handler_t;
/// helper type for initializer lists of basic_json values
using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;

Expand Down Expand Up @@ -1932,6 +1934,10 @@ class basic_json
@param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
in the output are escaped with `\uXXXX` sequences, and the result consists
of ASCII characters only.
@param[in] error_handler how to react on decoding errors; there are three
possible values: `strict` (throws and exception in case a decoding error
occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD),
and `ignore` (ignore invalid UTF-8 sequences during serialization).

@return string containing the serialization of the JSON value

Expand All @@ -1950,13 +1956,16 @@ class basic_json
@see https://docs.python.org/2/library/json.html#json.dump

@since version 1.0.0; indentation character @a indent_char, option
@a ensure_ascii and exceptions added in version 3.0.0
@a ensure_ascii and exceptions added in version 3.0.0; error
handlers added in version 3.4.0.
*/
string_t dump(const int indent = -1, const char indent_char = ' ',
const bool ensure_ascii = false) const
string_t dump(const int indent = -1,
const char indent_char = ' ',
const bool ensure_ascii = false,
const error_handler_t error_handler = error_handler_t::strict) const
{
string_t result;
serializer s(detail::output_adapter<char, string_t>(result), indent_char);
serializer s(detail::output_adapter<char, string_t>(result), indent_char, error_handler);

if (indent >= 0)
{
Expand Down
Loading