Skip to content

Commit 26cfdfc

Browse files
mrhhsgkoarz
authored and committed
[fix](json) Add "." after "$" in JSON path to support correct token parsing (apache#52543)
Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens. Without this, expressions like "$[0].key" cannot be properly split, causing issues in downstream logic. This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior.
1 parent a4535eb commit 26cfdfc

7 files changed

Lines changed: 351 additions & 363 deletions

File tree

be/src/vec/functions/function_json.cpp

Lines changed: 43 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
#include <glog/logging.h>
1819
#include <rapidjson/allocators.h>
1920
#include <rapidjson/document.h>
2021
#include <rapidjson/encodings.h>
@@ -140,45 +141,7 @@ rapidjson::Value* match_value(const std::vector<JsonPath>& parsed_paths, rapidjs
140141
const std::string& col = parsed_paths[i].key;
141142
int index = parsed_paths[i].idx;
142143
if (LIKELY(!col.empty())) {
143-
if (root->IsArray()) {
144-
array_obj = static_cast<rapidjson::Value*>(
145-
mem_allocator.Malloc(sizeof(rapidjson::Value)));
146-
array_obj->SetArray();
147-
bool is_null = true;
148-
149-
// if array ,loop the array,find out all Objects,then find the results from the objects
150-
for (int j = 0; j < root->Size(); j++) {
151-
rapidjson::Value* json_elem = &((*root)[j]);
152-
153-
if (json_elem->IsArray() || json_elem->IsNull()) {
154-
continue;
155-
} else {
156-
if (!json_elem->IsObject()) {
157-
continue;
158-
}
159-
if (!json_elem->HasMember(col.c_str())) {
160-
if (is_insert_null) { // not found item, then insert a null object.
161-
is_null = false;
162-
rapidjson::Value nullObject(rapidjson::kNullType);
163-
array_obj->PushBack(nullObject, mem_allocator);
164-
}
165-
continue;
166-
}
167-
rapidjson::Value* obj = &((*json_elem)[col.c_str()]);
168-
if (obj->IsArray()) {
169-
is_null = false;
170-
for (int k = 0; k < obj->Size(); k++) {
171-
array_obj->PushBack((*obj)[k], mem_allocator);
172-
}
173-
} else if (!obj->IsNull()) {
174-
is_null = false;
175-
array_obj->PushBack(*obj, mem_allocator);
176-
}
177-
}
178-
}
179-
180-
root = is_null ? &(array_obj->SetNull()) : array_obj;
181-
} else if (root->IsObject()) {
144+
if (root->IsObject()) {
182145
if (!root->HasMember(col.c_str())) {
183146
return nullptr;
184147
} else {
@@ -229,8 +192,17 @@ rapidjson::Value* get_json_object(std::string_view json_string, std::string_view
229192

230193
//Cannot use '\' as the last character, return NULL
231194
if (path_string.back() == '\\') {
232-
document->SetNull();
233-
return document;
195+
return nullptr;
196+
}
197+
198+
std::string fixed_string;
199+
if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != '.') {
200+
// Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens.
201+
// Without this, expressions like "$[0].key" cannot be properly split.
202+
// This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior.
203+
fixed_string = "$.";
204+
fixed_string += path_string.substr(1);
205+
path_string = fixed_string;
234206
}
235207

236208
try {
@@ -247,13 +219,13 @@ rapidjson::Value* get_json_object(std::string_view json_string, std::string_view
247219
}
248220
} catch (boost::escaped_list_error&) {
249221
// meet unknown escape sequence, example '$.name\k'
250-
return document;
222+
return nullptr;
251223
}
252224

253225
parsed_paths = &tmp_parsed_paths;
254226

255227
if (!(*parsed_paths)[0].is_valid) {
256-
return document;
228+
return nullptr;
257229
}
258230

259231
if (UNLIKELY((*parsed_paths).size() == 1)) {
@@ -270,8 +242,7 @@ rapidjson::Value* get_json_object(std::string_view json_string, std::string_view
270242
if (UNLIKELY(document->HasParseError())) {
271243
// VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
272244
// << GetParseError_En(document->GetParseError());
273-
document->SetNull();
274-
return document;
245+
return nullptr;
275246
}
276247

277248
return match_value(*parsed_paths, document, document->GetAllocator());
@@ -856,10 +827,10 @@ template <typename Name, bool remove_quotes>
856827
struct FunctionJsonExtractImpl {
857828
static constexpr auto name = Name::name;
858829

859-
static rapidjson::Value parse_json(const ColumnString* json_col, const ColumnString* path_col,
860-
rapidjson::Document::AllocatorType& allocator,
861-
const size_t row, const size_t col,
862-
std::vector<bool>& column_is_consts) {
830+
static std::pair<bool, rapidjson::Value> parse_json(
831+
const ColumnString* json_col, const ColumnString* path_col,
832+
rapidjson::Document::AllocatorType& allocator, const size_t row, const size_t col,
833+
std::vector<bool>& column_is_consts) {
863834
rapidjson::Value value;
864835
rapidjson::Document document;
865836

@@ -868,10 +839,13 @@ struct FunctionJsonExtractImpl {
868839
const auto path = path_col->get_data_at(index_check_const(row, column_is_consts[col]));
869840
std::string_view path_string(path.data, path.size);
870841
auto* root = get_json_object<JSON_FUN_STRING>(json_string, path_string, &document);
842+
bool found = false;
871843
if (root != nullptr) {
844+
found = true;
872845
value.CopyFrom(*root, allocator);
873846
}
874-
return value;
847+
848+
return {found, std::move(value)};
875849
}
876850

877851
static rapidjson::Value* get_document(const ColumnString* path_col,
@@ -912,8 +886,8 @@ struct FunctionJsonExtractImpl {
912886
rapidjson::StringBuffer buf;
913887
rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
914888
const auto* json_col = data_columns[0];
915-
auto insert_result_lambda = [&](rapidjson::Value& value, size_t row) {
916-
if (value.IsNull()) {
889+
auto insert_result_lambda = [&](rapidjson::Value& value, bool is_null, size_t row) {
890+
if (is_null) {
917891
null_map[row] = 1;
918892
result_column.insert_default();
919893
} else {
@@ -934,12 +908,13 @@ struct FunctionJsonExtractImpl {
934908
}
935909
};
936910
if (data_columns.size() == 2) {
937-
rapidjson::Value value;
938911
if (column_is_consts[1]) {
939912
std::vector<JsonPath> parsed_paths;
940913
auto* root = get_document(data_columns[1], &document, parsed_paths, 0,
941914
column_is_consts[1]);
942915
for (size_t row = 0; row < input_rows_count; row++) {
916+
bool is_null = false;
917+
rapidjson::Value value;
943918
if (root != nullptr) {
944919
const auto& obj = json_col->get_data_at(row);
945920
std::string_view json_string(obj.data, obj.size);
@@ -958,17 +933,18 @@ struct FunctionJsonExtractImpl {
958933
if (root_val != nullptr) {
959934
value.CopyFrom(*root_val, allocator);
960935
} else {
961-
rapidjson::Value tmp;
962-
value.Swap(tmp);
936+
is_null = true;
963937
}
938+
} else {
939+
is_null = true;
964940
}
965-
insert_result_lambda(value, row);
941+
insert_result_lambda(value, is_null, row);
966942
}
967943
} else {
968944
for (size_t row = 0; row < input_rows_count; row++) {
969-
value = parse_json(json_col, data_columns[1], allocator, row, 1,
970-
column_is_consts);
971-
insert_result_lambda(value, row);
945+
auto result = parse_json(json_col, data_columns[1], allocator, row, 1,
946+
column_is_consts);
947+
insert_result_lambda(result.second, !result.first, row);
972948
}
973949
}
974950

@@ -978,12 +954,16 @@ struct FunctionJsonExtractImpl {
978954
value.Reserve(cast_set<rapidjson::SizeType>(data_columns.size() - 1), allocator);
979955
for (size_t row = 0; row < input_rows_count; row++) {
980956
value.Clear();
957+
bool found_any = false;
981958
for (size_t col = 1; col < data_columns.size(); ++col) {
982-
value.PushBack(parse_json(json_col, data_columns[col], allocator, row, col,
983-
column_is_consts),
984-
allocator);
959+
auto result = parse_json(json_col, data_columns[col], allocator, row, col,
960+
column_is_consts);
961+
if (result.first) {
962+
found_any = true;
963+
value.PushBack(std::move(result.second), allocator);
964+
}
985965
}
986-
insert_result_lambda(value, row);
966+
insert_result_lambda(value, !found_any, row);
987967
}
988968
}
989969
}

be/test/vec/function/function_json_test.cpp

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,13 @@ TEST(FunctionJsonTEST, GetJsonStringTest) {
7272
std::string func_name = "get_json_string";
7373
InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR, PrimitiveType::TYPE_VARCHAR};
7474
DataSet data_set = {
75-
{{VARCHAR("{\"k1\":\"v1\", \"k2\":\"v2\"}"), VARCHAR("$.k1")}, VARCHAR("v1")},
76-
{{VARCHAR("{\"k1\":\"v1\", \"my.key\":[\"e1\", \"e2\", \"e3\"]}"),
77-
VARCHAR("$.\"my.key\"[1]")},
75+
{{VARCHAR(R"({"k1":"v1", "k2":"v2"})"), VARCHAR("$.k1")}, VARCHAR("v1")},
76+
{{VARCHAR(R"({"k1":"v1", "my.key":["e1", "e2", "e3"]})"), VARCHAR("$.\"my.key\"[1]")},
7877
VARCHAR("e2")},
79-
{{VARCHAR("{\"k1.key\":{\"k2\":[\"v1\", \"v2\"]}}"), VARCHAR("$.\"k1.key\".k2[0]")},
78+
{{VARCHAR(R"({"k1.key":{"k2":["v1", "v2"]}})"), VARCHAR("$.\"k1.key\".k2[0]")},
8079
VARCHAR("v1")},
81-
{{VARCHAR("[{\"k1\":\"v1\"}, {\"k2\":\"v2\"}, {\"k1\":\"v3\"}, {\"k1\":\"v4\"}]"),
82-
VARCHAR("$.k1")},
83-
VARCHAR("[\"v1\",\"v3\",\"v4\"]")}};
80+
{{VARCHAR(R"([{"k1":"v1"}, {"k2":"v2"}, {"k1":"v3"}, {"k1":"v4"}])"), VARCHAR("$.k1")},
81+
Null()}};
8482

8583
static_cast<void>(check_function<DataTypeString, true>(func_name, input_types, data_set));
8684
}
@@ -93,7 +91,7 @@ TEST(FunctionJsonTEST, JsonExtractTest) {
9391
// json_extract root
9492
DataSet data_set = {
9593
{{Null(), STRING("$")}, Null()},
96-
{{STRING("null"), STRING("$")}, Null()},
94+
{{STRING("null"), STRING("$")}, STRING("null")},
9795
{{STRING("true"), STRING("$")}, STRING("true")},
9896
{{STRING("false"), STRING("$")}, STRING("false")},
9997
{{STRING("100"), STRING("$")}, STRING("100")}, //int8
@@ -127,7 +125,7 @@ TEST(FunctionJsonTEST, JsonExtractTest) {
127125

128126
data_set = {
129127
{{Null(), STRING("$")}, Null()},
130-
{{STRING("null"), STRING("$")}, Null()},
128+
{{STRING("null"), STRING("$")}, STRING("null")},
131129
{{STRING("true"), STRING("$")}, STRING("true")},
132130
{{STRING("false"), STRING("$")}, STRING("false")},
133131
{{STRING("100"), STRING("$")}, STRING("100")}, //int8

0 commit comments

Comments
 (0)