Skip to content

Commit 0da2a00

Browse files
authored
branch-3.1: [fix](json) Add . after $ in JSON path to support correct token parsing #52543 (#52794)
pick #52543
1 parent b987e64 commit 0da2a00

7 files changed

Lines changed: 352 additions & 362 deletions

File tree

be/src/vec/functions/function_json.cpp

Lines changed: 44 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
#include <glog/logging.h>
1819
#include <rapidjson/allocators.h>
1920
#include <rapidjson/document.h>
2021
#include <rapidjson/encodings.h>
@@ -144,45 +145,7 @@ rapidjson::Value* match_value(const std::vector<JsonPath>& parsed_paths, rapidjs
144145
const std::string& col = parsed_paths[i].key;
145146
int index = parsed_paths[i].idx;
146147
if (LIKELY(!col.empty())) {
147-
if (root->IsArray()) {
148-
array_obj = static_cast<rapidjson::Value*>(
149-
mem_allocator.Malloc(sizeof(rapidjson::Value)));
150-
array_obj->SetArray();
151-
bool is_null = true;
152-
153-
// if array ,loop the array,find out all Objects,then find the results from the objects
154-
for (int j = 0; j < root->Size(); j++) {
155-
rapidjson::Value* json_elem = &((*root)[j]);
156-
157-
if (json_elem->IsArray() || json_elem->IsNull()) {
158-
continue;
159-
} else {
160-
if (!json_elem->IsObject()) {
161-
continue;
162-
}
163-
if (!json_elem->HasMember(col.c_str())) {
164-
if (is_insert_null) { // not found item, then insert a null object.
165-
is_null = false;
166-
rapidjson::Value nullObject(rapidjson::kNullType);
167-
array_obj->PushBack(nullObject, mem_allocator);
168-
}
169-
continue;
170-
}
171-
rapidjson::Value* obj = &((*json_elem)[col.c_str()]);
172-
if (obj->IsArray()) {
173-
is_null = false;
174-
for (int k = 0; k < obj->Size(); k++) {
175-
array_obj->PushBack((*obj)[k], mem_allocator);
176-
}
177-
} else if (!obj->IsNull()) {
178-
is_null = false;
179-
array_obj->PushBack(*obj, mem_allocator);
180-
}
181-
}
182-
}
183-
184-
root = is_null ? &(array_obj->SetNull()) : array_obj;
185-
} else if (root->IsObject()) {
148+
if (root->IsObject()) {
186149
if (!root->HasMember(col.c_str())) {
187150
return nullptr;
188151
} else {
@@ -233,8 +196,17 @@ rapidjson::Value* get_json_object(std::string_view json_string, std::string_view
233196

234197
//Cannot use '\' as the last character, return NULL
235198
if (path_string.back() == '\\') {
236-
document->SetNull();
237-
return document;
199+
return nullptr;
200+
}
201+
202+
std::string fixed_string;
203+
if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != '.') {
204+
// Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens.
205+
// Without this, expressions like "$[0].key" cannot be properly split.
206+
// This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior.
207+
fixed_string = "$.";
208+
fixed_string += path_string.substr(1);
209+
path_string = fixed_string;
238210
}
239211

240212
try {
@@ -251,13 +223,13 @@ rapidjson::Value* get_json_object(std::string_view json_string, std::string_view
251223
}
252224
} catch (boost::escaped_list_error&) {
253225
// meet unknown escape sequence, example '$.name\k'
254-
return document;
226+
return nullptr;
255227
}
256228

257229
parsed_paths = &tmp_parsed_paths;
258230

259231
if (!(*parsed_paths)[0].is_valid) {
260-
return document;
232+
return nullptr;
261233
}
262234

263235
if (UNLIKELY((*parsed_paths).size() == 1)) {
@@ -272,8 +244,7 @@ rapidjson::Value* get_json_object(std::string_view json_string, std::string_view
272244
if (UNLIKELY(document->HasParseError())) {
273245
// VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
274246
// << GetParseError_En(document->GetParseError());
275-
document->SetNull();
276-
return document;
247+
return nullptr;
277248
}
278249

279250
return match_value(*parsed_paths, document, document->GetAllocator());
@@ -858,9 +829,10 @@ template <typename Name, bool remove_quotes>
858829
struct FunctionJsonExtractImpl {
859830
static constexpr auto name = Name::name;
860831

861-
static rapidjson::Value parse_json(const ColumnString* json_col, const ColumnString* path_col,
862-
rapidjson::Document::AllocatorType& allocator, const int row,
863-
const int col, std::vector<bool>& column_is_consts) {
832+
static std::pair<bool, rapidjson::Value> parse_json(
833+
const ColumnString* json_col, const ColumnString* path_col,
834+
rapidjson::Document::AllocatorType& allocator, const int row, const int col,
835+
std::vector<bool>& column_is_consts) {
864836
rapidjson::Value value;
865837
rapidjson::Document document;
866838

@@ -869,10 +841,13 @@ struct FunctionJsonExtractImpl {
869841
const auto path = path_col->get_data_at(index_check_const(row, column_is_consts[col]));
870842
std::string_view path_string(path.data, path.size);
871843
auto* root = get_json_object<JSON_FUN_STRING>(json_string, path_string, &document);
844+
bool found = false;
872845
if (root != nullptr) {
846+
found = true;
873847
value.CopyFrom(*root, allocator);
874848
}
875-
return value;
849+
850+
return {found, std::move(value)};
876851
}
877852

878853
static rapidjson::Value* get_document(const ColumnString* path_col,
@@ -913,8 +888,9 @@ struct FunctionJsonExtractImpl {
913888
rapidjson::StringBuffer buf;
914889
rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
915890
const auto* json_col = data_columns[0];
916-
auto insert_result_lambda = [&](rapidjson::Value& value, int row) {
917-
if (value.IsNull()) {
891+
892+
auto insert_result_lambda = [&](rapidjson::Value& value, bool is_null, int row) {
893+
if (is_null) {
918894
null_map[row] = 1;
919895
result_column.insert_default();
920896
} else {
@@ -935,12 +911,13 @@ struct FunctionJsonExtractImpl {
935911
}
936912
};
937913
if (data_columns.size() == 2) {
938-
rapidjson::Value value;
939914
if (column_is_consts[1]) {
940915
std::vector<JsonPath> parsed_paths;
941916
auto* root = get_document(data_columns[1], &document, parsed_paths, 0,
942917
column_is_consts[1]);
943918
for (size_t row = 0; row < input_rows_count; row++) {
919+
bool is_null = false;
920+
rapidjson::Value value;
944921
if (root != nullptr) {
945922
const auto& obj = json_col->get_data_at(row);
946923
std::string_view json_string(obj.data, obj.size);
@@ -957,17 +934,18 @@ struct FunctionJsonExtractImpl {
957934
if (root_val != nullptr) {
958935
value.CopyFrom(*root_val, allocator);
959936
} else {
960-
rapidjson::Value tmp;
961-
value.Swap(tmp);
937+
is_null = true;
962938
}
939+
} else {
940+
is_null = true;
963941
}
964-
insert_result_lambda(value, row);
942+
insert_result_lambda(value, is_null, row);
965943
}
966944
} else {
967945
for (size_t row = 0; row < input_rows_count; row++) {
968-
value = parse_json(json_col, data_columns[1], allocator, row, 1,
969-
column_is_consts);
970-
insert_result_lambda(value, row);
946+
auto result = parse_json(json_col, data_columns[1], allocator, row, 1,
947+
column_is_consts);
948+
insert_result_lambda(result.second, !result.first, row);
971949
}
972950
}
973951

@@ -977,12 +955,16 @@ struct FunctionJsonExtractImpl {
977955
value.Reserve(data_columns.size() - 1, allocator);
978956
for (size_t row = 0; row < input_rows_count; row++) {
979957
value.Clear();
958+
bool found_any = false;
980959
for (size_t col = 1; col < data_columns.size(); ++col) {
981-
value.PushBack(parse_json(json_col, data_columns[col], allocator, row, col,
982-
column_is_consts),
983-
allocator);
960+
auto result = parse_json(json_col, data_columns[col], allocator, row, col,
961+
column_is_consts);
962+
if (result.first) {
963+
found_any = true;
964+
value.PushBack(std::move(result.second), allocator);
965+
}
984966
}
985-
insert_result_lambda(value, row);
967+
insert_result_lambda(value, !found_any, row);
986968
}
987969
}
988970
}

be/test/vec/function/function_json_test.cpp

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -72,15 +72,13 @@ TEST(FunctionJsonTEST, GetJsonStringTest) {
7272
std::string func_name = "get_json_string";
7373
InputTypeSet input_types = {TypeIndex::String, TypeIndex::String};
7474
DataSet data_set = {
75-
{{VARCHAR("{\"k1\":\"v1\", \"k2\":\"v2\"}"), VARCHAR("$.k1")}, VARCHAR("v1")},
76-
{{VARCHAR("{\"k1\":\"v1\", \"my.key\":[\"e1\", \"e2\", \"e3\"]}"),
77-
VARCHAR("$.\"my.key\"[1]")},
75+
{{VARCHAR(R"({"k1":"v1", "k2":"v2"})"), VARCHAR("$.k1")}, VARCHAR("v1")},
76+
{{VARCHAR(R"({"k1":"v1", "my.key":["e1", "e2", "e3"]})"), VARCHAR("$.\"my.key\"[1]")},
7877
VARCHAR("e2")},
79-
{{VARCHAR("{\"k1.key\":{\"k2\":[\"v1\", \"v2\"]}}"), VARCHAR("$.\"k1.key\".k2[0]")},
78+
{{VARCHAR(R"({"k1.key":{"k2":["v1", "v2"]}})"), VARCHAR("$.\"k1.key\".k2[0]")},
8079
VARCHAR("v1")},
81-
{{VARCHAR("[{\"k1\":\"v1\"}, {\"k2\":\"v2\"}, {\"k1\":\"v3\"}, {\"k1\":\"v4\"}]"),
82-
VARCHAR("$.k1")},
83-
VARCHAR("[\"v1\",\"v3\",\"v4\"]")}};
80+
{{VARCHAR(R"([{"k1":"v1"}, {"k2":"v2"}, {"k1":"v3"}, {"k1":"v4"}])"), VARCHAR("$.k1")},
81+
Null()}};
8482

8583
static_cast<void>(check_function<DataTypeString, true>(func_name, input_types, data_set));
8684
}
@@ -93,7 +91,7 @@ TEST(FunctionJsonTEST, JsonExtractTest) {
9391
// json_extract root
9492
DataSet data_set = {
9593
{{Null(), STRING("$")}, Null()},
96-
{{STRING("null"), STRING("$")}, Null()},
94+
{{STRING("null"), STRING("$")}, STRING("null")},
9795
{{STRING("true"), STRING("$")}, STRING("true")},
9896
{{STRING("false"), STRING("$")}, STRING("false")},
9997
{{STRING("100"), STRING("$")}, STRING("100")}, //int8
@@ -127,7 +125,7 @@ TEST(FunctionJsonTEST, JsonExtractTest) {
127125

128126
data_set = {
129127
{{Null(), STRING("$")}, Null()},
130-
{{STRING("null"), STRING("$")}, Null()},
128+
{{STRING("null"), STRING("$")}, STRING("null")},
131129
{{STRING("true"), STRING("$")}, STRING("true")},
132130
{{STRING("false"), STRING("$")}, STRING("false")},
133131
{{STRING("100"), STRING("$")}, STRING("100")}, //int8

0 commit comments

Comments
 (0)