From 55bb81c8212737e85e0a8b3ffd818e850387214e Mon Sep 17 00:00:00 2001 From: Balint Uveges Date: Thu, 2 Apr 2026 09:18:56 +0200 Subject: [PATCH 1/5] Libyang erroneously interprets patterns with multiple Unicode blocks: libyang maps every Unicode block to the Latin-1 Supplement Unicode block if multiple Unicode blocks are present within a single [ ] bracket expression. This commit corrects the fault and introduces multiple unit tests covering the correct behavior. On branch unicode_block_bug Changes to be committed: modified: src/ly_common.c modified: tests/utests/types/string.c --- src/ly_common.c | 8 ++++--- tests/utests/types/string.c | 47 +++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/src/ly_common.c b/src/ly_common.c index b24dd1f38..10b815d0c 100644 --- a/src/ly_common.c +++ b/src/ly_common.c @@ -4,6 +4,7 @@ * @brief common internal definitions for libyang * * Copyright (c) 2018 - 2026 CESNET, z.s.p.o. + * Copyright (c) 2026 Nokia * * This source code is licensed under BSD 3-Clause License (the "License"). * You may not use this file except in compliance with the License. 
@@ -813,7 +814,7 @@ ly_pat_compile_xmlschema_chblocks_xmlschema2perl(const char *pattern, char **reg {NULL, NULL} }; - size_t idx, idx2, start, end; + size_t idx, idx2, start, end, ublock; char *perl_regex, *ptr; perl_regex = *regex; @@ -849,6 +850,7 @@ ly_pat_compile_xmlschema_chblocks_xmlschema2perl(const char *pattern, char **reg return ly_err_new(err, LY_EVALID, 0, NULL, NULL, "Regular expression \"%s\" is not valid (\"%s\": %s).", pattern, perl_regex + start + 5, "unknown block name"); } + ublock = idx; /* make the space in the string and replace the block (but we cannot include brackets if it was already enclosed in them) */ for (idx2 = 0, idx = 0; idx2 < start; ++idx2) { @@ -863,10 +865,10 @@ ly_pat_compile_xmlschema_chblocks_xmlschema2perl(const char *pattern, char **reg if (idx) { /* skip brackets */ memmove(perl_regex + start + (URANGE_LEN - 2), perl_regex + end, strlen(perl_regex + end) + 1); - memcpy(perl_regex + start, ublock2urange[idx][1] + 1, URANGE_LEN - 2); + memcpy(perl_regex + start, ublock2urange[ublock][1] + 1, URANGE_LEN - 2); } else { memmove(perl_regex + start + URANGE_LEN, perl_regex + end, strlen(perl_regex + end) + 1); - memcpy(perl_regex + start, ublock2urange[idx][1], URANGE_LEN); + memcpy(perl_regex + start, ublock2urange[ublock][1], URANGE_LEN); } } diff --git a/tests/utests/types/string.c b/tests/utests/types/string.c index a921427ad..e3608709a 100644 --- a/tests/utests/types/string.c +++ b/tests/utests/types/string.c @@ -4,6 +4,7 @@ * @brief test for string values * * Copyright (c) 2021 CESNET, z.s.p.o. + * Copyright (c) 2026 Nokia * * This source code is licensed under BSD 3-Clause License (the "License"). * You may not use this file except in compliance with the License. 
@@ -817,6 +818,52 @@ test_data_xml(void **state) CHECK_LOG_CTX("Unsatisfied pattern - \"abc\" does not match \"a.*b\".", "/T_ANCHOR:port", 1); TEST_ERROR_XML("T_ANCHOR", "cab"); CHECK_LOG_CTX("Unsatisfied pattern - \"cab\" does not match \"a.*b\".", "/T_ANCHOR:port", 1); + + /* Unicode block test 1 - Basic Latin */ + schema = MODULE_CREATE_YANG("T_UB_1", "leaf port {type string { pattern '\\p{IsBasicLatin}+';} } "); + UTEST_ADD_MODULE(schema, LYS_IN_YANG, NULL, NULL); + TEST_SUCCESS_XML("T_UB_1", "B4s1cLatin!", STRING, "B4s1cLatin!"); + + /* Unicode block test 2 - Basic Latin within brackets */ + schema = MODULE_CREATE_YANG("T_UB_2", "leaf port {type string { pattern '[\\p{IsBasicLatin}]+';} } "); + UTEST_ADD_MODULE(schema, LYS_IN_YANG, NULL, NULL); + TEST_SUCCESS_XML("T_UB_2", "B4s1cLatin!", STRING, "B4s1cLatin!"); + + /* Unicode block test 3 - Latin-1 Supplement */ + schema = MODULE_CREATE_YANG("T_UB_3", "leaf port {type string { pattern '[\\p{IsLatin-1Supplement}]+';} } "); + UTEST_ADD_MODULE(schema, LYS_IN_YANG, NULL, NULL); + TEST_SUCCESS_XML("T_UB_3", "ÁÉÍÓÖÜ", STRING, "ÁÉÍÓÖÜ"); + + /* Unicode block test 4 - Latin-1 Supplement */ + schema = MODULE_CREATE_YANG("T_UB_4", "leaf port {type string { pattern '[\\p{IsLatin-1Supplement}]+';} } "); + UTEST_ADD_MODULE(schema, LYS_IN_YANG, NULL, NULL); + TEST_SUCCESS_XML("T_UB_4", "ÁÉÍÓÖÜ", STRING, "ÁÉÍÓÖÜ"); + + /* Unicode block test 5 - Latin Extended-A */ + schema = MODULE_CREATE_YANG("T_UB_5", "leaf port {type string { pattern '[\\p{IsLatinExtended-A}]+';} } "); + UTEST_ADD_MODULE(schema, LYS_IN_YANG, NULL, NULL); + TEST_SUCCESS_XML("T_UB_5", "ŐŰőű", STRING, "ŐŰőű"); + + /* Unicode block test 6 - Basic Latin, Latin-1 Supplement, and Latin Extended-A */ + schema = MODULE_CREATE_YANG("T_UB_6", "leaf port {type string {" + " pattern '[\\p{IsBasicLatin}\\p{IsLatin-1Supplement}\\p{IsLatinExtended-A}]+';" + "}} "); + UTEST_ADD_MODULE(schema, LYS_IN_YANG, NULL, NULL); + TEST_SUCCESS_XML("T_UB_6", "Árvíztűrő 
tükörfúrógép!", STRING, "Árvíztűrő tükörfúrógép!"); + + /* Unicode block test 7 - Unknown Unicode block */ + schema = MODULE_CREATE_YANG("T_UB_7", "leaf port {type string { pattern '\\p{IsUnknownUnicodeBlock}+';} } "); + UTEST_INVALID_MODULE(schema, LYS_IN_YANG, NULL, LY_EVALID); + CHECK_LOG_CTX("Regular expression \"\\p{IsUnknownUnicodeBlock}+\" " + "is not valid (\"UnknownUnicodeBlock}+\": unknown block name).", "/T_UB_7:port", 0); + + /* Unicode block test 8 - Unknown Unicode block with Basic Latin */ + schema = MODULE_CREATE_YANG("T_UB_8", "leaf port {type string { " + " pattern '[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+';" + "}} "); + UTEST_INVALID_MODULE(schema, LYS_IN_YANG, NULL, LY_EVALID); + CHECK_LOG_CTX("Regular expression \"[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+\" " + "is not valid (\"UnknownUnicodeBlock}]+\": unknown block name).", "/T_UB_8:port", 0); } static void From 94b7201e3e0511b27de399126994cc2598e4cbd6 Mon Sep 17 00:00:00 2001 From: Balint Uveges Date: Tue, 7 Apr 2026 10:33:56 +0200 Subject: [PATCH 2/5] Update copyright --- src/ly_common.c | 1 - tests/utests/types/string.c | 1 - 2 files changed, 2 deletions(-) diff --git a/src/ly_common.c b/src/ly_common.c index 10b815d0c..a9bfbccc9 100644 --- a/src/ly_common.c +++ b/src/ly_common.c @@ -4,7 +4,6 @@ * @brief common internal definitions for libyang * * Copyright (c) 2018 - 2026 CESNET, z.s.p.o. - * Copyright (c) 2026 Nokia * * This source code is licensed under BSD 3-Clause License (the "License"). * You may not use this file except in compliance with the License. diff --git a/tests/utests/types/string.c b/tests/utests/types/string.c index e3608709a..3537486a0 100644 --- a/tests/utests/types/string.c +++ b/tests/utests/types/string.c @@ -4,7 +4,6 @@ * @brief test for string values * * Copyright (c) 2021 CESNET, z.s.p.o. - * Copyright (c) 2026 Nokia * * This source code is licensed under BSD 3-Clause License (the "License"). 
* You may not use this file except in compliance with the License. From c8b59709b68f5fc22d73be0184802ce5816f944c Mon Sep 17 00:00:00 2001 From: Balint Uveges Date: Tue, 7 Apr 2026 13:49:30 +0200 Subject: [PATCH 3/5] Change indentation of multi-line strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/utests/types/string.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/utests/types/string.c b/tests/utests/types/string.c index 3537486a0..527cbb00c 100644 --- a/tests/utests/types/string.c +++ b/tests/utests/types/string.c @@ -845,8 +845,8 @@ test_data_xml(void **state) /* Unicode block test 6 - Basic Latin, Latin-1 Supplement, and Latin Extended-A */ schema = MODULE_CREATE_YANG("T_UB_6", "leaf port {type string {" - " pattern '[\\p{IsBasicLatin}\\p{IsLatin-1Supplement}\\p{IsLatinExtended-A}]+';" - "}} "); + " pattern '[\\p{IsBasicLatin}\\p{IsLatin-1Supplement}\\p{IsLatinExtended-A}]+';" + "}} "); UTEST_ADD_MODULE(schema, LYS_IN_YANG, NULL, NULL); TEST_SUCCESS_XML("T_UB_6", "Árvíztűrő tükörfúrógép!", STRING, "Árvíztűrő tükörfúrógép!"); @@ -854,15 +854,15 @@ test_data_xml(void **state) schema = MODULE_CREATE_YANG("T_UB_7", "leaf port {type string { pattern '\\p{IsUnknownUnicodeBlock}+';} } "); UTEST_INVALID_MODULE(schema, LYS_IN_YANG, NULL, LY_EVALID); CHECK_LOG_CTX("Regular expression \"\\p{IsUnknownUnicodeBlock}+\" " - "is not valid (\"UnknownUnicodeBlock}+\": unknown block name).", "/T_UB_7:port", 0); + "is not valid (\"UnknownUnicodeBlock}+\": unknown block name).", "/T_UB_7:port", 0); /* Unicode block test 8 - Unknown Unicode block with Basic Latin */ schema = MODULE_CREATE_YANG("T_UB_8", "leaf port {type string { " - " pattern '[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+';" - "}} "); +i " pattern '[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+';" + "}} "); UTEST_INVALID_MODULE(schema, LYS_IN_YANG, NULL, LY_EVALID); 
CHECK_LOG_CTX("Regular expression \"[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+\" " - "is not valid (\"UnknownUnicodeBlock}]+\": unknown block name).", "/T_UB_8:port", 0); + "is not valid (\"UnknownUnicodeBlock}]+\": unknown block name).", "/T_UB_8:port", 0); } static void From 0ec2d662866b9daa829f7a7ebf42c048abc2465c Mon Sep 17 00:00:00 2001 From: Balint Uveges Date: Tue, 7 Apr 2026 13:51:28 +0200 Subject: [PATCH 4/5] Correct typo --- tests/utests/types/string.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utests/types/string.c b/tests/utests/types/string.c index 527cbb00c..476cd6bbc 100644 --- a/tests/utests/types/string.c +++ b/tests/utests/types/string.c @@ -858,7 +858,7 @@ test_data_xml(void **state) /* Unicode block test 8 - Unknown Unicode block with Basic Latin */ schema = MODULE_CREATE_YANG("T_UB_8", "leaf port {type string { " -i " pattern '[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+';" + " pattern '[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+';" "}} "); UTEST_INVALID_MODULE(schema, LYS_IN_YANG, NULL, LY_EVALID); CHECK_LOG_CTX("Regular expression \"[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+\" " From 7eab13874daf3caabcd6101c25731d366866c6c2 Mon Sep 17 00:00:00 2001 From: Balint Uveges Date: Tue, 7 Apr 2026 14:46:05 +0200 Subject: [PATCH 5/5] Fix indentation based on uncrustify results --- tests/utests/types/string.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/utests/types/string.c b/tests/utests/types/string.c index 476cd6bbc..e0ef60411 100644 --- a/tests/utests/types/string.c +++ b/tests/utests/types/string.c @@ -845,8 +845,8 @@ test_data_xml(void **state) /* Unicode block test 6 - Basic Latin, Latin-1 Supplement, and Latin Extended-A */ schema = MODULE_CREATE_YANG("T_UB_6", "leaf port {type string {" - " pattern '[\\p{IsBasicLatin}\\p{IsLatin-1Supplement}\\p{IsLatinExtended-A}]+';" - "}} "); + " pattern '[\\p{IsBasicLatin}\\p{IsLatin-1Supplement}\\p{IsLatinExtended-A}]+';" + 
"}} "); UTEST_ADD_MODULE(schema, LYS_IN_YANG, NULL, NULL); TEST_SUCCESS_XML("T_UB_6", "Árvíztűrő tükörfúrógép!", STRING, "Árvíztűrő tükörfúrógép!"); @@ -854,15 +854,15 @@ test_data_xml(void **state) schema = MODULE_CREATE_YANG("T_UB_7", "leaf port {type string { pattern '\\p{IsUnknownUnicodeBlock}+';} } "); UTEST_INVALID_MODULE(schema, LYS_IN_YANG, NULL, LY_EVALID); CHECK_LOG_CTX("Regular expression \"\\p{IsUnknownUnicodeBlock}+\" " - "is not valid (\"UnknownUnicodeBlock}+\": unknown block name).", "/T_UB_7:port", 0); + "is not valid (\"UnknownUnicodeBlock}+\": unknown block name).", "/T_UB_7:port", 0); /* Unicode block test 8 - Unknown Unicode block with Basic Latin */ schema = MODULE_CREATE_YANG("T_UB_8", "leaf port {type string { " - " pattern '[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+';" - "}} "); + " pattern '[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+';" + "}} "); UTEST_INVALID_MODULE(schema, LYS_IN_YANG, NULL, LY_EVALID); CHECK_LOG_CTX("Regular expression \"[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+\" " - "is not valid (\"UnknownUnicodeBlock}]+\": unknown block name).", "/T_UB_8:port", 0); + "is not valid (\"UnknownUnicodeBlock}]+\": unknown block name).", "/T_UB_8:port", 0); } static void