From 8c93ce13f656fcf48bf4fc204d2c4570e9925b65 Mon Sep 17 00:00:00 2001 From: Reini Urban Date: Sat, 2 Jan 2021 17:53:30 +0100 Subject: [PATCH] WIP add u8ident Extension for secure utf-8 identifiers, for everybody seeing identifiable names in UTF-8 encodings, like a filename in a terminal or UI widget. Identifiers need to be identifiable, i.e. implement mixed script detection, and such for a Unicode TR39 Moderately Restrictive restriction level. Also identifiers are validated and normalized by default, to be able to compare and find them. --- GNUmakefile | 8 ++++++++ README.md | 5 ++++- ctl/string.h | 1 + ctl/u8ident.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ ctl/u8string.h | 1 + docs/index.md | 5 ++++- makefile | 5 ++++- 7 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 ctl/u8ident.h diff --git a/GNUmakefile b/GNUmakefile index f2faaf41..84a76b36 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -292,6 +292,8 @@ ctl/string.i: $(call expand,$(subst .i,,$@)) ctl/u8string.i: $(call expand,$(subst .i,,$@)) +ctl/u8ident.i: + $(call expand,$(subst .i,,$@)) ctl/map.i: $(call expand,$(subst .i,,$@),-DT=strint -DPOD) ctl/unordered_map.i: @@ -353,6 +355,12 @@ tests/func/test_stack: .cflags $(COMMON_H) tests/test.h tests/func/digi.hh ct tests/func/test_string: .cflags $(COMMON_H) tests/test.h ctl/string.h ctl/vector.h \ tests/func/test_string.cc $(CXX) $(CXXFLAGS) -o $@ $@.cc +tests/func/test_u8string: .cflags $(COMMON_H) tests/test.h ctl/u8string.h ctl/vector.h \ + tests/func/test_u8string.cc + $(CXX) $(CXXFLAGS) -o $@ $@.cc +tests/func/test_u8ident: .cflags $(COMMON_H) tests/test.h ctl/u8ident.h ctl/u8string.h \ + ctl/vector.h tests/func/test_u8ident.cc + $(CXX) $(CXXFLAGS) -o $@ $@.cc tests/func/test_str_capacity: .cflags $(COMMON_H) tests/test.h ctl/string.h ctl/vector.h \ tests/func/test_str_capacity.cc $(CXX) $(CXXFLAGS) -o $@ $@.cc diff --git a/README.md b/README.md index 9c906859..57fd91f7 100644 --- a/README.md +++ b/README.md @@ -56,7 
+56,8 @@ all containers in ISO C99/C11: | [ctl/set.h](docs/set.md) | std::set | set | | [ctl/stack.h](docs/stack.md) | std::stack | stack | | [ctl/string.h](docs/string.md) | std::string | str | -| [ctl/u8string.h](docs/u8string.md) | std::string | str | +| [ctl/u8string.h](docs/u8string.md) | std::string ext | u8str | +| [ctl/u8ident.h](docs/u8ident.md) | - | u8id | | [ctl/vector.h](docs/vector.md) | std::vector | vec | | [ctl/array.h](docs/array.md) | std::array | arrNNNN | | [ctl/map.h](docs/map.md) | std::map | map | @@ -286,6 +287,7 @@ make ctl/set.i make ctl/stack.i make ctl/string.i make ctl/u8string.i +make ctl/u8ident.i make ctl/vector.i make ctl/array.i make ctl/map.i @@ -365,6 +367,7 @@ And in its grandiosity (esp. not header-only): vector.h: realloc string.h: vector.h u8string.h: vector.h ++ + u8ident.h: u8string.h deque.h: realloc (paged) queue.h: deque.h stack.h: deque.h diff --git a/ctl/string.h b/ctl/string.h index 89144462..151a1b9e 100644 --- a/ctl/string.h +++ b/ctl/string.h @@ -280,6 +280,7 @@ static inline int JOIN(A, compare)(A *self, A *other) #undef A #undef I #undef T +#undef POD #else #undef HOLD #endif diff --git a/ctl/u8ident.h b/ctl/u8ident.h new file mode 100644 index 00000000..a9298120 --- /dev/null +++ b/ctl/u8ident.h @@ -0,0 +1,46 @@ +/* POSIX std extension for people who use UTF-8 identifiers but + need security. See http://unicode.org/reports/tr39/ + E.g. a kernel filesystem or user database, in a UTF-8 terminal, + wishes to present identifiers such as names, paths or files identifiably, + i.e. normalized and with identifiable characters only. Most don't display + names as punycode. + Implements the Moderately Restrictive restriction level by default. 
+ + * All characters in the string are in the ASCII range, or + * The string is single-script, according to the definition in Section 5.1, or + * The string is covered by any of the following sets of scripts, according to + the definition in TR39 Section 5.1: + Latin + Han + Hiragana + Katakana; or equivalently: Latn + Jpan + Latin + Han + Bopomofo; or equivalently: Latn + Hanb + Latin + Han + Hangul; or equivalently: Latn + Kore, or + * The string is covered by Latin and any one other Recommended script, except Cyrillic and Greek. + * The string must be validated UTF-8 and normalized, and only consist of valid identifier + characters. + + Reject violations, optionally warn about confusables. + SPDX-License-Identifier: MIT */ + +#ifndef __CTL_U8IDENT_H__ +#define __CTL_U8IDENT_H__ + +#ifdef T +#error "Template type T defined for <ctl/u8ident.h>" +#endif + +#define HOLD +#define u8id_char8_t u8id +#define vec u8id +#define A u8id +#include <ctl/u8string.h> + +// TODO Take my code from cperl, which has had stable Unicode security for some years. +// I'm also just adding this to my safeclib. +// The only other existing example of proper Unicode security is Java. 
+ +#undef A +#undef I +#undef T +#undef POD +#undef HOLD + +#endif diff --git a/ctl/u8string.h b/ctl/u8string.h index c62b02d2..988d611c 100644 --- a/ctl/u8string.h +++ b/ctl/u8string.h @@ -336,6 +336,7 @@ JOIN(A, key_compare)(A* self, A* s) #ifdef HOLD /* for u8ident.h */ # undef HOLD #else +# undef POD # undef T # undef A # undef I diff --git a/docs/index.md b/docs/index.md index 918a828e..499b7bb5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -56,7 +56,8 @@ all containers in ISO C99/C11: | [ctl/set.h](set.md) | std::set | set | | [ctl/stack.h](stack.md) | std::stack | stack | | [ctl/string.h](string.md) | std::string | str | -| [ctl/u8string.h](u8string.md) | std::string | str | +| [ctl/u8string.h](u8string.md) | std::string ext | u8str | +| [ctl/u8ident.h](u8ident.md) | - | u8id | | [ctl/vector.h](vector.md) | std::vector | vec | | [ctl/array.h](array.md) | std::array | arrNNNN | | [ctl/map.h](map.md) | std::map | map | @@ -286,6 +287,7 @@ make ctl/set.i make ctl/stack.i make ctl/string.i make ctl/u8string.i +make ctl/u8ident.i make ctl/vector.i make ctl/array.i make ctl/map.i @@ -365,6 +367,7 @@ And in its grandiosity (esp. 
not header-only): vector.h: realloc string.h: vector.h u8string.h: vector.h ++ + u8ident.h: u8string.h deque.h: realloc (paged) queue.h: deque.h stack.h: deque.h diff --git a/makefile b/makefile index f1d595d6..702b3023 100644 --- a/makefile +++ b/makefile @@ -322,9 +322,12 @@ tests/func/test_string: .cflags ${COMMON_H} tests/test.h ctl/string.h ctl/vect tests/func/test_str_capacity: .cflags ${COMMON_H} tests/test.h ctl/string.h ctl/vector.h \ tests/func/test_str_capacity.cc ${CXX} ${CXXFLAGS} -o $@ $@.cc -tests/func/test_u8string: .cflags ${COMMON_H} ctl/u8string.h ctl/string.h \ +tests/func/test_u8string: .cflags ${COMMON_H} tests/test.h ctl/u8string.h ctl/vector.h \ tests/func/test_u8string.cc ${CXX} ${CXXFLAGS} -o $@ $@.cc +tests/func/test_u8ident: .cflags ${COMMON_H} tests/test.h ctl/u8ident.h ctl/u8string.h \ + tests/func/test_u8ident.cc + ${CXX} ${CXXFLAGS} -o $@ $@.cc tests/func/test_vec_capacity: .cflags ${COMMON_H} tests/test.h ctl/vector.h \ tests/func/test_vec_capacity.cc ${CXX} ${CXXFLAGS} -o $@ $@.cc