
On 01.09.18 11:43, Alexander Graf wrote:
On 31.08.18 21:31, Heinrich Schuchardt wrote:
This patch provides a define to initialize a table that maps lowercase to capital letters for Unicode code points 0x0000 - 0xffff.
Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
v2: add shorter tables for code pages 437 and 1250
MAINTAINERS | 1 + include/capitalization.h | 2028 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 2029 insertions(+) create mode 100644 include/capitalization.h
diff --git a/MAINTAINERS b/MAINTAINERS index 46f826a0fe..8c9cd83347 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -381,6 +381,7 @@ T: git git://github.com/agraf/u-boot.git F: doc/README.uefi F: doc/README.iscsi F: Documentation/efi.rst +F: include/capitalization.h F: include/efi* F: include/pe.h F: include/asm-generic/pe.h diff --git a/include/capitalization.h b/include/capitalization.h new file mode 100644 index 0000000000..2c24e1bf47 --- /dev/null +++ b/include/capitalization.h @@ -0,0 +1,2028 @@ +/* SPDX-License-Identifier: Unicode-DFS-2016 */ +/*
- Capitalization tables
- */
+struct capitalization_table {
- u16 upper;
- u16 lower;
+};
+/*
- Correspondence table for small and capital Unicode letters in the range of
- 0x0000 - 0xffff based on http://www.unicode.org/Public/UCA/11.0.0/allkeys.txt
- */
+#define UNICODE_CAPITALIZATION_TABLE { \
- { 0x0531, /* ARMENIAN CAPITAL LETTER AYB */ \
0x0561, /* ARMENIAN SMALL LETTER AYB */ }, \
[...]
- { 0x2C7F, /* LATIN CAPITAL LETTER Z WITH SWASH TAIL */ \
0x0240, /* LATIN SMALL LETTER Z WITH SWASH TAIL */ }, \
- { 0x0000, /* END OF LIST CAPITAL LETTERS */ \
0x0000, /* END OF LIST SMALL LETTERS */ }, \
+}
+/*
- Correspondence table for small and capital letters of codepage 437.
- */
+#define CP437_CAPITALIZATION_TABLE { \
- { 0x03a6, 0x03c6, }, \
I like how you added comments to each entry on what exactly the character is. Please keep that habit in the trimmed down tables too.
- { 0x03a3, 0x03c3, }, \
- { 0x0041, 0x0061, }, \
- { 0x00c4, 0x00e4, }, \
- { 0x00c5, 0x00e5, }, \
- { 0x00c6, 0x00e6, }, \
- { 0x0042, 0x0062, }, \
- { 0x0043, 0x0063, }, \
- { 0x00c7, 0x00e7, }, \
- { 0x0044, 0x0064, }, \
- { 0x0045, 0x0065, }, \
- { 0x00c9, 0x00e9, }, \
- { 0x0046, 0x0066, }, \
- { 0x0047, 0x0067, }, \
- { 0x0048, 0x0068, }, \
- { 0x0049, 0x0069, }, \
- { 0x004a, 0x006a, }, \
- { 0x004b, 0x006b, }, \
- { 0x004c, 0x006c, }, \
- { 0x004d, 0x006d, }, \
- { 0x004e, 0x006e, }, \
- { 0x00d1, 0x00f1, }, \
- { 0x004f, 0x006f, }, \
- { 0x00d6, 0x00f6, }, \
- { 0x0050, 0x0070, }, \
- { 0x0051, 0x0071, }, \
Most of these are just Latin A to Z. These are already covered by the conversion done in code, no? So you can just omit them.
- { 0x0052, 0x0072, }, \
- { 0x0053, 0x0073, }, \
- { 0x0054, 0x0074, }, \
- { 0x0055, 0x0075, }, \
- { 0x00dc, 0x00fc, }, \
- { 0x0056, 0x0076, }, \
- { 0x0057, 0x0077, }, \
- { 0x0058, 0x0078, }, \
- { 0x0059, 0x0079, }, \
- { 0x005a, 0x007a, }, \
- { 0x0000, 0x0000, }, \
... that would leave 11 entries for cp437 ...
+}
+/*
- Correspondence table for small and capital letters of codepage 1250.
- */
+#define CP1250_CAPITALIZATION_TABLE { \
- { 0x0041, 0x0061, }, \
Please sort the list by code point - or any other recognizable sorting order ;).
- { 0x00c1, 0x00e1, }, \
- { 0x0102, 0x0103, }, \
- { 0x00c2, 0x00e2, }, \
- { 0x00c4, 0x00e4, }, \
- { 0x0104, 0x0105, }, \
- { 0x0042, 0x0062, }, \
- { 0x0043, 0x0063, }, \
- { 0x0106, 0x0107, }, \
- { 0x010c, 0x010d, }, \
- { 0x00c7, 0x00e7, }, \
- { 0x0044, 0x0064, }, \
- { 0x010e, 0x010f, }, \
- { 0x0110, 0x0111, }, \
- { 0x0045, 0x0065, }, \
- { 0x00c9, 0x00e9, }, \
- { 0x011a, 0x011b, }, \
- { 0x00cb, 0x00eb, }, \
- { 0x0118, 0x0119, }, \
- { 0x0046, 0x0066, }, \
- { 0x0047, 0x0067, }, \
- { 0x0048, 0x0068, }, \
- { 0x0049, 0x0069, }, \
- { 0x00cd, 0x00ed, }, \
- { 0x00ce, 0x00ee, }, \
- { 0x004a, 0x006a, }, \
- { 0x004b, 0x006b, }, \
- { 0x004c, 0x006c, }, \
- { 0x0139, 0x013a, }, \
- { 0x013d, 0x013e, }, \
- { 0x0141, 0x0142, }, \
- { 0x004d, 0x006d, }, \
- { 0x004e, 0x006e, }, \
- { 0x0143, 0x0144, }, \
- { 0x0147, 0x0148, }, \
- { 0x004f, 0x006f, }, \
- { 0x00d3, 0x00f3, }, \
- { 0x00d4, 0x00f4, }, \
- { 0x00d6, 0x00f6, }, \
- { 0x0150, 0x0151, }, \
- { 0x0050, 0x0070, }, \
- { 0x0051, 0x0071, }, \
- { 0x0052, 0x0072, }, \
- { 0x0154, 0x0155, }, \
- { 0x0158, 0x0159, }, \
- { 0x0053, 0x0073, }, \
- { 0x015a, 0x015b, }, \
- { 0x0160, 0x0161, }, \
- { 0x015e, 0x015f, }, \
- { 0x0054, 0x0074, }, \
- { 0x0164, 0x0165, }, \
- { 0x0162, 0x0163, }, \
- { 0x0055, 0x0075, }, \
- { 0x00da, 0x00fa, }, \
- { 0x00dc, 0x00fc, }, \
- { 0x0170, 0x0171, }, \
- { 0x016e, 0x016f, }, \
- { 0x0056, 0x0076, }, \
- { 0x0057, 0x0077, }, \
- { 0x0058, 0x0078, }, \
- { 0x0059, 0x0079, }, \
- { 0x00dd, 0x00fd, }, \
- { 0x005a, 0x007a, }, \
- { 0x0179, 0x017a, }, \
- { 0x017d, 0x017e, }, \
- { 0x017b, 0x017c, }, \
... and 40 unique points for cp1250.
How about we just combine the two tables into one and call it "western"?
Actually, thinking about it again, keeping the tables separate is probably a good idea.
Alex